Fixed a few issues seen in CTS tests

Added support for encoding dimensions that are not a multiple of 16
Added support for encoding dimensions smaller than 64x64
Aligned coeff data to a 4-byte boundary

Change-Id: I111093950f94698296d8499a2845cfe2db6c557b
diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s
index 2bf1479..2c04141 100755
--- a/encoder/arm/ih264e_fmt_conv.s
+++ b/encoder/arm/ih264e_fmt_conv.s
@@ -83,7 +83,6 @@
     sub           r7, r7, r5            @// Source increment
     sub           r8, r8, r5            @// Destination increment
 
-    vpush         {d8-d15}
 yuv420sp_uv_row_loop_y:
     mov           r6, r5
 
@@ -134,7 +133,7 @@
     mov           r5, r5, lsr #1
     mov           r4, r4, lsr #1
     ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
-    vpush         {d8-d15}
+
 yuv420sp_uv_row_loop_uv:
     mov           r6, r5
 
@@ -171,7 +170,6 @@
     subs          r4, r4, #1
     bgt           yuv420sp_uv_row_loop_uv
     @//POP THE REGISTERS
-    vpop          {d8-d15}
     ldmfd         sp!, {r4-r12, pc}
 
 
@@ -276,8 +274,6 @@
     add           r4, r12, r4           @// u2_offset1 = u2_offset1 + u4_stride_y
     add           r5, r14, r5, lsl #1   @// u2_offset_yuv422i = u2_offset_yuv422i + u4_stride_yuv422i
 
-    vpush         {d8-d15}
-
 @// Register Assignment
 @// pu1_y               - r0
 @// pu1_y_nxt_row       - r6
@@ -322,7 +318,6 @@
 
     add           r8, r8, r5            @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row   + u2_offset_yuv422i
     bgt           yuv420_to_yuv422i_hight_loop
-    vpop          {d8-d15}
     ldmfd         sp!, {r4-r12, pc}     @// Restore the register which are used
 
 
diff --git a/encoder/ih264e_cavlc.c b/encoder/ih264e_cavlc.c
index 1341dcd..1f98b6a 100755
--- a/encoder/ih264e_cavlc.c
+++ b/encoder/ih264e_cavlc.c
@@ -59,6 +59,7 @@
 #include "iv2.h"
 #include "ive2.h"
 #include "ih264_debug.h"
+#include "ih264_macros.h"
 #include "ih264_defs.h"
 #include "ih264e_defs.h"
 #include "ih264e_error.h"
diff --git a/encoder/ih264e_cavlc.h b/encoder/ih264e_cavlc.h
index 86f4cd4..acd0def 100755
--- a/encoder/ih264e_cavlc.h
+++ b/encoder/ih264e_cavlc.h
@@ -43,20 +43,20 @@
 /*****************************************************************************/
 
 #define PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, u4_nnz, u4_sig_coeff_map, pi2_res_block)   \
+{\
+    ps_mb_coeff_data = pv_mb_coeff_data; \
+    u4_nnz = ps_mb_coeff_data->i4_sig_map_nnz & 0xff;    \
+    if (u4_nnz)\
     {\
-                ps_mb_coeff_data = pv_mb_coeff_data; \
-                u4_nnz = ps_mb_coeff_data->i4_sig_map_nnz & 0xff;    \
-                if (u4_nnz)\
-                {\
-                    u4_sig_coeff_map = ps_mb_coeff_data->i4_sig_map_nnz >> 16; \
-                    pi2_res_block = ps_mb_coeff_data->ai2_residue; \
-                    pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue + u4_nnz; \
-                }\
-                else\
-                {\
-                  pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue;\
-                }\
-    }
+        u4_sig_coeff_map = ps_mb_coeff_data->i4_sig_map_nnz >> 16; \
+        pi2_res_block = ps_mb_coeff_data->ai2_residue; \
+        pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz); \
+    }\
+    else\
+    {\
+      pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue;\
+    }\
+}
 
 
 /*****************************************************************************/
diff --git a/encoder/ih264e_core_coding.c b/encoder/ih264e_core_coding.c
index 5ba18de..89243a5 100755
--- a/encoder/ih264e_core_coding.c
+++ b/encoder/ih264e_core_coding.c
@@ -58,6 +58,7 @@
 #include "ih264_platform_macros.h"
 #include "iv2.h"
 #include "ive2.h"
+#include "ih264_macros.h"
 #include "ih264_defs.h"
 #include "ih264e_defs.h"
 #include "ih264_trans_data.h"
@@ -843,7 +844,7 @@
         }
         /* write significant coeff map */
         ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
-        (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+        (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
 
         u4_cntrl = 0x00008000;// Set DC bit in ctrl code
     }
@@ -896,7 +897,7 @@
             }
             /* write significant coeff map */
             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
-            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
             *u1_cbp_l = 15;
 
             u4_cntrl |= (1 << (31 - u1_scan_order[b4]));
@@ -1059,7 +1060,7 @@
 
             /* write significant coeff map */
             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
-            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
 
             /* cbp */
             *u1_cbp_l |= (1 << b8);
@@ -1283,7 +1284,7 @@
             }
             /* write significant coeff map U/V */
             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
-            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
             *u1_cbp_c = 1;
 
             (*pu4_cntrl) |= (1 << cntrl_pos);
@@ -1388,7 +1389,7 @@
 
                 /* write significant coeff map U/V */
                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
-                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
                 u1_cbp_ac = 2;
 
                 (*pu4_cntrl) |= 1 << cntrl_pos;
@@ -1804,7 +1805,7 @@
                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
 
                 /* update ptr to coeff data */
-                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
 
                 /* cbp */
                 u1_cbp_l |= (1 << b8);
@@ -1950,7 +1951,7 @@
                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
 
                 /* update ptr to coeff data */
-                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
 
                 /* cbp */
                 u1_cbp_l |= (1 << b8);
diff --git a/encoder/ih264e_process.c b/encoder/ih264e_process.c
index 9a468e9..fa67d84 100755
--- a/encoder/ih264e_process.c
+++ b/encoder/ih264e_process.c
@@ -778,13 +778,6 @@
     /* sub mb modes */
     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (i4_mb_x << 4);
 
-//    /* zero mv */
-//    mv_t zero_mv = {0, 0};
-
-    /* Pad the MB to support non standard sizes */
-    UWORD32 u4_pad_right_sz = ps_codec->s_cfg.u4_wd - ps_codec->s_cfg.u4_disp_wd;
-    UWORD32 u4_pad_bottom_sz = ps_codec->s_cfg.u4_ht - ps_codec->s_cfg.u4_disp_ht;
-
     /*************************************************************/
     /* During MV prediction, when top right mb is not available, */
     /* top left mb info. is used for prediction. Hence the curr  */
@@ -946,28 +939,6 @@
     ps_proc->pu1_rec_buf_chroma += MB_SIZE;
     ps_proc->pu1_ref_buf_chroma += MB_SIZE;
 
-    /* pad right edge */
-    if (u4_pad_right_sz && (ps_proc->i4_mb_x == i4_wd_mbs - 1))
-    {
-        ih264_pad_right_luma(
-                        ps_proc->pu1_src_buf_luma + MB_SIZE - u4_pad_right_sz,
-                        ps_proc->i4_src_strd, MB_SIZE, u4_pad_right_sz);
-
-        ih264_pad_right_chroma(
-                        ps_proc->pu1_src_buf_chroma + MB_SIZE - u4_pad_right_sz,
-                        ps_proc->i4_src_strd, BLK8x8SIZE, u4_pad_right_sz);
-    }
-
-    /* pad bottom edge */
-    if (u4_pad_bottom_sz && (ps_proc->i4_mb_y == i4_ht_mbs - 1) &&
-                    ps_proc->i4_mb_x != 0)
-    {
-        ih264_pad_bottom(ps_proc->pu1_src_buf_luma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd,
-                         ps_proc->i4_src_strd, MB_SIZE, u4_pad_bottom_sz);
-
-        ih264_pad_bottom(ps_proc->pu1_src_buf_chroma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd / 2,
-                         ps_proc->i4_src_strd, MB_SIZE, (u4_pad_bottom_sz / 2));
-    }
 
     /* Reset cost, distortion params */
     ps_proc->i4_mb_cost = INT_MAX;
@@ -1051,7 +1022,10 @@
     UWORD8 *pu1_y_buf_base, *pu1_u_buf_base, *pu1_v_buf_base;
 
     /* Pad the MB to support non standard sizes */
+    UWORD32 u4_pad_right_sz = ps_codec->s_cfg.u4_wd - ps_codec->s_cfg.u4_disp_wd;
     UWORD32 u4_pad_bottom_sz = ps_codec->s_cfg.u4_ht - ps_codec->s_cfg.u4_disp_ht;
+    UWORD16 u2_num_rows = MB_SIZE;
+    WORD32 convert_uv_only;
 
     /********************************************************************/
     /*                            BEGIN INIT                            */
@@ -1064,14 +1038,27 @@
     ps_proc->i4_nmb_ntrpy = (ps_proc->i4_wd_mbs > MAX_NMB) ? MAX_NMB : ps_proc->i4_wd_mbs;
     ps_proc->u4_nmb_me = (ps_proc->i4_wd_mbs > MAX_NMB)? MAX_NMB : ps_proc->i4_wd_mbs;
 
+    convert_uv_only = 1;
+    if (u4_pad_bottom_sz && (ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1))
+    {
+        u2_num_rows = (UWORD16) MB_SIZE - u4_pad_bottom_sz;
+        ps_proc->pu1_src_buf_luma_base = ps_codec->pu1_y_csc_buf_base;
+        ps_proc->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma_base + (i4_mb_x * MB_SIZE) + ps_codec->s_cfg.u4_max_wd * (i4_mb_y * MB_SIZE);
+        convert_uv_only = 0;
+
+    }
+    else
+        ps_proc->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * MB_SIZE);
+
     /* init buffer pointers */
-    ps_proc->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * MB_SIZE);
-    ps_proc->pu1_src_buf_chroma = ps_proc->pu1_src_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * BLK8x8SIZE);
+
+    ps_proc->pu1_src_buf_chroma = ps_proc->pu1_src_buf_chroma_base + (i4_mb_x * MB_SIZE) + ps_codec->s_cfg.u4_max_wd * (i4_mb_y * BLK8x8SIZE);
     ps_proc->pu1_rec_buf_luma = ps_proc->pu1_rec_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE);
     ps_proc->pu1_rec_buf_chroma = ps_proc->pu1_rec_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE);
     ps_proc->pu1_ref_buf_luma = ps_proc->pu1_ref_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE);
     ps_proc->pu1_ref_buf_chroma = ps_proc->pu1_ref_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE);
 
+
     /*
      * Do color space conversion
      * NOTE : We assume there that the number of MB's to process will not span multiple rows
@@ -1095,12 +1082,13 @@
             ps_codec->pf_ih264e_conv_420p_to_420sp(
                             pu1_y_buf_base, pu1_u_buf_base, pu1_v_buf_base,
                             ps_proc->pu1_src_buf_luma,
-                            ps_proc->pu1_src_buf_chroma, MB_SIZE,
-                            ps_proc->i4_wd_mbs * MB_SIZE,
+                            ps_proc->pu1_src_buf_chroma, u2_num_rows,
+                            ps_codec->s_cfg.u4_disp_wd,
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[0],
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[1],
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[2],
-                            ps_proc->i4_src_strd, ps_proc->i4_src_strd, 1);
+                            ps_proc->i4_src_strd, ps_proc->i4_src_strd,
+                            convert_uv_only);
             break;
 
         case IV_YUV_422ILE :
@@ -1111,7 +1099,7 @@
                             ps_proc->pu1_src_buf_luma,
                             ps_proc->pu1_src_buf_chroma,
                             ps_proc->pu1_src_buf_chroma + 1, pu1_y_buf_base,
-                            ps_proc->i4_wd_mbs * MB_SIZE, MB_SIZE,
+                            ps_codec->s_cfg.u4_disp_wd, u2_num_rows,
                             ps_proc->i4_src_strd, ps_proc->i4_src_strd,
                             ps_proc->i4_src_strd,
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[0] >> 1);
@@ -1121,16 +1109,36 @@
             break;
     }
 
+    if (u4_pad_right_sz && (ps_proc->i4_mb_x == 0) &&
+                    (ps_proc->i4_src_strd > (WORD32)ps_codec->s_cfg.u4_disp_wd) )
+    {
+        UWORD32 u4_pad_wd, u4_pad_ht;
+        u4_pad_wd = (UWORD32)(ps_proc->i4_src_strd - ps_codec->s_cfg.u4_disp_wd);
+        u4_pad_wd = MIN(u4_pad_right_sz, u4_pad_wd);
+        u4_pad_ht = MB_SIZE;
+        if(ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1)
+            u4_pad_ht = MIN(MB_SIZE, (MB_SIZE - u4_pad_bottom_sz));
+
+        ih264_pad_right_luma(
+                        ps_proc->pu1_src_buf_luma + ps_codec->s_cfg.u4_disp_wd,
+                        ps_proc->i4_src_strd, u4_pad_ht, u4_pad_wd);
+
+        ih264_pad_right_chroma(
+                        ps_proc->pu1_src_buf_chroma + ps_codec->s_cfg.u4_disp_wd,
+                        ps_proc->i4_src_strd, u4_pad_ht / 2, u4_pad_wd);
+    }
+
     /* pad bottom edge */
     if (u4_pad_bottom_sz && (ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1) && ps_proc->i4_mb_x == 0)
     {
         ih264_pad_bottom(ps_proc->pu1_src_buf_luma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd,
-                         ps_proc->i4_src_strd, MB_SIZE, u4_pad_bottom_sz);
+                         ps_proc->i4_src_strd, ps_proc->i4_src_strd, u4_pad_bottom_sz);
 
         ih264_pad_bottom(ps_proc->pu1_src_buf_chroma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd / 2,
-                         ps_proc->i4_src_strd, MB_SIZE, (u4_pad_bottom_sz / 2));
+                         ps_proc->i4_src_strd, ps_proc->i4_src_strd, (u4_pad_bottom_sz / 2));
     }
 
+
     /* packed mb coeff data */
     ps_proc->pv_mb_coeff_data = ((UWORD8 *)ps_proc->pv_pic_mb_coeff_data) + i4_mb_y * ps_codec->u4_size_coeff_data;
 
@@ -1266,7 +1274,7 @@
         /* padding left chroma */
         ps_codec->pf_pad_left_chroma(pu1_curr_pic_chroma, i4_rec_strd, i4_pad_ht >> 1, PAD_LEFT);
     }
-    else if (i4_mb_x == ps_proc->i4_wd_mbs - 1)
+    if (i4_mb_x == ps_proc->i4_wd_mbs - 1)
     {
         /* padding right luma */
         ps_codec->pf_pad_right_luma(pu1_curr_pic_luma + MB_SIZE, i4_rec_strd, i4_pad_ht, PAD_RIGHT);
@@ -1300,7 +1308,7 @@
 
             wd += PAD_LEFT;
         }
-        else if (i4_mb_x == ps_proc->i4_wd_mbs - 1)
+        if (i4_mb_x == ps_proc->i4_wd_mbs - 1)
         {
             wd += PAD_RIGHT;
         }
@@ -1415,7 +1423,7 @@
         }
     }
 
-    if (i4_mb_y > 0)
+    if ((i4_mb_y > 0) || (i4_mb_y == (ps_proc->i4_ht_mbs - 1)))
     {
         /* if number of mb's to be processed are less than 'N', go back.
          * exception to the above clause is end of row */
@@ -1442,10 +1450,10 @@
             }
 
             /* performing deblocking for required number of MBs */
-            if (ps_proc->u4_disable_deblock_level != 1)
+            if ((i4_mb_y > 0) && (ps_proc->u4_disable_deblock_level != 1))
             {
                 /* Top or Top right MBs not deblocked */
-                if (u4_deblk_prev_row != 1)
+                if ((u4_deblk_prev_row != 1) && (i4_mb_y > 0))
                 {
                     return IH264E_SUCCESS;
                 }
@@ -1483,7 +1491,7 @@
 
                 }
             }
-            else
+            else if(i4_mb_y > 0)
             {
                 ps_deblk->i4_mb_x += i4_n_mb_process_count;
 
@@ -1606,6 +1614,19 @@
 
                     }
 
+                    /* In case height is at most 2 MBs, pad top */
+                    if (ps_proc->i4_ht_mbs <= 2)
+                    {
+                        UWORD8 *pu1_pad_top_src;
+                        /* padding top luma */
+                        pu1_pad_top_src = ps_proc->pu1_rec_buf_luma_base - PAD_LEFT;
+                        ps_codec->pf_pad_top(pu1_pad_top_src, i4_rec_strd, i4_rec_strd, PAD_TOP);
+
+                        /* padding top chroma */
+                        pu1_pad_top_src = ps_proc->pu1_rec_buf_chroma_base - PAD_LEFT;
+                        ps_codec->pf_pad_top(pu1_pad_top_src, i4_rec_strd, i4_rec_strd, (PAD_TOP >> 1));
+                    }
+
                     /* padding bottom luma */
                     pu1_pad_bottom_src = ps_proc->pu1_rec_buf_luma_base + ps_proc->i4_ht_mbs * MB_SIZE * i4_rec_strd - PAD_LEFT;
                     ps_codec->pf_pad_bottom(pu1_pad_bottom_src, i4_rec_strd, i4_rec_strd, PAD_BOT);