Add SSE2 SIMD implementation of computationally intensive routines.


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@22 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/jsimd.c b/jsimd.c
index 6c60b5b..3248911 100644
--- a/jsimd.c
+++ b/jsimd.c
@@ -63,6 +63,9 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -82,6 +85,9 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -94,7 +100,11 @@
                        JDIMENSION output_row, int num_rows)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    jsimd_rgb_ycc_convert_sse2(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_MMX)
     jsimd_rgb_ycc_convert_mmx(cinfo->image_width, input_buf,
         output_buf, output_row, num_rows);
 #endif
@@ -106,7 +116,11 @@
                        JSAMPARRAY output_buf, int num_rows)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    jsimd_ycc_rgb_convert_sse2(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+  else if (simd_support & JSIMD_MMX)
     jsimd_ycc_rgb_convert_mmx(cinfo->output_width, input_buf,
         input_row, output_buf, num_rows);
 #endif
@@ -123,6 +137,8 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -140,6 +156,8 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -151,7 +169,11 @@
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
         compptr->v_samp_factor, compptr->width_in_blocks,
         input_data, output_data);
@@ -163,7 +185,11 @@
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
         compptr->v_samp_factor, compptr->width_in_blocks,
         input_data, output_data);
@@ -181,6 +207,8 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -198,6 +226,8 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -211,7 +241,10 @@
                      JSAMPARRAY * output_data_ptr)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
         cinfo->output_width, input_data, output_data_ptr);
 #endif
@@ -224,7 +257,10 @@
                      JSAMPARRAY * output_data_ptr)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
         cinfo->output_width, input_data, output_data_ptr);
 #endif
@@ -241,6 +277,9 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -258,6 +297,9 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -271,7 +313,11 @@
                            JSAMPARRAY * output_data_ptr)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
         compptr->downsampled_width, input_data, output_data_ptr);
 #endif
@@ -284,7 +330,11 @@
                            JSAMPARRAY * output_data_ptr)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
         compptr->downsampled_width, input_data, output_data_ptr);
 #endif
@@ -301,6 +351,9 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -318,6 +371,9 @@
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -331,7 +387,11 @@
                             JSAMPARRAY output_buf)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    jsimd_h2v2_merged_upsample_sse2(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_merged_upsample_mmx(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
 #endif
@@ -344,7 +404,11 @@
                             JSAMPARRAY output_buf)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    jsimd_h2v1_merged_upsample_sse2(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_merged_upsample_mmx(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
 #endif
@@ -365,6 +429,8 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -386,6 +452,8 @@
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_SSE)
     return 1;
   if (simd_support & JSIMD_3DNOW)
@@ -399,7 +467,9 @@
                 DCTELEM * workspace)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_MMX)
     jsimd_convsamp_mmx(sample_data, start_col, workspace);
 #endif
 }
@@ -409,7 +479,9 @@
                       FAST_FLOAT * workspace)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_SSE)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE)
     jsimd_convsamp_float_sse(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_3DNOW)
     jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
@@ -427,6 +499,8 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -444,6 +518,8 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -473,7 +549,9 @@
 jsimd_fdct_islow (DCTELEM * data)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_islow_sse2(data);
+  else if (simd_support & JSIMD_MMX)
     jsimd_fdct_islow_mmx(data);
 #endif
 }
@@ -482,7 +560,9 @@
 jsimd_fdct_ifast (DCTELEM * data)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else if (simd_support & JSIMD_MMX)
     jsimd_fdct_ifast_mmx(data);
 #endif
 }
@@ -511,6 +591,8 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -530,6 +612,8 @@
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
+  if (simd_support & JSIMD_SSE2)
+    return 1;
   if (simd_support & JSIMD_SSE)
     return 1;
   if (simd_support & JSIMD_3DNOW)
@@ -543,7 +627,9 @@
                 DCTELEM * workspace)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_MMX)
     jsimd_quantize_mmx(coef_block, divisors, workspace);
 #endif
 }
@@ -553,7 +639,9 @@
                       FAST_FLOAT * workspace)
 {
 #ifdef WITH_SIMD
-  if (simd_support & JSIMD_SSE)
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE)
     jsimd_quantize_float_sse(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_3DNOW)
     jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
@@ -577,6 +665,8 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -600,6 +690,8 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -612,7 +704,9 @@
                 JDIMENSION output_col)
 {
 #if WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
     jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 #endif
 }
@@ -623,7 +717,9 @@
                 JDIMENSION output_col)
 {
 #if WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
     jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 #endif
 }
@@ -645,6 +741,8 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -670,6 +768,8 @@
   if (IFAST_SCALE_BITS != 2)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
   if (simd_support & JSIMD_MMX)
     return 1;
 
@@ -694,6 +794,8 @@
   if (sizeof(FLOAT_MULT_TYPE) != 4)
     return 0;
 
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
     return 1;
   if (simd_support & JSIMD_3DNOW)
@@ -708,7 +810,9 @@
                 JDIMENSION output_col)
 {
 #if WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
     jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 #endif
 }
@@ -719,7 +823,9 @@
                 JDIMENSION output_col)
 {
 #if WITH_SIMD
-  if (simd_support & JSIMD_MMX)
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
     jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 #endif
 }
@@ -730,7 +836,10 @@
                 JDIMENSION output_col)
 {
 #if WITH_SIMD
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block,
+        output_buf, output_col);
+  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
     jsimd_idct_float_sse(compptr->dct_table, coef_block,
         output_buf, output_col);
   else if (simd_support & JSIMD_3DNOW)