SIMD support for performing color conversion using MIPS DSPr2 instructions


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@993 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/ChangeLog.txt b/ChangeLog.txt
index c859656..7c54a82 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -6,6 +6,10 @@
 compatible with X Video.)  Also, the decompress-to-YUV function has been
 extended to support image scaling.
 
+[2] Added SIMD acceleration for performing color conversion on DSPr2-capable
+MIPS platforms.  This speeds up the compression of full-color JPEGs by 6-17%
+on such platforms and decompression by 3-5%.
+
 
 1.3.0
 =====
diff --git a/acinclude.m4 b/acinclude.m4
index ea8ec52..8031136 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -180,3 +180,33 @@
     $2
   fi
 ])
+
+# AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE
+# --------------------------
+# Test whether the assembler is suitable and supports MIPS instructions
+AC_DEFUN([AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE],[
+  have_mips_dspr2=no
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -mdspr2"
+
+  AC_COMPILE_IFELSE([[
+
+  int main ()
+  {
+    int c = 0, a = 0, b = 0;
+    __asm__ __volatile__ (
+        "precr.qb.ph %[c], %[a], %[b]          \n\t"
+        : [c] "=r" (c)
+        : [a] "r" (a), [b] "r" (b)
+    );
+    return c;
+  }
+  ]], have_mips_dspr2=yes)
+  CFLAGS=$ac_save_CFLAGS
+
+  if test "x$have_mips_dspr2" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
diff --git a/configure.ac b/configure.ac
index 1a1632e..cb13fb3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -425,6 +425,16 @@
          with_simd=no
          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])])
       ;;
+    mipsel*)
+      AC_MSG_RESULT([yes (mipsel)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE(
+        [AC_MSG_RESULT([yes])
+         simd_arch=mipsel],
+        [AC_MSG_RESULT([no])
+         with_simd=no
+         AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])])
+      ;;
     *)
       AC_MSG_RESULT([no ("$host_cpu")])
       AC_MSG_WARN([SIMD support not available for this CPU.  Performance will suffer.])
@@ -444,6 +454,7 @@
 AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
 AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
 AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
+AM_CONDITIONAL([SIMD_MIPSEL], [test "x$simd_arch" = "xmipsel"])
 AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
 AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
 
diff --git a/simd/Makefile.am b/simd/Makefile.am
index a12ff6e..272c2a4 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -58,6 +58,12 @@
 
 endif
 
+if SIMD_MIPSEL
+
+libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2.S
+
+endif
+
 AM_CPPFLAGS = -I$(top_srcdir) 
 
 .asm.lo:
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 3d4751f..de1f495 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -3,6 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,6 +19,7 @@
 #define JSIMD_SSE        0x04
 #define JSIMD_SSE2       0x08
 #define JSIMD_ARM_NEON   0x10
+#define JSIMD_MIPS_DSPR2 0x20
 
 /* Short forms of external names for systems with brain-damaged linkers. */
 
@@ -386,6 +388,64 @@
              JSAMPIMAGE input_buf, JDIMENSION input_row,
              JSAMPARRAY output_buf, int num_rows));
 
+EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN (void) jsimd_ycc_rgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
 /* SIMD Downsample */
 EXTERN(void) jsimd_h2v2_downsample_mmx
         JPP((JDIMENSION image_width, int max_v_samp_factor,
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
new file mode 100644
index 0000000..f74d908
--- /dev/null
+++ b/simd/jsimd_mips.c
@@ -0,0 +1,465 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on
+ * MIPS architecture.
+ *
+ * Based on the stubs from 'jsimd_none.c'
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+LOCAL(int)
+parse_proc_cpuinfo(const char* search_string)
+{
+  const char* file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE* f = NULL;
+  simd_support = 0;
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_MIPS_DSPR2;
+        return 1;
+      }
+    }
+    fclose(f);
+  }
+  /* Did not find string in the proc file, or not Linux ELF. */
+  return 0;
+}
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_MIPS_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  if (!parse_proc_cpuinfo("MIPS 74K"))
+    return;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
+
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
+      break;
+    default:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
+      break;
+  default:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
new file mode 100644
index 0000000..b2ab04f
--- /dev/null
+++ b/simd/jsimd_mips_dspr2.S
@@ -0,0 +1,250 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_mips_dspr2_asm.h"
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_mips_dspr2
+ * jsimd_extbgr_ycc_convert_mips_dspr2
+ * jsimd_extrgbx_ycc_convert_mips_dspr2
+ * jsimd_extbgrx_ycc_convert_mips_dspr2
+ * jsimd_extxbgr_ycc_convert_mips_dspr2
+ * jsimd_extxrgb_ycc_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r,    \
+                     g,    \
+                     b,    \
+                     inptr
+    lbu     \r, \r_offs(\inptr)
+    lbu     \g, \g_offs(\inptr)
+    lbu     \b, \b_offs(\inptr)
+    addiu   \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - output_buf
+ * a3     - output_row
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw      t7, 48(sp)        // t7 = num_rows
+    li      s0, 0x4c8b        // FIX(0.29900)
+    li      s1, 0x9646        // FIX(0.58700)
+    li      s2, 0x1d2f        // FIX(0.11400)
+    li      s3, 0xffffd4cd    // -FIX(0.16874)
+    li      s4, 0xffffab33    // -FIX(0.33126)
+    li      s5, 0x8000        // FIX(0.50000)
+    li      s6, 0xffff94d1    // -FIX(0.41869)
+    li      s7, 0xffffeb2f    // -FIX(0.08131)
+    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
+
+0:
+    addiu   t7, -1            // --num_rows
+    lw      t6, 0(a1)         // t6 = input_buf[0]
+    lw      t0, 0(a2)
+    lw      t1, 4(a2)
+    lw      t2, 8(a2)
+    sll     t3, a3, 2
+    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
+    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
+    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
+
+    addu    t9, t2, a0        // t9 = end address
+    addiu   a3, 1
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo    s5, $ac0
+    mtlo    t8, $ac1
+    mtlo    t8, $ac2
+    maddu   $ac0, s2, t5
+    maddu   $ac1, s5, t5
+    maddu   $ac2, s5, t3
+    maddu   $ac0, s0, t3
+    maddu   $ac1, s3, t3
+    maddu   $ac2, s6, t4
+    maddu   $ac0, s1, t4
+    maddu   $ac1, s4, t4
+    maddu   $ac2, s7, t5
+    extr.w  t3, $ac0, 16
+    extr.w  t4, $ac1, 16
+    extr.w  t5, $ac2, 16
+    sb      t3, 0(t0)
+    sb      t4, 0(t1)
+    sb      t5, 0(t2)
+    addiu   t0, 1
+    addiu   t2, 1
+    bne     t2, t9, 1b
+     addiu  t1, 1
+    bgtz    t7, 0b
+     addiu  a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_mips_dspr2
+ * jsimd_ycc_extbgr_convert_mips_dspr2
+ * jsimd_ycc_extrgbx_convert_mips_dspr2
+ * jsimd_ycc_extbgrx_convert_mips_dspr2
+ * jsimd_ycc_extxbgr_convert_mips_dspr2
+ * jsimd_ycc_extxrgb_convert_mips_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 \
+                         scratch1 \
+                         scratch2 \
+                         outptr
+    sb       \scratch0, \r_offs(\outptr)
+    sb       \scratch1, \g_offs(\outptr)
+    sb       \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li       t0, 0xFF
+    sb       t0, \a_offs(\outptr)
+.endif
+    addiu    \outptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - input_row
+ * a3     - output_buf
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw         s1, 48(sp)
+    li         t3, 0x8000
+    li         t4, 0x166e9     // FIX(1.40200)
+    li         t5, 0x1c5a2     // FIX(1.77200)
+    li         t6, 0xffff492e  // -FIX(0.71414)
+    li         t7, 0xffffa7e6  // -FIX(0.34414)
+    repl.ph    t8, 128
+
+0:
+    lw         s0, 0(a3)
+    lw         t0, 0(a1)
+    lw         t1, 4(a1)
+    lw         t2, 8(a1)
+    sll        s5, a2, 2
+    addiu      s1, -1
+    lwx        s2, s5(t0)
+    lwx        s3, s5(t1)
+    lwx        s4, s5(t2)
+    addu       t9, s2, a0
+    addiu      a2, 1
+
+1:
+    lbu        s7, 0(s4)       // cr
+    lbu        s6, 0(s3)       // cb
+    lbu        s5, 0(s2)       // y
+    addiu      s2, 1
+    addiu      s4, 1
+    addiu      s7, -128
+    addiu      s6, -128
+    mul        t2, t7, s6
+    mul        t0, t6, s7      // Crgtab[cr]
+    sll        s7, 15
+    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
+    sll        s6, 15
+    addu       t2, t3          // Cbgtab[cb]
+    addu       t2, t0
+
+    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
+    sra        t2, 16
+    addu       t1, s5
+    addu       t2, s5          // add y
+    ins        t2, t1, 16, 16
+    subu.ph    t2, t2, t8
+    addu       t0, s5
+    shll_s.ph  t2, t2, 8
+    subu       t0, 128
+    shra.ph    t2, t2, 8
+    shll_s.w   t0, t0, 24
+    addu.ph    t2, t2, t8      // clip & store
+    sra        t0, t0, 24
+    sra        t1, t2, 16
+    addiu      t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne        s2, t9, 1b
+     addiu     s3, 1
+    bgtz       s1, 0b
+     addiu     a3, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+/*****************************************************************************/
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
new file mode 100644
index 0000000..53cf2bc
--- /dev/null
+++ b/simd/jsimd_mips_dspr2_asm.h
@@ -0,0 +1,252 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT   $1
+#define v0   $2
+#define v1   $3
+#define a0   $4
+#define a1   $5
+#define a2   $6
+#define a3   $7
+#define t0   $8
+#define t1   $9
+#define t2   $10
+#define t3   $11
+#define t4   $12
+#define t5   $13
+#define t6   $14
+#define t7   $15
+#define s0   $16
+#define s1   $17
+#define s2   $18
+#define s3   $19
+#define s4   $20
+#define s5   $21
+#define s6   $22
+#define s7   $23
+#define t8   $24
+#define t9   $25
+#define k0   $26
+#define k1   $27
+#define gp   $28
+#define sp   $29
+#define fp   $30
+#define s8   $30
+#define ra   $31
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol)                           \
+                .globl  symbol;                         \
+                .align  2;                              \
+                .type   symbol, @function;              \
+                .ent    symbol, 0;                      \
+symbol:         .frame  sp, 0, ra;                      \
+                .set    push;                           \
+                .set    arch=mips32r2;                  \
+                .set    noreorder;                      \
+                .set    noat;
+
+/*
+ * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_MIPS_DSPR2(symbol)                         \
+LEAF_MIPS32R2(symbol)                                   \
+                .set    dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function)                                   \
+                .set    pop;                            \
+                .end    function;                       \
+                .size   function,.-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2  = 0, r3  = 0, r4  = 0, \
+                          r5  = 0, r6  = 0, r7  = 0, \
+                          r8  = 0, r9  = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, -\stack_offset
+    .endif
+    sw              \r1, 0(sp)
+    .if \r2 != 0
+    sw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    sw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    sw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw              \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2  = 0, r3  = 0, r4  = 0, \
+                               r5  = 0, r6  = 0, r7  = 0, \
+                               r8  = 0, r9  = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    lw              \r1, 0(sp)
+    .if \r2 != 0
+    lw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    lw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    lw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw              \r14, 52(sp)
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, \stack_offset
+    .endif
+.endm
+
+