Update libjpeg-turbo to 2.0.0
Bug: 78329453
Update to upstream at https://github.com/libjpeg-turbo/libjpeg-turbo/tree/2.0.0
This includes a fix for a bug that could result in an infinite loop.
ChangeLog.md contains detailed changes about the upstream library. Changes
I made are below:
- Remove files that are no longer in upstream, and include all current
files from upstream.
- Update various references to the version.
Android.bp:
- Update to build new files/files in new locations.
- Run bpfmt
README.android:
- Remove cherry-pick references, as they are no longer needed.
- Remove modification in jsimdext.inc, which no longer appears to be
necessary.
README.version:
- Use the github URL, as it is now the official upstream build
- Replace msarett as OWNER, as he no longer works on this project
- Update the version
Change-Id: Ie6cfee5a8f820f28656bbb305f500e75e7ce7915
diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c
new file mode 100644
index 0000000..af886f6
--- /dev/null
+++ b/simd/mips/jsimd.c
@@ -0,0 +1,1115 @@
+/*
+ * jsimd.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0; /* all bits set = not yet probed (see init_simd) */
+
+#if defined(__linux__)
+
+LOCAL(int)
+parse_proc_cpuinfo(const char *search_string)
+{
+  char line[256];
+  int found = 0;
+  FILE *cpuinfo;
+
+  /* Start from "no SIMD"; the DSPr2 bit is added only on a match. */
+  simd_support = 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo == NULL)
+    return 0;                   /* file could not be opened */
+
+  /* Read line by line, stopping at the first line containing the string. */
+  while (!found && fgets(line, sizeof(line), cpuinfo) != NULL)
+    found = (strstr(line, search_string) != NULL);
+  fclose(cpuinfo);
+  if (found)
+    simd_support |= JSIMD_DSPR2;
+  return found;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+
+  /* Anything other than the ~0U sentinel means detection already ran. */
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* Not compiled with DSPr2 enabled: detect it at run time by looking for
+   * a 74K core in /proc/cpuinfo.  A failed probe leaves simd_support at 0
+   * but must not return early, or the environment overrides are skipped. */
+  (void)parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEDSPR2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_DSPR2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+static const int mips_idct_ifast_coefs[4] = { /* 16-bit coef in both halves */
+ 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */
+ 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */
+ 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */
+ 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
+
+/* Borrowed from jdsample.c; jsimd_int_upsample() casts cinfo->upsample to it. */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+typedef struct {
+ struct jpeg_upsampler pub;
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+ upsample1_ptr methods[MAX_COMPONENTS];
+ int next_row_out;
+ JDIMENSION rows_to_go;
+ int rowgroup_height[MAX_COMPONENTS];
+ UINT8 h_expand[MAX_COMPONENTS]; /* indexed by component_index */
+ UINT8 v_expand[MAX_COMPONENTS]; /* indexed by component_index */
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples, a 4-byte JDIMENSION and
+   * 3- or 4-byte RGB pixels; any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples, a 4-byte JDIMENSION and
+   * 3- or 4-byte RGB pixels; any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples, a 4-byte JDIMENSION and
+   * 3- or 4-byte RGB pixels; any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0; /* always unsupported; jsimd_ycc_rgb565_convert() is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  /* Plain RGB is the starting choice, which JCS_EXT_RGB and every color
+   * space not listed below keep; the switch only overrides it for the
+   * other extended pixel layouts. */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_ycc_convert_dspr2;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_ycc_convert_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  /* Plain RGB is the starting choice, which JCS_EXT_RGB and every color
+   * space not listed below keep; the switch only overrides it for the
+   * other extended pixel layouts. */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_gray_convert_dspr2;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_gray_convert_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  /* Plain RGB is the starting choice, which JCS_EXT_RGB and every color
+   * space not listed below keep; the switch only overrides it for the
+   * other extended pixel layouts. */
+  void (*convert) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
+    jsimd_ycc_extrgb_convert_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_ycc_extrgbx_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_ycc_extbgr_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_ycc_extbgrx_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_ycc_extxbgr_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_ycc_extxrgb_convert_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void) /* empty stub; jsimd_can_ycc_rgb565() always returns 0 */
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(void) /* forwards to the DSPr2 routine with image_width and num_components */
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+ output_row, num_rows, cinfo->num_components);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples, a 4-byte JDIMENSION and
+   * DCTSIZE 8; any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void) /* forwards cinfo/compptr fields and both buffers to the DSPr2 routine */
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void) /* note: buffers come first here, unlike the non-smooth variants */
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+ compptr->v_samp_factor,
+ cinfo->max_v_samp_factor,
+ cinfo->smoothing_factor,
+ compptr->width_in_blocks,
+ cinfo->image_width);
+}
+
+GLOBAL(void) /* forwards cinfo/compptr fields and both buffers to the DSPr2 routine */
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void) /* compptr is unused; forwards max_v_samp_factor and output_width */
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void) /* compptr is unused; forwards max_v_samp_factor and output_width */
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr ups = (my_upsample_ptr)cinfo->upsample;
+  int ci = compptr->component_index;    /* selects this component's factors */
+
+  jsimd_int_upsample_dspr2(ups->h_expand[ci], ups->v_expand[ci], input_data,
+                           output_data_ptr, cinfo->output_width,
+                           cinfo->max_v_samp_factor);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void) /* forwards max_v_samp_factor and the component's downsampled_width */
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void) /* forwards max_v_samp_factor and the component's downsampled_width */
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles 8-bit samples and a 4-byte JDIMENSION;
+   * any other build configuration is rejected. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Plain RGB is the starting choice, which JCS_EXT_RGB and every color
+   * space not listed below keep; the switch only overrides it for the
+   * other extended pixel layouts. */
+  void (*merge) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *) =
+    jsimd_h2v2_extrgb_merged_upsample_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    merge = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    merge = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    merge = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    merge = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    merge = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  merge(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+        cinfo->sample_range_limit);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Plain RGB is the starting choice, which JCS_EXT_RGB and every color
+   * space not listed below keep; the switch only overrides it for the
+   * other extended pixel layouts. */
+  void (*merge) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *) =
+    jsimd_h2v1_extrgb_merged_upsample_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    merge = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    merge = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    merge = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    merge = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    merge = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  merge(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+        cinfo->sample_range_limit);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, a 4-byte
+   * JDIMENSION and a 2-byte DCTELEM; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void) /* direct pass-through to the DSPr2 routine */
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void) /* direct pass-through to the DSPr2 routine */
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8 and a 2-byte DCTELEM; any other
+   * build configuration is rejected. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8 and a 2-byte DCTELEM; any other
+   * build configuration is rejected. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0; /* always unsupported; jsimd_fdct_float() is an empty stub */
+}
+
+GLOBAL(void) /* direct pass-through to the DSPr2 routine */
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_dspr2(data);
+}
+
+GLOBAL(void) /* direct pass-through to the DSPr2 routine */
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_dspr2(data);
+}
+
+GLOBAL(void) /* empty stub; jsimd_can_fdct_float() always returns 0 */
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8 with 2-byte JCOEF and DCTELEM;
+   * any other build configuration is rejected. */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void) /* direct pass-through to the DSPr2 routine */
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void) /* direct pass-through to the DSPr2 routine */
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  init_simd();
+
+  /* Build-configuration checks: DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void) /* cinfo is unused; forwards compptr->dct_table and the buffers */
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  int scratch[DCTSIZE * 4];     /* inter-pass buffer lent to the DSPr2 routine */
+
+  jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+                       scratch);
+}
+
+GLOBAL(void) /* cinfo is unused; forwards compptr->dct_table and the buffers */
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int workspace[96];            /* handed from pass 1 to pass 2 */
+  int output[12];               /* one destination address per output row */
+  int row;
+
+  /* Pass 2 is given the destination of each of the 12 output rows as an
+   * int, so every row pointer (already offset by output_col) is cast here.
+   *
+   * NOTE(review): the pointer-to-int cast presumably relies on 32-bit
+   * pointers -- confirm before building for a 64-bit MIPS ABI. */
+  for (row = 0; row < 12; row++)
+    output[row] = (int)(output_buf[row] + output_col);
+
+  /* Pass 1 takes the coefficient block, dct_table and the workspace. */
+  jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+
+  /* Pass 2 takes the workspace and the per-row destination addresses. */
+  jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; anything else is rejected. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The DSPr2 code only handles DCTSIZE 8, 8-bit samples, 2-byte JCOEF and
+   * IFAST_MULT_TYPE, a 4-byte JDIMENSION and IFAST_SCALE_BITS equal to 2. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  /* Usable only when the DSPr2 bit is set in simd_support. */
+  if ((simd_support & JSIMD_DSPR2) == 0)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0; /* always unsupported; jsimd_idct_float() is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int output[8];                /* one destination address per output row */
+  int row;
+
+  /* The DSPr2 routine is given the destination of each of the 8 output
+   * rows as an int, so every row pointer (already offset by output_col)
+   * is cast here.
+   * NOTE(review): the pointer-to-int cast presumably relies on 32-bit
+   * pointers -- confirm before building for a 64-bit MIPS ABI. */
+  for (row = 0; row < 8; row++)
+    output[row] = (int)(output_buf[row] + output_col);
+
+  jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+                         IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  /* The component's dct_table, viewed as IFAST_MULT_TYPE entries. */
+  IFAST_MULT_TYPE *quant_table = (IFAST_MULT_TYPE *)compptr->dct_table;
+  DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns from the input coefficient block and store
+   * them into the work array.  Both passes are also handed the
+   * mips_idct_ifast_coefs constant table. */
+
+  jsimd_idct_ifast_cols_dspr2(coef_block, quant_table, workspace,
+                              mips_idct_ifast_coefs);
+
+  /* Pass 2: process rows from the work array and store them into the
+   * output array.  Note that the results must be descaled by a factor of
+   * 8 == 2**3, and the PASS1_BITS scaling must be undone as well. */
+
+  jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+                              mips_idct_ifast_coefs);
+}
+
+GLOBAL(void) /* empty stub; jsimd_can_idct_float() always returns 0 */
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0; /* always unsupported; jsimd_huff_encode_one_block() just returns NULL */
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL; /* stub; jsimd_can_huff_encode_one_block() always returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* always unsupported; the matching prepare function is an empty stub */
+}
+
+GLOBAL(void) /* empty stub; jsimd_can_encode_mcu_AC_first_prepare() returns 0 */
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0; /* always unsupported; the matching prepare function is a stub */
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0; /* stub; jsimd_can_encode_mcu_AC_refine_prepare() always returns 0 */
+}
diff --git a/simd/mips/jsimd_dspr2.S b/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000..2ec543e
--- /dev/null
+++ b/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4471 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
+ * Darko Laus <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression
+ *
+ * For every row and every component ci, the samples found at inptr[ci],
+ * inptr[ci + nc], inptr[ci + 2*nc], ... (nc = num_components) are copied
+ * into the contiguous row output_buf[ci][output_row].
+ *
+ * The row loop exists twice: labels 0-3 serve widths that are not a
+ * multiple of 4 (the image_width & 3 leftover samples are copied first, one
+ * at a time, then the rest four at a time); labels 4-6 serve widths that
+ * are a multiple of 4.
+ *
+ * The stack offsets listed above are the ones on entry; the loads below use
+ * them plus the 8 bytes named in SAVE_REGS_ON_STACK.  The instruction that
+ * follows each branch sits in its delay slot and always executes.
+ */
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ lw t9, 24(sp) // t9 = num_rows
+ lw s0, 28(sp) // s0 = cinfo->num_components
+ andi t0, a0, 3 // t0 = cinfo->image_width & 3
+ beqz t0, 4f // no residual
+ nop
+0:
+ addiu t9, t9, -1 // --num_rows
+ bltz t9, 7f // all rows done
+ li t1, 0 // t1 = ci = 0
+1:
+ sll t3, t1, 2 // t3 = ci * 4 (byte offset into output_buf)
+ lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
+ lw t2, 0(a1) // t2 = inptr = *input_buf
+ sll t4, a3, 2 // t4 = output_row * 4
+ lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
+ addu t2, t2, t1 // t2 = inptr + ci
+ addu s1, t5, a0 // s1 = outptr + image_width (end of row)
+ addu t6, t5, t0 // t6 = outptr + residual
+2:
+ lbu t3, 0(t2)
+ addiu t5, t5, 1
+ sb t3, -1(t5)
+ bne t6, t5, 2b
+ addu t2, t2, s0 // inptr += num_components
+ // Loop 3 below is bottom-tested.  When image_width < 4 the residual loop
+ // has already filled the whole row (outptr == row end), and entering
+ // loop 3 would store past the row and never meet its exit test.
+ beq s1, t5, 8f
+ nop
+3:
+ lbu t3, 0(t2)
+ addu t4, t2, s0
+ addu t7, t4, s0
+ addu t8, t7, s0
+ addu t2, t8, s0 // inptr += 4 * num_components
+ lbu t4, 0(t4)
+ lbu t7, 0(t7)
+ lbu t8, 0(t8)
+ addiu t5, t5, 4
+ sb t3, -4(t5)
+ sb t4, -3(t5)
+ sb t7, -2(t5)
+ bne s1, t5, 3b
+ sb t8, -1(t5)
+8:
+ addiu t1, t1, 1 // ++ci
+ bne t1, s0, 1b
+ nop
+ addiu a1, a1, 4 // ++input_buf
+ bgez t9, 0b
+ addiu a3, a3, 1 // ++output_row
+ b 7f
+ nop
+4:
+ addiu t9, t9, -1 // --num_rows
+ bltz t9, 7f // all rows done
+ li t1, 0 // t1 = ci = 0
+5:
+ sll t3, t1, 2 // t3 = ci * 4 (byte offset into output_buf)
+ lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
+ lw t2, 0(a1) // t2 = inptr = *input_buf
+ sll t4, a3, 2 // t4 = output_row * 4
+ lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
+ addu t2, t2, t1 // t2 = inptr + ci
+ addu s1, t5, a0 // s1 = outptr + image_width (end of row)
+6:
+ lbu t3, 0(t2)
+ addu t4, t2, s0
+ addu t7, t4, s0
+ addu t8, t7, s0
+ addu t2, t8, s0 // inptr += 4 * num_components
+ lbu t4, 0(t4)
+ lbu t7, 0(t7)
+ lbu t8, 0(t8)
+ addiu t5, t5, 4
+ sb t3, -4(t5)
+ sb t4, -3(t5)
+ sb t7, -2(t5)
+ bne s1, t5, 6b
+ sb t8, -1(t5)
+ addiu t1, t1, 1 // ++ci
+ bne t1, s0, 5b
+ nop
+ addiu a1, a1, 4 // ++input_buf
+ bgez t9, 4b
+ addiu a3, a3, 1 // ++output_row
+7:
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ *
+ * The generator macro is expanded once per pixel layout.  colorid selects
+ * the function name, pixel_size is the number of bytes the input pointer
+ * advances per pixel, and r_offs/g_offs/b_offs are the byte offsets of the
+ * R, G and B samples inside one pixel.
+ *
+ * Each expansion converts one pixel per inner-loop iteration.  Three DSP
+ * accumulators collect the weighted sums ($ac0 = Y, $ac1 = Cb, $ac2 = Cr);
+ * each sum is shifted right by 16 and its low byte is stored.
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r, g, b, inptr
+ lbu \r, \r_offs(\inptr) // load the three samples of one pixel ...
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size // ... and step to the next pixel
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ *
+ * The stack offset above is the one on entry; the load below uses it plus
+ * the 32 bytes named in SAVE_REGS_ON_STACK.  Both loops are bottom-tested,
+ * so at least one row and one pixel are always processed.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw t7, 48(sp) // t7 = num_rows
+ li s0, 0x4c8b // FIX(0.29900)
+ li s1, 0x9646 // FIX(0.58700)
+ li s2, 0x1d2f // FIX(0.11400)
+ li s3, 0xffffd4cd // -FIX(0.16874)
+ li s4, 0xffffab33 // -FIX(0.33126)
+ li s5, 0x8000 // FIX(0.50000)
+ li s6, 0xffff94d1 // -FIX(0.41869)
+ li s7, 0xffffeb2f // -FIX(0.08131)
+ li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
+
+0:
+ addiu t7, -1 // --num_rows
+ lw t6, 0(a1) // t6 = input_buf[0]
+ lw t0, 0(a2) // t0 = output_buf[0]
+ lw t1, 4(a2) // t1 = output_buf[1]
+ lw t2, 8(a2) // t2 = output_buf[2]
+ sll t3, a3, 2 // t3 = output_row * 4
+ lwx t0, t3(t0) // t0 = output_buf[0][output_row]
+ lwx t1, t3(t1) // t1 = output_buf[1][output_row]
+ lwx t2, t3(t2) // t2 = output_buf[2][output_row]
+
+ addu t9, t2, a0 // t9 = end address
+ addiu a3, 1 // ++output_row
+
+1:
+ DO_RGB_TO_YCC t3, t4, t5, t6
+
+ mtlo s5, $ac0 // ac0 (Y) starts at 0x8000
+ mtlo t8, $ac1 // ac1 (Cb) starts at CBCR_OFFSET + ONE_HALF-1
+ mtlo t8, $ac2 // ac2 (Cr) starts at CBCR_OFFSET + ONE_HALF-1
+ maddu $ac0, s2, t5 // Y += FIX(0.11400) * b
+ maddu $ac1, s5, t5 // Cb += FIX(0.50000) * b
+ maddu $ac2, s5, t3 // Cr += FIX(0.50000) * r
+ maddu $ac0, s0, t3 // Y += FIX(0.29900) * r
+ maddu $ac1, s3, t3 // Cb += -FIX(0.16874) * r
+ maddu $ac2, s6, t4 // Cr += -FIX(0.41869) * g
+ maddu $ac0, s1, t4 // Y += FIX(0.58700) * g
+ maddu $ac1, s4, t4 // Cb += -FIX(0.33126) * g
+ maddu $ac2, s7, t5 // Cr += -FIX(0.08131) * b
+ extr.w t3, $ac0, 16 // t3 = Y sum >> 16
+ extr.w t4, $ac1, 16 // t4 = Cb sum >> 16
+ extr.w t5, $ac2, 16 // t5 = Cr sum >> 16
+ sb t3, 0(t0)
+ sb t4, 0(t1)
+ sb t5, 0(t2)
+ addiu t0, 1
+ addiu t2, 1
+ bne t2, t9, 1b // until the Cr pointer reaches the end address
+ addiu t1, 1
+ bgtz t7, 0b // more rows?
+ addiu a1, 4 // ++input_buf
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ *
+ * The generator macro is expanded once per pixel layout.  colorid selects
+ * the function name, pixel_size is the number of bytes the output pointer
+ * advances per pixel, and r_offs/g_offs/b_offs/a_offs are the byte offsets
+ * of the R, G, B and filler samples inside one output pixel.  a_offs is only
+ * used by the 4-byte layouts, where the filler byte is written as 0xFF.
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r_offs(\outptr)
+ sb \scratch1, \g_offs(\outptr)
+ sb \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+ li t0, 0xFF // t0 is overwritten here, after the three stores above
+ sb t0, \a_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = input_row
+ * a3 = output_buf
+ * 16(sp) = num_rows
+ *
+ * The stack offset above is the one on entry; the load below uses it plus
+ * the 32 bytes named in SAVE_REGS_ON_STACK.  Both loops are bottom-tested,
+ * so at least one row and one pixel are always processed.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s1, 48(sp) // s1 = num_rows
+ li t3, 0x8000 // added to the green sum before its >> 16
+ li t4, 0x166e9 // FIX(1.40200)
+ li t5, 0x1c5a2 // FIX(1.77200)
+ li t6, 0xffff492e // -FIX(0.71414)
+ li t7, 0xffffa7e6 // -FIX(0.34414)
+ repl.ph t8, 128 // t8 = 128 in both halfwords
+
+0:
+ lw s0, 0(a3) // s0 = outptr = *output_buf
+ lw t0, 0(a1) // t0 = input_buf[0]
+ lw t1, 4(a1) // t1 = input_buf[1]
+ lw t2, 8(a1) // t2 = input_buf[2]
+ sll s5, a2, 2 // s5 = input_row * 4
+ addiu s1, -1 // --num_rows
+ lwx s2, s5(t0) // s2 = input_buf[0][input_row] (y)
+ lwx s3, s5(t1) // s3 = input_buf[1][input_row] (cb)
+ lwx s4, s5(t2) // s4 = input_buf[2][input_row] (cr)
+ addu t9, s2, a0 // t9 = end of the y row
+ addiu a2, 1 // ++input_row
+
+1:
+ lbu s7, 0(s4) // cr
+ lbu s6, 0(s3) // cb
+ lbu s5, 0(s2) // y
+ addiu s2, 1
+ addiu s4, 1
+ addiu s7, -128 // cr - 128
+ addiu s6, -128 // cb - 128
+ mul t2, t7, s6 // -FIX(0.34414) * (cb - 128)
+ mul t0, t6, s7 // Crgtab[cr]
+ sll s7, 15
+ mulq_rs.w t1, t4, s7 // Crrtab[cr]
+ sll s6, 15
+ addu t2, t3 // Cbgtab[cb]
+ addu t2, t0
+
+ mulq_rs.w t0, t5, s6 // Cbbtab[cb]
+ sra t2, 16 // t2 = green offset
+ addu t1, s5 // t1 = y + red offset
+ addu t2, s5 // add y
+ ins t2, t1, 16, 16 // t2 = red | green as two halfwords
+ subu.ph t2, t2, t8 // bias both halfwords by -128
+ addu t0, s5 // t0 = y + blue offset
+ shll_s.ph t2, t2, 8 // saturating shift left ...
+ subu t0, 128
+ shra.ph t2, t2, 8 // ... and arithmetic shift back
+ shll_s.w t0, t0, 24 // same saturate-and-shift-back for blue
+ addu.ph t2, t2, t8 // clip & store
+ sra t0, t0, 24
+ sra t1, t2, 16 // t1 = red halfword
+ addiu t0, 128
+
+ STORE_YCC_TO_RGB t1, t2, t0, s0
+
+ bne s2, t9, 1b // until the y pointer reaches the end of the row
+ addiu s3, 1 // ++cb pointer
+ bgtz s1, 0b // more rows?
+ addiu a3, 4 // ++output_buf
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R G B A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ *
+ * The generator macro is expanded once per pixel layout.  colorid selects
+ * the function name, pixel_size is the number of bytes the input pointer
+ * advances per pixel, and r_offs/g_offs/b_offs are the byte offsets of the
+ * R, G and B samples inside one pixel.
+ *
+ * Each row is handled as a run of four-pixel groups (label 1) followed by
+ * image_width & 3 single pixels (label 3).
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY r, g, b, inptr
+ lbu \r, \r_offs(\inptr) // load the three samples of one pixel ...
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size // ... and step to the next pixel
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ *
+ * The stack offset above is the one on entry; the load below uses it plus
+ * the 32 bytes named in SAVE_REGS_ON_STACK.  The row loop is bottom-tested,
+ * so at least one row is always processed.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ li s0, 0x4c8b // s0 = FIX(0.29900)
+ li s1, 0x9646 // s1 = FIX(0.58700)
+ li s2, 0x1d2f // s2 = FIX(0.11400)
+ li s7, 0x8000 // s7 = FIX(0.50000)
+ lw s6, 48(sp) // s6 = num_rows
+ andi t7, a0, 3 // t7 = image_width & 3 (residual pixels)
+
+0:
+ addiu s6, -1 // s6 = num_rows
+ lw t0, 0(a1) // t0 = inptr = *input_buf
+ lw t1, 0(a2) // t1 = output_buf[0]
+ sll t3, a3, 2 // t3 = output_row * 4
+ lwx t1, t3(t1) // t1 = outptr = output_buf[0][output_row]
+ addiu a3, 1 // ++output_row
+ addu t9, t1, a0 // t9 = end of the output row
+ subu t8, t9, t7 // t8 = end of the four-pixel groups
+ beq t1, t8, 2f // fewer than four pixels: residual only
+ nop
+
+1:
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0 // ac0 = 0x8000, then pixel 0 weighted sum
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1 // ac1 = 0x8000, then pixel 1 weighted sum
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t6, $ac0, 16 // t6 = gray value of pixel 0
+
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0 // ac0 reused for pixel 2
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ extr.w t2, $ac1, 16 // t2 = gray value of pixel 1
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1 // ac1 reused for pixel 3
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t5, $ac0, 16 // t5 = gray value of pixel 2
+ sb t6, 0(t1)
+ sb t2, 1(t1)
+ extr.w t3, $ac1, 16 // t3 = gray value of pixel 3
+ addiu t1, 4
+ sb t5, -2(t1)
+ sb t3, -1(t1)
+ bne t1, t8, 1b
+ nop
+
+2:
+ beqz t7, 4f // no residual pixels
+ nop
+
+3:
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ extr.w t6, $ac0, 16
+ sb t6, 0(t1)
+ addiu t1, 1
+ bne t1, t9, 3b // until the end of the output row
+ nop
+
+4:
+ bgtz s6, 0b // more rows?
+ addiu a1, 4 // ++input_buf
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ *
+ * The generator macro is expanded once per pixel layout.  pixel_size is the
+ * byte size of TWO adjacent output pixels (6 or 8); the *1_offs and *2_offs
+ * arguments are the byte offsets of the samples of the first and second
+ * pixel of that pair.  The a*_offs filler bytes are only written (as 0xFF)
+ * when pixel_size is 8.
+ *
+ * One cb/cr pair yields the three chroma terms once; they are then added to
+ * four y samples (two from each of two y rows) and each sum is looked up in
+ * the range-limit table to produce two output rows.
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li \scratch0, 0xFF // scratch0 is reused for the filler value after its store above
+ sb \scratch0, \a1_offs(\outptr)
+ sb \scratch0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size // advance past both pixels
+.endm
+
+.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+ li t0, 0xFF // overwrites t0; outptr is not advanced by this macro
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ *
+ * The stack offset above is the one on entry; the load below uses it plus
+ * the 40 bytes named in SAVE_REGS_ON_STACK.  ra is in the save list because
+ * it is used as a scratch register in the loop; AT is used as scratch too.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ lw t9, 56(sp) // cinfo->sample_range_limit
+ lw v0, 0(a1) // v0 = input_buf[0]
+ lw v1, 4(a1) // v1 = input_buf[1]
+ lw t0, 8(a1) // t0 = input_buf[2]
+ sll t1, a2, 3 // t1 = in_row_group_ctr * 2 * 4
+ addiu t2, t1, 4 // t2 = byte offset of the following row pointer
+ sll t3, a2, 2 // t3 = in_row_group_ctr * 4
+ lw t4, 0(a3) // t4 = output_buf[0]
+ lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
+ lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
+ lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
+ lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
+ lw t7, 4(a3) // t7 = output_buf[1]
+ li s1, 0xe6ea
+ addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
+ addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
+ addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
+ xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
+ srl t3, a0, 1 // t3 = number of full pixel pairs
+ blez t3, 2f
+ addu t0, t5, t3 // t0 = end address
+ 1:
+ lbu t3, 0(t5) // t3 = cb
+ lbu s3, 0(t6) // s3 = cr
+ addiu t5, t5, 1
+ addiu t3, t3, -128 // (cb - 128)
+ addiu s3, s3, -128 // (cr - 128)
+ mult $ac1, s1, t3 // ac1 = -FIX(0.34414) * cb
+ madd $ac1, s2, s3 // ac1 += -FIX(0.71414) * cr
+ sll s3, s3, 15
+ sll t3, t3, 15
+ mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
+ extr_r.w s5, $ac1, 16 // s5 = green term (rounded >> 16)
+ mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
+ lbu v0, 0(t1) // y of row 0, first pixel
+ addiu t6, t6, 1
+ addiu t1, t1, 2
+ addu t3, v0, s4 // y+cred
+ addu s3, v0, s5 // y+cgreen
+ addu v1, v0, s6 // y+cblue
+ addu t3, t9, t3 // y+cred
+ addu s3, t9, s3 // y+cgreen
+ addu v1, t9, v1 // y+cblue
+ lbu AT, 0(t3) // range-limit table lookups
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, -1(t1) // y of row 0, second pixel
+ addu t3, v0, s4 // y+cred
+ addu s3, v0, s5 // y+cgreen
+ addu v1, v0, s6 // y+cblue
+ addu t3, t9, t3 // y+cred
+ addu s3, t9, s3 // y+cgreen
+ addu v1, t9, v1 // y+cblue
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2) // y of row 1, first pixel
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+ addu t3, v0, s4 // y+cred
+ addu s3, v0, s5 // y+cgreen
+ addu v1, v0, s6 // y+cblue
+ addu t3, t9, t3 // y+cred
+ addu s3, t9, s3 // y+cgreen
+ addu v1, t9, v1 // y+cblue
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, 1(t2) // y of row 1, second pixel
+ addiu t2, t2, 2
+ addu t3, v0, s4 // y+cred
+ addu s3, v0, s5 // y+cgreen
+ addu v1, v0, s6 // y+cblue
+ addu t3, t9, t3 // y+cred
+ addu s3, t9, s3 // y+cgreen
+ addu v1, t9, v1 // y+cblue
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+ bne t0, t5, 1b // until the cb pointer reaches the end address
+ nop
+2:
+ andi t0, a0, 1 // odd output_width: one pixel column left
+ beqz t0, 4f
+ lbu t3, 0(t5) // executed on both paths (follows the branch)
+ lbu s3, 0(t6)
+ addiu t3, t3, -128 // (cb - 128)
+ addiu s3, s3, -128 // (cr - 128)
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ lbu v0, 0(t1) // y of row 0
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
+ mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
+ addu t3, v0, s4 // y+cred
+ addu s3, v0, s5 // y+cgreen
+ addu v1, v0, s6 // y+cblue
+ addu t3, t9, t3 // y+cred
+ addu s3, t9, s3 // y+cgreen
+ addu v1, t9, v1 // y+cblue
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2) // y of row 1
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+ addu t3, v0, s4 // y+cred
+ addu s3, v0, s5 // y+cgreen
+ addu v1, v0, s6 // y+cblue
+ addu t3, t9, t3 // y+cred
+ addu s3, t9, s3 // y+cgreen
+ addu v1, t9, v1 // y+cblue
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ *
+ * The generator macro is expanded once per pixel layout.  pixel_size is the
+ * byte size of TWO adjacent output pixels (6 or 8); the *1_offs and *2_offs
+ * arguments are the byte offsets of the samples of the first and second
+ * pixel of that pair.  The a*_offs filler bytes are only written (as 0xFF)
+ * when pixel_size is 8.
+ *
+ * One cb/cr pair yields the three chroma terms once; they are then added to
+ * two adjacent y samples and each sum is looked up in the range-limit table.
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF // overwrites t0 with the filler value
+ sb t0, \a1_offs(\outptr)
+ sb t0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size // advance past both pixels
+.endm
+
+.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF // overwrites t0; outptr is not advanced by this macro
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = range_limit
+ *
+ * The stack offset above is the one on entry; the load below uses it plus
+ * the 40 bytes named in SAVE_REGS_ON_STACK.  ra is in the save list because
+ * it is used as a scratch register in the loop.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ li t0, 0xe6ea
+ lw t1, 0(a1) // t1 = input_buf[0]
+ lw t2, 4(a1) // t2 = input_buf[1]
+ lw t3, 8(a1) // t3 = input_buf[2]
+ lw t8, 56(sp) // t8 = range_limit
+ addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
+ addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
+ addiu s0, t0, 0x9916 // s0 = 0x8000
+ addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
+ xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
+ srl t0, a0, 1 // t0 = number of full pixel pairs
+ sll t4, a2, 2 // t4 = in_row_group_ctr * 4
+ lwx s5, t4(t1) // s5 = inptr0
+ lwx s6, t4(t2) // s6 = inptr1
+ lwx s7, t4(t3) // s7 = inptr2
+ lw t7, 0(a3) // t7 = outptr
+ blez t0, 2f
+ addu t9, s6, t0 // t9 = end address
+1:
+ lbu t2, 0(s6) // t2 = cb
+ lbu t0, 0(s7) // t0 = cr
+ lbu t1, 0(s5) // t1 = y
+ addiu t2, t2, -128 // t2 = cb - 128
+ addiu t0, t0, -128 // t0 = cr - 128
+ mult $ac1, s4, t2 // ac1 = -FIX(0.34414) * cb
+ madd $ac1, s3, t0 // ac1 += -FIX(0.71414) * cr
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
+ extr_r.w t5, $ac1, 16 // t5 = green term (rounded >> 16)
+ mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
+ addiu s7, s7, 1
+ addiu s6, s6, 1
+ addu t2, t1, t0 // t2 = y + cred
+ addu t3, t1, t5 // t3 = y + cgreen
+ addu t4, t1, t6 // t4 = y + cblue
+ addu t2, t8, t2 // addresses inside the range-limit table
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t1, 1(s5) // t1 = y of the second pixel
+ lbu v0, 0(t2)
+ lbu v1, 0(t3)
+ lbu ra, 0(t4)
+ addu t2, t1, t0
+ addu t3, t1, t5
+ addu t4, t1, t6
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+ bne t9, s6, 1b // until the cb pointer reaches the end address
+ addiu s5, s5, 2 // y pointer advances by two samples
+2:
+ andi t0, a0, 1 // odd output_width: one pixel left
+ beqz t0, 4f
+ nop
+3:
+ lbu t2, 0(s6)
+ lbu t0, 0(s7)
+ lbu t1, 0(s5)
+ addiu t2, t2, -128 // (cb - 128)
+ addiu t0, t0, -128 // (cr - 128)
+ mul t3, s4, t2
+ mul t4, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
+ mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
+ addu t3, t3, s0 // add the 0x8000 held in s0
+ addu t3, t4, t3
+ sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
+ addu t2, t1, t0 // y + cred
+ addu t3, t1, t5 // y + cgreen
+ addu t4, t1, t6 // y + cblue
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ *
+ * Every input row produces two output rows: the first pairs it with the row
+ * above (pointer at -4(a2)), the second with the row below (pointer at
+ * 4(a2)).  A "column sum" is 3 * sample of the current row + sample of the
+ * neighbouring row; each output sample is then built from 3 * this column
+ * sum + an adjacent column sum, shifted right by 4.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ li s4, 0 // s4 = byte offset of the current output row pointer
+ lw s2, 0(a3) // s2 = *output_data_ptr
+0:
+ li t9, 2 // two output rows per input row
+ lw s1, -4(a2) // s1 = inptr1
+
+1:
+ lw s0, 0(a2) // s0 = inptr0
+ lwx s3, s4(s2) // s3 = outptr
+ addiu s5, a1, -2 // s5 = downsampled_width - 2
+ srl t4, s5, 1
+ sll t4, t4, 1 // t4 = (downsampled_width - 2) rounded down to even
+ lbu t0, 0(s0)
+ lbu t1, 1(s0)
+ lbu t2, 0(s1)
+ lbu t3, 1(s1)
+ addiu s0, 2
+ addiu s1, 2
+ addu t8, s0, t4 // t8 = end address
+ andi s5, s5, 1 // s5 = residual
+ sll t4, t0, 1
+ sll t6, t1, 1
+ addu t0, t0, t4 // t0 = (*inptr0++) * 3
+ addu t1, t1, t6 // t1 = (*inptr0++) * 3
+ addu t7, t0, t2 // t7 = thiscolsum
+ addu t6, t1, t3 // t6 = nextcolsum
+ sll t0, t7, 2 // t0 = thiscolsum * 4
+ subu t1, t0, t7 // t1 = thiscolsum * 3
+ shra_r.w t0, t0, 4 // first output: (thiscolsum * 4 + 8) >> 4
+ addiu t1, 7
+ addu t1, t1, t6
+ srl t1, t1, 4 // second output: (thiscolsum * 3 + nextcolsum + 7) >> 4
+ sb t0, 0(s3)
+ sb t1, 1(s3)
+ beq t8, s0, 22f // no column pairs left (downsampled_width is 2 or 3)
+ addiu s3, 2
+2:
+ lh t0, 0(s0) // t0 = A3|A2
+ lh t2, 0(s1) // t2 = B3|B2
+ addiu s0, 2
+ addiu s1, 2
+ preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
+ preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
+ shll.ph t1, t0, 1
+ sll t3, t6, 1
+ addu.ph t0, t1, t0 // t0 = A3*3|A2*3
+ addu t3, t3, t6 // t3 = this * 3
+ addu.ph t0, t0, t2 // t0 = next2|next1
+ addu t1, t3, t7 // t1 = this*3 + last
+ andi t7, t0, 0xFFFF // t7 = next1
+ sll t2, t7, 1
+ addu t2, t7, t2 // t2 = next1*3
+ addu t4, t2, t6 // t4 = next1*3 + this
+ srl t6, t0, 16 // t6 = next2
+ shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
+ addu t0, t3, t7
+ addiu t0, 7
+ srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
+ shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
+ addu t2, t2, t6
+ addiu t2, 7
+ srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ sb t4, 2(s3)
+ sb t2, 3(s3)
+ bne t8, s0, 2b
+ addiu s3, 4
+22:
+ beqz s5, 4f // no odd column left
+ addu t8, s0, s5 // t8 = end address of the residual loop
+3:
+ lbu t0, 0(s0)
+ lbu t2, 0(s1)
+ addiu s0, 1
+ addiu s1, 1
+ sll t3, t6, 1
+ sll t1, t0, 1
+ addu t1, t0, t1 // t1 = inptr0 * 3
+ addu t3, t3, t6 // t3 = thiscolsum * 3
+ addu t5, t1, t2 // t5 = nextcolsum
+ addu t1, t3, t7 // t1 = thiscolsum * 3 + lastcolsum
+ shra_r.w t1, t1, 4
+ addu t0, t3, t5 // t0 = thiscolsum * 3 + nextcolsum
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu s3, 2
+ move t7, t6 // lastcolsum = thiscolsum
+ bne t8, s0, 3b
+ move t6, t5 // thiscolsum = nextcolsum
+4:
+ sll t0, t6, 2 // t0 = thiscolsum * 4
+ subu t1, t0, t6 // t1 = thiscolsum * 3
+ addu t1, t1, t7 // t1 = thiscolsum * 3 + lastcolsum
+ addiu s4, 4 // next output row pointer
+ shra_r.w t1, t1, 4
+ addiu t0, 7
+ srl t0, t0, 4 // last output: (thiscolsum * 4 + 7) >> 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu t9, -1
+ addiu s3, 2
+ bnez t9, 1b // second output row of this input row
+ lw s1, 4(a2) // s1 = inptr1 = following input row
+ srl t0, s4, 2 // t0 = number of output rows written so far
+ subu t0, a0, t0
+ bgtz t0, 0b // until max_v_samp_factor output rows are written
+ addiu a2, 4 // ++input_data
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ *
+ * Doubles each row horizontally.  Every input sample yields two output
+ * samples built from 3 * the sample + one neighbour, shifted right by 2;
+ * the very first and very last output samples are plain copies of the first
+ * and last input samples.  a0 rows are processed; nothing is done if a0 is 0.
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ .set at
+
+ beqz a0, 3f
+ sll t0, a0, 2 // t0 = max_v_samp_factor * 4
+ lw s1, 0(a3) // s1 = *output_data_ptr
+ li s3, 0x10001 // s3 = 1 in both halfwords
+ addu s0, s1, t0 // s0 = end of the output row pointers
+0:
+ addiu t8, a1, -2 // t8 = downsampled_width - 2
+ srl t9, t8, 2 // t9 = number of four-sample groups
+ lw t7, 0(a2) // t7 = inptr
+ lw s2, 0(s1) // s2 = outptr
+ lbu t0, 0(t7)
+ lbu t1, 1(t7) // t1 = inptr[1]
+ sll t2, t0, 1
+ addu t2, t2, t0 // t2 = invalue*3
+ addu t2, t2, t1
+ shra_r.w t2, t2, 2 // t2 = (invalue*3 + inptr[1] + 2) >> 2
+ sb t0, 0(s2) // first output sample = first input sample
+ sb t2, 1(s2)
+ beqz t9, 11f
+ addiu s2, 2
+1:
+ ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
+ ulw t1, 1(t7)
+ ulh t2, 4(t7) // t2 = |0|0|P5|P4|
+ preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
+ preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
+ preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
+ preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
+ preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
+ shll.ph t5, t4, 1
+ shll.ph t6, t1, 1
+ addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
+ addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
+ addu.ph t4, t3, s3
+ addu.ph t0, t0, s3
+ addu.ph t4, t4, t5
+ addu.ph t0, t0, t6
+ shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
+ shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
+ addu.ph t2, t2, t5
+ addu.ph t3, t3, t6
+ shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
+ shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
+ shll.ph t2, t2, 8
+ shll.ph t3, t3, 8
+ or t2, t4, t2
+ or t3, t3, t0
+ addiu t9, -1
+ usw t3, 0(s2) // eight output samples per iteration
+ usw t2, 4(s2)
+ addiu s2, 8
+ bgtz t9, 1b
+ addiu t7, 4 // four input samples consumed
+11:
+ andi t8, 3 // t8 = (downsampled_width - 2) & 3 leftover samples
+ beqz t8, 22f
+ addiu t7, 1
+
+2:
+ lbu t0, 0(t7)
+ addiu t7, 1
+ sll t1, t0, 1
+ addu t2, t0, t1 // t2 = invalue*3
+ lbu t3, -2(t7) // t3 = previous input sample
+ lbu t4, 0(t7) // t4 = next input sample
+ addiu t3, 1
+ addiu t4, 2
+ addu t3, t3, t2
+ addu t4, t4, t2
+ srl t3, 2 // t3 = (invalue*3 + previous + 1) >> 2
+ srl t4, 2 // t4 = (invalue*3 + next + 2) >> 2
+ sb t3, 0(s2)
+ sb t4, 1(s2)
+ addiu t8, -1
+ bgtz t8, 2b
+ addiu s2, 2
+
+22:
+ lbu t0, 0(t7) // last input sample
+ lbu t2, -1(t7)
+ sll t1, t0, 1
+ addu t1, t1, t0 // t1 = invalue * 3
+ addu t1, t1, t2
+ addiu t1, 1
+ srl t1, t1, 2 // t1 = (invalue*3 + previous + 1) >> 2
+ sb t1, 0(s2)
+ sb t0, 1(s2) // last output sample = last input sample
+ addiu s1, 4 // next output row pointer
+ bne s1, s0, 0b
+ addiu a2, 4 // ++input_data
+3:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ *
+ * For each of the a2 rows, every pair of horizontally adjacent input
+ * samples is reduced to one output sample (sum of the two, halved, with a
+ * bias that alternates between 0 and 1).  The rest of the output row is
+ * then filled with values derived from the last input sample.  a1 is not
+ * referenced.
+ *
+ * The stack offsets above are the ones on entry; the loads below use them
+ * plus the 24 bytes named in SAVE_REGS_ON_STACK.
+ *
+ * NOTE(review): s2 (the padding count) is computed once before label 0 and
+ * counted down inside loop 4 without being reloaded, so rows after the
+ * first see whatever loop 4 left behind -- confirm this is intended when
+ * v_samp_factor > 1.
+ * NOTE(review): loop 1 is bottom-tested and therefore runs once even when
+ * s4 (image_width / 8) is 0 -- confirm callers never pass such widths.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+ beqz a2, 7f // no rows requested
+ lw s1, 44(sp) // s1 = output_data
+ lw s0, 40(sp) // s0 = input_data
+ srl s2, a0, 2
+ andi t9, a0, 2 // t9 = image_width & 2
+ srl t7, t9, 1
+ addu s2, t7, s2 // s2 = image_width / 4 + ((image_width & 2) >> 1)
+ sll t0, a3, 3 // t0 = width_in_blocks*DCT
+ srl t7, t0, 1
+ subu s2, t7, s2 // s2 = t0 / 2 - s2 (count used by loop 4)
+0:
+ andi t6, a0, 1 // t6 = temp_index
+ addiu t6, -1 // t6 = 0 for odd widths, -1 for even widths
+ lw t4, 0(s1) // t4 = outptr
+ lw t5, 0(s0) // t5 = inptr0
+ li s3, 0 // s3 = bias
+ srl t7, a0, 1 // t7 = image_width1
+ srl s4, t7, 2 // s4 = number of four-output groups
+ andi t8, t7, 3 // t8 = leftover outputs
+1:
+ ulhu t0, 0(t5) // two adjacent input samples per load
+ ulhu t1, 2(t5)
+ ulhu t2, 4(t5)
+ ulhu t3, 6(t5)
+ raddu.w.qb t0, t0 // sum of the bytes just loaded
+ raddu.w.qb t1, t1
+ raddu.w.qb t2, t2
+ raddu.w.qb t3, t3
+ shra.ph t0, t0, 1 // halve without rounding
+ shra_r.ph t1, t1, 1 // halve with rounding
+ shra.ph t2, t2, 1
+ shra_r.ph t3, t3, 1
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t3, 3(t4)
+ addiu s4, -1
+ addiu t4, 4
+ bgtz s4, 1b
+ addiu t5, 8
+ beqz t8, 3f
+ addu s4, t4, t8 // s4 = end address of the leftover loop
+2:
+ ulhu t0, 0(t5)
+ raddu.w.qb t0, t0
+ addqh.w t0, t0, s3 // (sum + bias) >> 1
+ xori s3, s3, 1 // toggle the bias
+ sb t0, 0(t4)
+ addiu t4, 1
+ bne t4, s4, 2b
+ addiu t5, 2
+3:
+ lbux t1, t6(t5) // sample at inptr + temp_index
+ sll t1, 1
+ addqh.w t2, t1, s3 // t2 = pixval1
+ xori s3, s3, 1
+ addqh.w t3, t1, s3 // t3 = pixval2
+ blez s2, 5f
+ append t3, t2, 8 // t3 = (pixval2 << 8) | low byte of pixval1
+ addu t5, t4, s2 // t5 = loop_end2
+4:
+ ush t3, 0(t4) // two padding samples per store
+ addiu s2, -1
+ bgtz s2, 4b
+ addiu t4, 2
+5:
+ beqz t9, 6f
+ nop
+ sb t2, 0(t4) // one more pixval1 when image_width & 2 is set
+6:
+ addiu s1, 4 // ++output_data
+ addiu a2, -1
+ bnez a2, 0b // more rows?
+ addiu s0, 4 // ++input_data
+7:
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+ j ra
+ nop
+END(jsimd_h2v1_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ *
+ * For each of the a2 output rows, two input rows are read and every 2x2
+ * group of input samples is reduced to one output sample (sum of the four,
+ * plus a bias, shifted right by 2).  The rest of the output row is then
+ * filled with values derived from the last sample of both input rows.  a1
+ * is not referenced.
+ *
+ * The stack offsets above are the ones on entry; the loads below use them
+ * plus the 32 bytes named in SAVE_REGS_ON_STACK.
+ *
+ * NOTE(review): s4, t8 and s2 are computed once before label 0, but inside
+ * the row loop s4 is counted down (loop 2), t8 is overwritten with an end
+ * address, and s2 is counted down (loop 5).  None is reloaded, so rows
+ * after the first do not see the original counts -- confirm this is
+ * intended when v_samp_factor > 1.
+ * NOTE(review): loop 2 is bottom-tested and therefore runs once even when
+ * s4 (image_width / 8) is 0 -- confirm callers never pass such widths.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ beqz a2, 8f // no rows requested
+ lw s1, 52(sp) // s1 = output_data
+ lw s0, 48(sp) // s0 = input_data
+
+ andi t6, a0, 1 // t6 = temp_index
+ addiu t6, -1 // t6 = 0 for odd widths, -1 for even widths
+ srl t7, a0, 1 // t7 = image_width1
+ srl s4, t7, 2 // s4 = number of four-output groups
+ andi t8, t7, 3 // t8 = leftover outputs
+ andi t9, a0, 2 // t9 = image_width & 2
+ srl s2, a0, 2
+ srl t7, t9, 1
+ addu s2, t7, s2 // s2 = image_width / 4 + ((image_width & 2) >> 1)
+ sll t0, a3, 3 // t0 = width_in_blocks*DCT
+ srl t7, t0, 1
+ subu s2, t7, s2 // s2 = t0 / 2 - s2 (count used by loop 5)
+0:
+ lw t4, 0(s1) // t4 = outptr
+ lw t5, 0(s0) // t5 = inptr0
+ lw s7, 4(s0) // s7 = inptr1
+ li s6, 1 // s6 = bias
+2:
+ ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
+ ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
+ ulw t2, 4(t5)
+ ulw t3, 4(s7)
+ precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
+ ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
+ raddu.w.qb t1, t7 // t1 = P3 + P2 + Q3 + Q2
+ raddu.w.qb t0, t0 // t0 = Q1 + Q0 + P1 + P0
+ shra_r.w t1, t1, 2 // (sum + 2) >> 2
+ addiu t0, 1
+ srl t0, 2 // (sum + 1) >> 2
+ precrq.ph.w t7, t2, t3
+ ins t2, t3, 16, 16
+ raddu.w.qb t7, t7
+ raddu.w.qb t2, t2
+ shra_r.w t7, t7, 2
+ addiu t2, 1
+ srl t2, 2
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t7, 3(t4)
+ addiu t4, 4
+ addiu t5, 8
+ addiu s4, s4, -1
+ bgtz s4, 2b
+ addiu s7, 8
+ beqz t8, 4f
+ addu t8, t4, t8 // t8 = end address of the leftover loop (count is lost)
+3:
+ ulhu t0, 0(t5)
+ ulhu t1, 0(s7)
+ ins t0, t1, 16, 16
+ raddu.w.qb t0, t0 // sum of the 2x2 group
+ addu t0, t0, s6
+ srl t0, 2 // (sum + bias) >> 2
+ xori s6, s6, 3 // bias alternates between 1 and 2
+ sb t0, 0(t4)
+ addiu t5, 2
+ addiu t4, 1
+ bne t8, t4, 3b
+ addiu s7, 2
+4:
+ lbux t1, t6(t5) // sample at inptr0 + temp_index
+ sll t1, 1
+ lbux t0, t6(s7) // sample at inptr1 + temp_index
+ sll t0, 1
+ addu t1, t1, t0
+ addu t3, t1, s6
+ srl t0, t3, 2 // t0 = pixval1
+ xori s6, s6, 3
+ addu t2, t1, s6
+ srl t1, t2, 2 // t1 = pixval2
+ blez s2, 6f
+ append t1, t0, 8 // t1 = (pixval2 << 8) | low byte of pixval1
+5:
+ ush t1, 0(t4) // two padding samples per store
+ addiu s2, -1
+ bgtz s2, 5b
+ addiu t4, 2
+6:
+ beqz t9, 7f
+ nop
+ sb t0, 0(t4) // one more pixval1 when image_width & 2 is set
+7:
+ addiu s1, 4 // ++output_data
+ addiu a2, -1
+ bnez a2, 0b // more rows?
+ addiu s0, 8 // input_data += 2 rows
+8:
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_h2v2_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/* 2:1 horizontal and 2:1 vertical downsample with smoothing: each output sample blends a 2x2 input cell with its 12 neighbours.
+ * a0 = input_data
+ * a1 = output_data
+ * a2 = compptr->v_samp_factor
+ * a3 = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s7, 52(sp) // compptr->width_in_blocks (stack args are read at +32 after the save)
+ lw s0, 56(sp) // cinfo->image_width
+ lw s6, 48(sp) // cinfo->smoothing_factor
+ sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
+ sll v0, s7, 1 // v0 = output_cols * 2
+ subu v0, v0, s0 // v0 = number of columns to pad on the right
+ blez v0, 2f // no padding needed
+ move v1, zero // (delay slot) v1 = row counter
+ addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
+0: // pad rows input_data[-1] .. input_data[max_v_samp_factor]
+ addiu t1, a0, -4 // t1 = &input_data[-1]
+ sll t2, v1, 2 // t2 = row counter * 4
+ lwx t1, t2(t1) // t1 = input_data[v1 - 1]
+ move t3, v0 // t3 = columns left to fill
+ addu t1, t1, s0 // t1 = first column past image_width
+ lbu t2, -1(t1) // t2 = last valid sample of the row
+1: // replicate the last sample into the padding
+ addiu t3, t3, -1
+ sb t2, 0(t1)
+ bgtz t3, 1b
+ addiu t1, t1, 1 // (delay slot)
+ addiu v1, v1, 1
+ bne v1, t0, 0b
+ nop
+2:
+ li v0, 80
+ mul v0, s6, v0 // v0 = smoothing_factor * 80
+ li v1, 16384
+ move t4, zero // t4 = outrow
+ move t5, zero // t5 = inrow
+ subu t6, v1, v0 // t6 = 16384 - smoothing_factor * 80 (weight of the 2x2 cell)
+ sll t7, s6, 4 // t7 = smoothing_factor * 16 (weight of the neighbours)
+3: // one output row per iteration
+/* Special case for first column: pretend column -1 is same as column 0 */
+ sll v0, t4, 2
+ lwx t8, v0(a1) // outptr = output_data[outrow]
+ sll v1, t5, 2
+ addiu t9, v1, 4
+ addiu s0, v1, -4
+ addiu s1, v1, 8
+ lwx s2, v1(a0) // inptr0 = input_data[inrow]
+ lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
+ lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
+ lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
+ lh v0, 0(s2) // two samples of inptr0
+ lh v1, 0(t9) // two samples of inptr1
+ lh t0, 0(s0) // two samples of above_ptr
+ lh t1, 0(s1) // two samples of below_ptr
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0 // t2 = sum of the 2x2 cell
+ raddu.w.qb s3, t0 // s3 = above[0..1] + below[0..1]
+ lbu v0, 0(s2) // column 0 stands in for column -1
+ lbu v1, 2(s2)
+ lbu t0, 0(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6 // ac1 = cell sum * cell weight
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3 // edge neighbours
+ lbu v0, 0(s0)
+ lbu t0, 0(s1)
+ sll s3, s3, 1 // edge neighbours count twice
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0 // corner neighbours
+ addu s3, t0, s3 // s3 = neighbour sum
+ madd $ac1, s3, t7 // ac1 += neighbour sum * neighbour weight
+ extr_r.w v0, $ac1, 16 // rounded >> 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ addiu s1, s1, 2
+ sb v0, -1(t8)
+ addiu s4, s7, -2
+ and s4, s4, 3 // s4 = (output_cols - 2) & 3; always 2 because output_cols is a multiple of 8
+ addu s5, s4, t8 // end address
+4: // scalar loop over the first s4 interior columns
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2) // left neighbour
+ lbu v1, 2(s2) // right neighbour
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ sb t2, -1(t8)
+ bne s5, t8, 4b
+ addiu s1, s1, 2 // (delay slot)
+ addiu s5, s7, -2
+ subu s5, s5, s4 // remaining interior columns (multiple of 4)
+ addu s5, s5, t8 // end address
+5: // four interior columns per iteration
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ lh v1, 2(t9) // start loading the second column
+ addu t0, t0, v0
+ lh v0, 2(s2)
+ addu s3, t0, s3
+ lh t0, 2(s0)
+ lh t1, 2(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 4(s2)
+ lbu t0, 1(t9)
+ lbu t1, 4(t9)
+ sb t2, 0(t8) // first column of the group
+ raddu.w.qb t3, v0
+ lbu v0, 1(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 4(s0)
+ addu t0, t0, v0
+ lbu v0, 1(s0)
+ addu s3, t0, s3
+ lbu t0, 1(s1)
+ lbu t3, 4(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 4(t9) // start loading the third column
+ addu t0, t0, v0
+ lh v0, 4(s2)
+ addu s3, t0, s3
+ lh t0, 4(s0)
+ lh t1, 4(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 6(s2)
+ lbu t0, 3(t9)
+ lbu t1, 6(t9)
+ sb t2, 1(t8) // second column of the group
+ raddu.w.qb t3, v0
+ lbu v0, 3(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 6(s0)
+ addu t0, t0, v0
+ lbu v0, 3(s0)
+ addu s3, t0, s3
+ lbu t0, 3(s1)
+ lbu t3, 6(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 6(t9) // start loading the fourth column
+ addu t0, t0, v0
+ lh v0, 6(s2)
+ addu s3, t0, s3
+ lh t0, 6(s0)
+ lh t1, 6(s1)
+ madd $ac1, s3, t7
+ extr_r.w t3, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 8(s2)
+ lbu t0, 5(t9)
+ lbu t1, 8(t9)
+ sb t3, 2(t8) // third column of the group
+ raddu.w.qb t2, v0
+ lbu v0, 5(s2)
+ addu t0, t0, t1
+ mult $ac1, t2, t6
+ addu v0, v0, v1
+ lbu t2, 8(s0)
+ addu t0, t0, v0
+ lbu v0, 5(s0)
+ addu s3, t0, s3
+ lbu t0, 5(s1)
+ lbu t3, 8(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ addiu t8, t8, 4
+ addu t0, t0, v0
+ addiu s2, s2, 8
+ addu s3, t0, s3
+ addiu t9, t9, 8
+ madd $ac1, s3, t7
+ extr_r.w t1, $ac1, 16
+ addiu s0, s0, 8
+ addiu s1, s1, 8
+ bne s5, t8, 5b
+ sb t1, -1(t8) // (delay slot) fourth column of the group
+/* Special case for last column */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 1(s2) // last input column stands in for the column to its right
+ lbu t0, -1(t9)
+ lbu t1, 1(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 1(s0)
+ addu t0, t0, v0
+ lbu t3, 1(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t0, $ac1, 16
+ addiu t5, t5, 2 // inrow += 2 (each output row consumes two input rows)
+ sb t0, 0(t8)
+ addiu t4, t4, 1 // outrow++
+ bne t4, a2, 3b
+ nop // (delay slot) must not advance inrow again; a second += 2 here skipped two input rows per output row
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/* Integer-ratio upsample: each input sample is stored h_expand times per row and each generated row is duplicated to fill v_expand rows.
+ * a0 = upsample->h_expand[compptr->component_index]
+ * a1 = upsample->v_expand[compptr->component_index]
+ * a2 = input_data
+ * a3 = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ lw s0, 0(a3) // s0 = &output_data[outrow] (starts at outrow = 0)
+ lw s1, 32(sp) // s1 = cinfo->output_width
+ lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor
+ li t6, 0 // t6 = inrow * 4 (byte offset into the input_data pointer array)
+ beqz s2, 10f
+ li s3, 0 // s3 = outrow
+0: // one input row per iteration
+ addu t0, a2, t6 // t0 = &input_data[inrow]
+ lw t3, 0(t0) // t3 = inptr
+ lw t8, 0(s0) // t8 = outptr = output_data[outrow]
+ beqz s1, 4f
+ addu t5, t8, s1 // t5 = outend
+1: // horizontal expansion of one input sample
+ lb t2, 0(t3) // t2 = invalue = *inptr++
+ addiu t3, 1
+ beqz a0, 3f
+ move t0, a0 // t0 = h_expand
+2:
+ sb t2, 0(t8)
+ addiu t0, -1
+ bgtz t0, 2b
+ addiu t8, 1 // (delay slot) outptr++
+3:
+ bgt t5, t8, 1b // until outptr reaches outend
+ nop
+4:
+ addiu t9, a1, -1 // t9 = v_expand - 1
+ blez t9, 9f
+ move v0, s0 // (delay slot) v0 = &output_data[last row filled so far]
+5: // duplicate output_data[outrow] into the next row
+ lw t3, 0(s0) // t3 = source row = output_data[outrow]
+ lw t4, 4(v0) // t4 = destination row = next row to fill
+ subu t0, s1, 0xF
+ blez t0, 7f // fewer than 16 bytes: byte loop only
+ addu t5, t3, s1 // t5 = end address
+ andi t7, s1, 0xF // t7 = residual
+ subu t8, t5, t7 // t8 = end of the 16-byte chunks
+6: // copy 16 bytes per iteration
+ ulw t0, 0(t3)
+ ulw t1, 4(t3)
+ ulw t2, 8(t3)
+ usw t0, 0(t4)
+ ulw t0, 12(t3)
+ usw t1, 4(t4)
+ usw t2, 8(t4)
+ usw t0, 12(t4)
+ addiu t3, 16
+ bne t3, t8, 6b
+ addiu t4, 16
+ beqz t7, 8f
+ nop
+7: // copy the residual bytes
+ lbu t0, 0(t3)
+ sb t0, 0(t4)
+ addiu t3, 1
+ bne t3, t5, 7b
+ addiu t4, 1
+8:
+ addiu t9, -1
+ bgtz t9, 5b
+ addiu v0, 4 // (delay slot) step to the row just filled
+9:
+ addiu s0, v0, 4 // s0 = &output_data[outrow + v_expand]
+ addu s3, s3, a1 // outrow += v_expand
+ bne s3, s2, 0b
+ addiu t6, 4 // (delay slot) inrow++ (pointer-sized step)
+10:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/* 2:1 horizontal upsample by replication: every input sample is stored twice in the output row.
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ lw t7, 0(a3) // t7 = output_data
+ andi t8, a1, 0xf // t8 = residual
+ sll t0, a0, 2 // t0 = max_v_samp_factor * 4 (size of the row-pointer range)
+ blez a0, 4f
+ addu t9, t7, t0 // t9 = output_data end address
+0: // one row per iteration
+ lw t5, 0(t7) // t5 = outptr
+ lw t6, 0(a2) // t6 = inptr
+ addu t3, t5, a1 // t3 = outptr + output_width (end address)
+ subu t3, t8 // t3 = end address - residual
+ beq t5, t3, 2f // fewer than 16 output bytes: residual loop only
+ move t4, t8 // (delay slot) t4 = residual byte count
+1: // 8 input samples -> 16 output samples
+ ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
+ ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
+ srl t1, t0, 16 // t1 = |X|X|P3|P2|
+ ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
+ ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
+ ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
+ ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t0, t2, 16 // t0 = |X|X|P7|P6|
+ ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
+ ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
+ ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
+ ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t5, 16
+ bne t5, t3, 1b
+ addiu t6, 8 // (delay slot) inptr += 8
+ beqz t8, 3f // no residual
+ move t4, t8 // (delay slot) t4 = residual byte count
+2: // residual: one input sample -> two output bytes
+ lbu t1, 0(t6)
+ sb t1, 0(t5)
+ sb t1, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2 // (delay slot) outptr += 2
+3:
+ addiu t7, 4 // next output row pointer
+ bne t9, t7, 0b
+ addiu a2, 4 // (delay slot) next input row pointer
+4:
+ j ra
+ nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/* 2:1 horizontal and vertical upsample by replication: each input row is doubled horizontally, then copied to the following output row.
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ lw t7, 0(a3) // t7 = output_data
+ blez a0, 7f
+ andi t9, a1, 0xf // t9 = residual
+0: // one input row -> two output rows per iteration
+ lw t6, 0(a2) // t6 = inptr
+ lw t5, 0(t7) // t5 = outptr
+ addu t8, t5, a1 // t8 = outptr end address
+ subu t8, t9 // t8 = end address - residual
+ beq t5, t8, 2f // fewer than 16 output bytes: residual loop only
+ move t4, t9 // (delay slot) t4 = residual byte count
+1: // 8 input samples -> 16 output samples
+ ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
+ srl t1, t0, 16 // t1 = |X|X|P3|P2|
+ ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
+ ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
+ ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
+ ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
+ ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t3, t2, 16 // t3 = |X|X|P7|P6|
+ ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
+ ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
+ ins t3, t3, 16, 16 // t3 = |P7|P6|P7|P6|
+ ins t3, t3, 8, 16 // t3 = |P7|P7|P6|P6|
+ usw t2, 8(t5)
+ usw t3, 12(t5)
+ addiu t5, 16
+ bne t5, t8, 1b
+ addiu t6, 8 // (delay slot) inptr += 8
+ beqz t9, 3f // no residual
+ move t4, t9 // (delay slot) t4 = residual byte count
+2: // residual: one input sample -> two output bytes
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ sb t0, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2 // (delay slot) outptr += 2
+3: // copy the row just written into the next output row
+ lw t6, 0(t7) // t6 = outptr[0]
+ lw t5, 4(t7) // t5 = outptr[1]
+ addu t4, t6, a1 // t4 = new end address
+ beq a1, t9, 5f // width equals residual (less than 16): byte loop only
+ subu t8, t4, t9 // (delay slot) t8 = end of the 16-byte chunks
+4: // copy 16 bytes per iteration
+ ulw t0, 0(t6)
+ ulw t1, 4(t6)
+ ulw t2, 8(t6)
+ usw t0, 0(t5)
+ ulw t0, 12(t6)
+ usw t1, 4(t5)
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t6, 16
+ bne t6, t8, 4b
+ addiu t5, 16
+ beqz t9, 6f
+ nop
+5: // copy the residual bytes
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ addiu t6, 1
+ bne t6, t4, 5b
+ addiu t5, 1
+6:
+ addiu t7, 8 // skip the two output rows just produced
+ addiu a0, -2 // two output rows done
+ bgtz a0, 0b
+ addiu a2, 4 // (delay slot) next input row pointer
+7:
+ j ra
+ nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/* Slow-but-accurate integer inverse DCT of one 8x8 block: pass 1 transforms columns into a stack workspace, pass 2 transforms rows and stores range-limited bytes.
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -256 // 64-word workspace on the stack
+ move v0, sp // v0 = wsptr
+ addiu v1, zero, 8 // v1 = DCTSIZE = 8
+1: // pass 1: one column per iteration
+ lh s4, 32(a0) // s4 = inptr[16]
+ lh s5, 64(a0) // s5 = inptr[32]
+ lh s6, 96(a0) // s6 = inptr[48]
+ lh t1, 112(a0) // t1 = inptr[56]
+ lh t7, 16(a0) // t7 = inptr[8]
+ lh t5, 80(a0) // t5 = inptr[40]
+ lh t3, 48(a0) // t3 = inptr[24]
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, t5
+ or s4, s4, t7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 2f // any nonzero AC term: full column transform
+ addiu v1, v1, -1 // (delay slot) one column consumed
+ lh s5, 0(a1) // quantptr[DCTSIZE*0]
+ lh s6, 0(a0) // inptr[DCTSIZE*0]
+ mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
+ sll s5, s5, 2 // dcval = DC term << 2
+ sw s5, 0(v0) // all eight workspace rows of this column get dcval
+ sw s5, 32(v0)
+ sw s5, 64(v0)
+ sw s5, 96(v0)
+ sw s5, 128(v0)
+ sw s5, 160(v0)
+ sw s5, 192(v0)
+ b 3f
+ sw s5, 224(v0) // (delay slot)
+2: // odd part first, then even part
+ lh t0, 112(a1)
+ lh t2, 48(a1)
+ lh t4, 80(a1)
+ lh t6, 16(a1)
+ mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
+ mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
+ mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
+ mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
+ lh t4, 32(a1)
+ lh t5, 32(a0)
+ lh t6, 96(a1)
+ lh t7, 96(a0)
+ addu s0, t0, t1 // z3 = tmp0 + tmp2
+ addu s1, t1, t2 // z2 = tmp1 + tmp2
+ addu s2, t2, t3 // z4 = tmp1 + tmp3
+ addu s3, s0, s2 // z3 + z4
+ addiu t9, zero, 9633 // FIX_1_175875602
+ mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
+ addu t8, t0, t3 // z1 = tmp0 + tmp3
+ addiu t9, zero, 2446 // FIX_0_298631336
+ mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
+ addiu t9, zero, 16819 // FIX_2_053119869
+ mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
+ addiu t9, zero, 25172 // FIX_3_072711026
+ mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
+ addiu t9, zero, 12299 // FIX_1_501321110
+ mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
+ addiu t9, zero, 16069 // FIX_1_961570560
+ mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
+ addiu t9, zero, 3196 // FIX_0_390180644
+ mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
+ addiu t9, zero, 7373 // FIX_0_899976223
+ mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
+ addiu t9, zero, 20995 // FIX_2_562915447
+ mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
+ subu s0, s3, s0 // z3 += z5
+ addu t0, t0, s0 // tmp0 += z3
+ addu t1, t1, s0 // tmp2 += z3
+ subu s2, s3, s2 // z4 += z5
+ addu t2, t2, s2 // tmp1 += z4
+ addu t3, t3, s2 // tmp3 += z4
+ subu t0, t0, t8 // tmp0 += z1
+ subu t1, t1, s1 // tmp2 += z2
+ subu t2, t2, s1 // tmp1 += z2
+ subu t3, t3, t8 // tmp3 += z1
+ mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
+ addiu t9, zero, 6270 // FIX_0_765366865
+ mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
+ lh t4, 0(a1)
+ lh t5, 0(a0)
+ lh t6, 64(a1)
+ lh t7, 64(a0)
+ mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
+ mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
+ mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
+ addiu t9, zero, 4433 // FIX_0_541196100
+ addu s3, s0, s1 // z2 + z3
+ mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
+ addiu t9, zero, 15137 // FIX_1_847759065
+ mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
+ addu t4, t5, t6
+ subu t5, t5, t6
+ sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
+ sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
+ addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
+ subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
+ addu s0, t4, t7 // tmp10 = tmp0 + tmp3
+ subu s1, t4, t7 // tmp13 = tmp0 - tmp3
+ addu s2, t5, t6 // tmp11 = tmp1 + tmp2
+ subu s3, t5, t6 // tmp12 = tmp1 - tmp2
+ addu t4, s0, t3 // butterflies combining even and odd parts
+ subu s0, s0, t3
+ addu t3, s2, t1
+ subu s2, s2, t1
+ addu t1, s3, t2
+ subu s3, s3, t2
+ addu t2, s1, t0
+ subu s1, s1, t0
+ shra_r.w t4, t4, 11 // rounded >> 11 for all eight results
+ shra_r.w t3, t3, 11
+ shra_r.w t1, t1, 11
+ shra_r.w t2, t2, 11
+ shra_r.w s1, s1, 11
+ shra_r.w s3, s3, 11
+ shra_r.w s2, s2, 11
+ shra_r.w s0, s0, 11
+ sw t4, 0(v0) // workspace rows 0..7 of this column
+ sw t3, 32(v0)
+ sw t1, 64(v0)
+ sw t2, 96(v0)
+ sw s1, 128(v0)
+ sw s3, 160(v0)
+ sw s2, 192(v0)
+ sw s0, 224(v0)
+3:
+ addiu a1, a1, 2 // next quantization column
+ addiu a0, a0, 2 // next coefficient column
+ bgtz v1, 1b
+ addiu v0, v0, 4 // (delay slot) next workspace column
+ move v0, sp // rewind wsptr for pass 2
+ addiu v1, zero, 8
+4: // pass 2: one row per iteration
+ lw t0, 8(v0) // z2 = (JLONG)wsptr[2]
+ lw t1, 24(v0) // z3 = (JLONG)wsptr[6]
+ lw t2, 0(v0) // (JLONG)wsptr[0]
+ lw t3, 16(v0) // (JLONG)wsptr[4]
+ lw s4, 4(v0) // (JLONG)wsptr[1]
+ lw s5, 12(v0) // (JLONG)wsptr[3]
+ lw s6, 20(v0) // (JLONG)wsptr[5]
+ lw s7, 28(v0) // (JLONG)wsptr[7]
+ or s4, s4, t0
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, s7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 5f // any nonzero AC term: full row transform
+ addiu v1, v1, -1 // (delay slot) one row consumed
+ shra_r.w s5, t2, 5 // rounded wsptr[0] >> 5
+ andi s5, s5, 0x3ff // 10-bit index into the range-limit table
+ lbux s5, s5(a3) // range-limited sample
+ lw s1, 0(a2) // s1 = output row pointer
+ replv.qb s5, s5 // replicate the byte into all four lanes
+ usw s5, 0(s1) // eight identical output samples
+ usw s5, 4(s1)
+ b 6f
+ nop
+5: // even part first, then odd part
+ addu t4, t0, t1 // z2 + z3
+ addiu t8, zero, 4433 // FIX_0_541196100
+ mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
+ addiu t8, zero, 15137 // FIX_1_847759065
+ mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
+ addiu t8, zero, 6270 // FIX_0_765366865
+ mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
+ addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4]
+ subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4]
+ sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
+ sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
+ subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
+ subu t3, t2, t1 // tmp12 = tmp1 - tmp2
+ addu t2, t2, t1 // tmp11 = tmp1 + tmp2
+ addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
+ subu t1, t4, t5 // tmp13 = tmp0 - tmp3
+ addu t0, t4, t5 // tmp10 = tmp0 + tmp3
+ lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7]
+ lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3]
+ lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5]
+ lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1]
+ addu s0, t4, t6 // z3 = tmp0 + tmp2
+ addiu t8, zero, 9633 // FIX_1_175875602
+ addu s1, t5, t7 // z4 = tmp1 + tmp3
+ addu s2, s0, s1 // z3 + z4
+ mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
+ addu s3, t4, t7 // z1 = tmp0 + tmp3
+ addu t9, t5, t6 // z2 = tmp1 + tmp2
+ addiu t8, zero, 16069 // FIX_1_961570560
+ mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
+ addiu t8, zero, 3196 // FIX_0_390180644
+ mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
+ addiu t8, zero, 2446 // FIX_0_298631336
+ mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
+ addiu t8, zero, 7373 // FIX_0_899976223
+ mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
+ addiu t8, zero, 16819 // FIX_2_053119869
+ mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
+ addiu t8, zero, 20995 // FIX_2_562915447
+ mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
+ addiu t8, zero, 25172 // FIX_3_072711026
+ mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
+ addiu t8, zero, 12299 // FIX_1_501321110
+ mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
+ subu s0, s2, s0 // z3 += z5
+ subu s1, s2, s1 // z4 += z5
+ addu t4, t4, s0
+ subu t4, t4, s3 // tmp0
+ addu t5, t5, s1
+ subu t5, t5, t9 // tmp1
+ addu t6, t6, s0
+ subu t6, t6, t9 // tmp2
+ addu t7, t7, s1
+ subu t7, t7, s3 // tmp3
+ addu s0, t0, t7 // tmp10 + tmp3 -> output 0
+ subu t0, t0, t7 // tmp10 - tmp3 -> output 7
+ addu t7, t2, t6 // tmp11 + tmp2 -> output 1
+ subu t2, t2, t6 // tmp11 - tmp2 -> output 6
+ addu t6, t3, t5 // tmp12 + tmp1 -> output 2
+ subu t3, t3, t5 // tmp12 - tmp1 -> output 5
+ addu t5, t1, t4 // tmp13 + tmp0 -> output 3
+ subu t1, t1, t4 // tmp13 - tmp0 -> output 4
+ shra_r.w s0, s0, 18 // rounded >> 18 for all eight results
+ shra_r.w t7, t7, 18
+ shra_r.w t6, t6, 18
+ shra_r.w t5, t5, 18
+ shra_r.w t1, t1, 18
+ shra_r.w t3, t3, 18
+ shra_r.w t2, t2, 18
+ shra_r.w t0, t0, 18
+ andi s0, s0, 0x3ff // 10-bit indices into the range-limit table
+ andi t7, t7, 0x3ff
+ andi t6, t6, 0x3ff
+ andi t5, t5, 0x3ff
+ andi t1, t1, 0x3ff
+ andi t3, t3, 0x3ff
+ andi t2, t2, 0x3ff
+ andi t0, t0, 0x3ff
+ lw s1, 0(a2) // s1 = output row pointer
+ lbux s0, s0(a3) // range-limit table lookups
+ lbux t7, t7(a3)
+ lbux t6, t6(a3)
+ lbux t5, t5(a3)
+ lbux t1, t1(a3)
+ lbux t3, t3(a3)
+ lbux t2, t2(a3)
+ lbux t0, t0(a3)
+ sb s0, 0(s1) // eight output samples of this row
+ sb t7, 1(s1)
+ sb t6, 2(s1)
+ sb t5, 3(s1)
+ sb t1, 4(s1)
+ sb t3, 5(s1)
+ sb t2, 6(s1)
+ sb t0, 7(s1)
+6:
+ addiu v0, v0, 32 // next workspace row
+ bgtz v1, 4b
+ addiu a2, a2, 4 // (delay slot) next output row pointer
+ addiu sp, sp, 256 // release the workspace
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/* Column pass of the fast integer inverse DCT: dequantizes and transforms two adjacent columns per iteration as packed halfwords.
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu t9, a0, 16 // end address
+ or AT, a3, zero // AT = mips_idct_ifast_coefs
+
+0: // one pair of columns per iteration
+ lw s0, 0(a1) // quantptr[DCTSIZE*0]
+ lw t0, 0(a0) // inptr[DCTSIZE*0]
+ lw t1, 16(a0) // inptr[DCTSIZE*1]
+ muleq_s.w.phl v0, t0, s0 // tmp0 ...
+ lw t2, 32(a0) // inptr[DCTSIZE*2]
+ lw t3, 48(a0) // inptr[DCTSIZE*3]
+ lw t4, 64(a0) // inptr[DCTSIZE*4]
+ lw t5, 80(a0) // inptr[DCTSIZE*5]
+ muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
+ lw t6, 96(a0) // inptr[DCTSIZE*6]
+ lw t7, 112(a0) // inptr[DCTSIZE*7]
+ or s4, t1, t2
+ or s5, t3, t4
+ bnez s4, 1f // rows 1-2 nonzero: full transform
+ ins t0, v0, 16, 16 // ... tmp0
+ bnez s5, 1f // rows 3-4 nonzero: full transform
+ or s6, t5, t6
+ or s6, s6, t7
+ bnez s6, 1f // rows 5-7 nonzero: full transform
+ sw t0, 0(a2) // wsptr[DCTSIZE*0]
+ sw t0, 16(a2) // wsptr[DCTSIZE*1]
+ sw t0, 32(a2) // wsptr[DCTSIZE*2]
+ sw t0, 48(a2) // wsptr[DCTSIZE*3]
+ sw t0, 64(a2) // wsptr[DCTSIZE*4]
+ sw t0, 80(a2) // wsptr[DCTSIZE*5]
+ sw t0, 96(a2) // wsptr[DCTSIZE*6]
+ sw t0, 112(a2) // wsptr[DCTSIZE*7]
+ addiu a0, a0, 4
+ b 2f
+ addiu a1, a1, 4
+
+1: // full transform of the column pair
+ lw s1, 32(a1) // quantptr[DCTSIZE*2]
+ lw s2, 64(a1) // quantptr[DCTSIZE*4]
+ muleq_s.w.phl v0, t2, s1 // tmp1 ...
+ muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
+ lw s0, 16(a1) // quantptr[DCTSIZE*1]
+ lw s1, 48(a1) // quantptr[DCTSIZE*3]
+ lw s3, 96(a1) // quantptr[DCTSIZE*6]
+ muleq_s.w.phl v1, t4, s2 // tmp2 ...
+ muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
+ lw s2, 80(a1) // quantptr[DCTSIZE*5]
+ lw t8, 4(AT) // FIX(1.414213562)
+ ins t2, v0, 16, 16 // ... tmp1
+ muleq_s.w.phl v0, t6, s3 // tmp3 ...
+ muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
+ ins t4, v1, 16, 16 // ... tmp2
+ addq.ph s4, t0, t4 // tmp10
+ subq.ph s5, t0, t4 // tmp11
+ ins t6, v0, 16, 16 // ... tmp3
+ subq.ph s6, t2, t6 // tmp12 ...
+ addq.ph s7, t2, t6 // tmp13
+ mulq_s.ph s6, s6, t8 // ... tmp12 ...
+ addq.ph t0, s4, s7 // tmp0
+ subq.ph t6, s4, s7 // tmp3
+ muleq_s.w.phl v0, t1, s0 // tmp4 ...
+ muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
+ shll_s.ph s6, s6, 1 // x2
+ lw s3, 112(a1) // quantptr[DCTSIZE*7]
+ subq.ph s6, s6, s7 // ... tmp12
+ muleq_s.w.phl v1, t7, s3 // tmp7 ...
+ muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
+ ins t1, v0, 16, 16 // ... tmp4
+ addq.ph t2, s5, s6 // tmp1
+ subq.ph t4, s5, s6 // tmp2
+ muleq_s.w.phl v0, t5, s2 // tmp6 ...
+ muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
+ ins t7, v1, 16, 16 // ... tmp7
+ addq.ph s5, t1, t7 // z11
+ subq.ph s6, t1, t7 // z12
+ muleq_s.w.phl v1, t3, s1 // tmp5 ...
+ muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
+ ins t5, v0, 16, 16 // ... tmp6
+ ins t3, v1, 16, 16 // ... tmp5
+ addq.ph s7, t5, t3 // z13
+ subq.ph v0, t5, t3 // z10
+ addq.ph t7, s5, s7 // tmp7
+ subq.ph s5, s5, s7 // tmp11 ...
+ addq.ph v1, v0, s6 // z5 ...
+ mulq_s.ph s5, s5, t8 // ... tmp11
+ lw t8, 8(AT) // FIX(1.847759065)
+ lw s4, 0(AT) // FIX(1.082392200)
+ addq.ph s0, t0, t7 // tmp0 + tmp7
+ subq.ph s1, t0, t7 // tmp0 - tmp7
+ mulq_s.ph v1, v1, t8 // ... z5
+ shll_s.ph s5, s5, 1 // x2
+ lw t8, 12(AT) // FIX(-2.613125930)
+ sw s0, 0(a2) // wsptr[DCTSIZE*0]
+ shll_s.ph v0, v0, 1 // x4
+ mulq_s.ph v0, v0, t8 // tmp12 ...
+ mulq_s.ph s4, s6, s4 // tmp10 ...
+ shll_s.ph v1, v1, 1 // x2
+ addiu a0, a0, 4 // next pair of coefficient columns
+ addiu a1, a1, 4 // next pair of quantization columns
+ sw s1, 112(a2) // wsptr[DCTSIZE*7]
+ shll_s.ph s6, v0, 1 // x4
+ shll_s.ph s4, s4, 1 // x2
+ addq.ph s6, s6, v1 // ... tmp12
+ subq.ph t5, s6, t7 // tmp6
+ subq.ph s4, s4, v1 // ... tmp10
+ subq.ph t3, s5, t5 // tmp5
+ addq.ph s2, t2, t5 // tmp1 + tmp6
+ addq.ph t1, s4, t3 // tmp4
+ subq.ph s3, t2, t5 // tmp1 - tmp6
+ sw s2, 16(a2) // wsptr[DCTSIZE*1]
+ sw s3, 96(a2) // wsptr[DCTSIZE*6]
+ addq.ph v0, t4, t3 // tmp2 + tmp5
+ subq.ph v1, t4, t3 // tmp2 - tmp5
+ sw v0, 32(a2) // wsptr[DCTSIZE*2]
+ sw v1, 80(a2) // wsptr[DCTSIZE*5]
+ addq.ph v0, t6, t1 // tmp3 + tmp4
+ subq.ph v1, t6, t1 // tmp3 - tmp4
+ sw v0, 64(a2) // wsptr[DCTSIZE*4]
+ sw v1, 48(a2) // wsptr[DCTSIZE*3]
+
+2:
+ bne a0, t9, 0b
+ addiu a2, a2, 4 // (delay slot) next pair of workspace columns
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/* Row pass of the fast integer inverse DCT: transforms two workspace rows per iteration and stores 8 output bytes to each of two output rows.
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ addiu t9, a0, 128 // end address
+ lui s8, 0x8080
+ ori s8, s8, 0x8080 // s8 = 0x80808080, added to every output byte
+
+0: // one pair of rows per iteration
+ lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
+ lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
+ lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
+ lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
+ lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
+ lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
+ lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
+ lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
+ lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
+ precrq.ph.w t1, s0, t0 // B b
+ ins t0, s0, 16, 16 // A a
+ bnez t1, 1f // element 1 nonzero: full transform
+ or s0, t2, s2
+ bnez s0, 1f // elements 2-3 nonzero: full transform
+ or s0, t4, s4
+ bnez s0, 1f // elements 4-5 nonzero: full transform
+ or s0, t6, s6
+ bnez s0, 1f // elements 6-7 nonzero: full transform
+ shll_s.ph s0, t0, 2 // A a
+ lw a3, 0(a1) // first output row pointer
+ lw AT, 4(a1) // second output row pointer
+ precrq.ph.w t0, s0, s0 // A A
+ ins s0, s0, 16, 16 // a a
+ addu a3, a3, a2 // + output_col
+ addu AT, AT, a2 // + output_col
+ precrq.qb.ph t0, t0, t0 // A A A A
+ precrq.qb.ph s0, s0, s0 // a a a a
+ addu.qb s0, s0, s8
+ addu.qb t0, t0, s8
+ sw s0, 0(a3) // eight identical samples in the first row
+ sw s0, 4(a3)
+ sw t0, 0(AT) // eight identical samples in the second row
+ sw t0, 4(AT)
+ addiu a0, a0, 32 // next pair of workspace rows
+ bne a0, t9, 0b
+ addiu a1, a1, 8 // (delay slot) next pair of output row pointers
+ b 2f
+ nop
+
+1: // full transform: regroup so each register holds one element of both rows
+ precrq.ph.w t3, s2, t2
+ ins t2, s2, 16, 16
+ precrq.ph.w t5, s4, t4
+ ins t4, s4, 16, 16
+ precrq.ph.w t7, s6, t6
+ ins t6, s6, 16, 16
+ lw t8, 4(AT) // FIX(1.414213562)
+ addq.ph s4, t0, t4 // tmp10
+ subq.ph s5, t0, t4 // tmp11
+ subq.ph s6, t2, t6 // tmp12 ...
+ addq.ph s7, t2, t6 // tmp13
+ mulq_s.ph s6, s6, t8 // ... tmp12 ...
+ addq.ph t0, s4, s7 // tmp0
+ subq.ph t6, s4, s7 // tmp3
+ shll_s.ph s6, s6, 1 // x2
+ subq.ph s6, s6, s7 // ... tmp12
+ addq.ph t2, s5, s6 // tmp1
+ subq.ph t4, s5, s6 // tmp2
+ addq.ph s5, t1, t7 // z11
+ subq.ph s6, t1, t7 // z12
+ addq.ph s7, t5, t3 // z13
+ subq.ph v0, t5, t3 // z10
+ addq.ph t7, s5, s7 // tmp7
+ subq.ph s5, s5, s7 // tmp11 ...
+ addq.ph v1, v0, s6 // z5 ...
+ mulq_s.ph s5, s5, t8 // ... tmp11
+ lw t8, 8(AT) // FIX(1.847759065)
+ lw s4, 0(AT) // FIX(1.082392200)
+ addq.ph s0, t0, t7 // tmp0 + tmp7
+ subq.ph s7, t0, t7 // tmp0 - tmp7
+ mulq_s.ph v1, v1, t8 // ... z5
+ lw a3, 0(a1) // first output row pointer
+ lw t8, 12(AT) // FIX(-2.613125930)
+ shll_s.ph s5, s5, 1 // x2
+ addu a3, a3, a2 // + output_col
+ shll_s.ph v0, v0, 1 // x4
+ mulq_s.ph v0, v0, t8 // tmp12 ...
+ mulq_s.ph s4, s6, s4 // tmp10 ...
+ shll_s.ph v1, v1, 1 // x2
+ addiu a0, a0, 32 // next pair of workspace rows
+ addiu a1, a1, 8 // next pair of output row pointers
+ shll_s.ph s6, v0, 1 // x4
+ shll_s.ph s4, s4, 1 // x2
+ addq.ph s6, s6, v1 // ... tmp12
+ shll_s.ph s0, s0, 2
+ subq.ph t5, s6, t7 // tmp6
+ subq.ph s4, s4, v1 // ... tmp10
+ subq.ph t3, s5, t5 // tmp5
+ shll_s.ph s7, s7, 2
+ addq.ph t1, s4, t3 // tmp4
+ addq.ph s1, t2, t5 // tmp1 + tmp6
+ subq.ph s6, t2, t5 // tmp1 - tmp6
+ addq.ph s2, t4, t3 // tmp2 + tmp5
+ subq.ph s5, t4, t3 // tmp2 - tmp5
+ addq.ph s4, t6, t1 // tmp3 + tmp4
+ subq.ph s3, t6, t1 // tmp3 - tmp4
+ shll_s.ph s1, s1, 2
+ shll_s.ph s2, s2, 2
+ shll_s.ph s3, s3, 2
+ shll_s.ph s4, s4, 2
+ shll_s.ph s5, s5, 2
+ shll_s.ph s6, s6, 2
+ precrq.ph.w t0, s1, s0 // B A
+ ins s0, s1, 16, 16 // b a
+ precrq.ph.w t2, s3, s2 // D C
+ ins s2, s3, 16, 16 // d c
+ precrq.ph.w t4, s5, s4 // F E
+ ins s4, s5, 16, 16 // f e
+ precrq.ph.w t6, s7, s6 // H G
+ ins s6, s7, 16, 16 // h g
+ precrq.qb.ph t0, t2, t0 // D C B A
+ precrq.qb.ph s0, s2, s0 // d c b a
+ precrq.qb.ph t4, t6, t4 // H G F E
+ precrq.qb.ph s4, s6, s4 // h g f e
+ addu.qb s0, s0, s8
+ addu.qb s4, s4, s8
+ sw s0, 0(a3) // outptr[0/1/2/3] d c b a
+ sw s4, 4(a3) // outptr[4/5/6/7] h g f e
+ lw a3, -4(a1) // second output row pointer (a1 was already advanced by 8)
+ addu.qb t0, t0, s8
+ addu a3, a3, a2 // + output_col
+ addu.qb t4, t4, s8
+ sw t0, 0(a3) // outptr[0/1/2/3] D C B A
+ bne a0, t9, 0b
+ sw t4, 4(a3) // outptr[4/5/6/7] H G F E
+
+2:
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/* Slow-but-accurate integer forward DCT, in place, on 64 halfword samples: pass 1 processes the 8 rows, pass 2 the 8 columns.
+ * a0 = data
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ lui t0, 6437 // t0..t9 = pairs of 16-bit multipliers for the dpa.w.ph dot products below
+ ori t0, 2260
+ lui t1, 9633
+ ori t1, 11363
+ lui t2, 0xd39e
+ ori t2, 0xe6dc
+ lui t3, 0xf72d
+ ori t3, 9633
+ lui t4, 2261
+ ori t4, 9633
+ lui t5, 0xd39e
+ ori t5, 6437
+ lui t6, 9633
+ ori t6, 0xd39d
+ lui t7, 0xe6dc
+ ori t7, 2260
+ lui t8, 4433
+ ori t8, 10703
+ lui t9, 0xd630
+ ori t9, 4433
+ li s8, 8 // s8 = row counter
+ move a1, a0 // a1 = row pointer
+1: // pass 1: one row per iteration
+ lw s0, 0(a1) // tmp0 = 1|0
+ lw s1, 4(a1) // tmp1 = 3|2
+ lw s2, 8(a1) // tmp2 = 5|4
+ lw s3, 12(a1) // tmp3 = 7|6
+ packrl.ph s1, s1, s1 // tmp1 = 2|3
+ packrl.ph s3, s3, s3 // tmp3 = 6|7
+ subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
+ subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
+ mult $0, $0 // ac0 = 0
+ dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
+ dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
+ mult $ac1, $0, $0 // ac1 = 0
+ dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
+ dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
+ mult $ac2, $0, $0 // ac2 = 0
+ dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
+ dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
+ mult $ac3, $0, $0 // ac3 = 0
+ dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
+ dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
+ addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
+ addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
+ extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
+ extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
+ extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
+ extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
+ addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
+ subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
+ sh s0, 2(a1) // row element 1
+ sh s1, 6(a1) // row element 3
+ sh s2, 10(a1) // row element 5
+ sh s3, 14(a1) // row element 7
+ mult $0, $0 // ac0 = 0
+ dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
+ mult $ac1, $0, $0 // ac1 = 0
+ dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
+ sra s4, s5, 16 // tmp4 = t11
+ addiu a1, a1, 16 // next row (stores below use negative offsets)
+ addiu s8, s8, -1
+ extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
+ extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
+ addu s2, s5, s4 // tmp2 = t10 + t11
+ subu s3, s5, s4 // tmp3 = t10 - t11
+ sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
+ sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
+ sh s2, -16(a1) // row element 0
+ sh s3, -8(a1) // row element 4
+ sh s0, -12(a1) // row element 2
+ bgtz s8, 1b
+ sh s1, -4(a1) // (delay slot) row element 6
+ li t0, 2260 // c0
+ li t1, 11363 // c1
+ li t2, 9633 // c2
+ li t3, 6436 // c3
+ li t4, 6437 // c4
+ li t5, 2261 // c5
+ li t6, 11362 // c6
+ li t7, 2259 // c7
+ li t8, 4433 // c8
+ li t9, 10703 // c9
+ li a1, 10704 // c10
+ li s8, 8 // s8 = column counter
+
+2: // pass 2: one column per iteration
+ lh a2, 0(a0) // 0
+ lh a3, 16(a0) // 8
+ lh v0, 32(a0) // 16
+ lh v1, 48(a0) // 24
+ lh s4, 64(a0) // 32
+ lh s5, 80(a0) // 40
+ lh s6, 96(a0) // 48
+ lh s7, 112(a0) // 56
+ addu s2, v0, s5 // tmp2 = 16 + 40
+ subu s5, v0, s5 // tmp5 = 16 - 40
+ addu s3, v1, s4 // tmp3 = 24 + 32
+ subu s4, v1, s4 // tmp4 = 24 - 32
+ addu s0, a2, s7 // tmp0 = 0 + 56
+ subu s7, a2, s7 // tmp7 = 0 - 56
+ addu s1, a3, s6 // tmp1 = 8 + 48
+ subu s6, a3, s6 // tmp6 = 8 - 48
+ addu a2, s0, s3 // tmp10 = tmp0 + tmp3
+ subu v1, s0, s3 // tmp13 = tmp0 - tmp3
+ addu a3, s1, s2 // tmp11 = tmp1 + tmp2
+ subu v0, s1, s2 // tmp12 = tmp1 - tmp2
+ mult s7, t1 // ac0 = tmp7 * c1
+ madd s4, t0 // ac0 += tmp4 * c0
+ madd s5, t4 // ac0 += tmp5 * c4
+ madd s6, t2 // ac0 += tmp6 * c2
+ mult $ac1, s7, t2 // ac1 = tmp7 * c2
+ msub $ac1, s4, t3 // ac1 -= tmp4 * c3
+ msub $ac1, s5, t6 // ac1 -= tmp5 * c6
+ msub $ac1, s6, t7 // ac1 -= tmp6 * c7
+ mult $ac2, s7, t4 // ac2 = tmp7 * c4
+ madd $ac2, s4, t2 // ac2 += tmp4 * c2
+ madd $ac2, s5, t5 // ac2 += tmp5 * c5
+ msub $ac2, s6, t6 // ac2 -= tmp6 * c6
+ mult $ac3, s7, t0 // ac3 = tmp7 * c0
+ msub $ac3, s4, t1 // ac3 -= tmp4 * c1
+ madd $ac3, s5, t2 // ac3 += tmp5 * c2
+ msub $ac3, s6, t3 // ac3 -= tmp6 * c3
+ extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
+ extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
+ extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
+ extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
+ addiu s8, s8, -1
+ addu s4, a2, a3 // tmp4 = tmp10 + tmp11
+ subu s5, a2, a3 // tmp5 = tmp10 - tmp11
+ sh s0, 16(a0) // column element 1
+ sh s1, 48(a0) // column element 3
+ sh s2, 80(a0) // column element 5
+ sh s3, 112(a0) // column element 7
+ mult v0, t8 // ac0 = tmp12 * c8
+ madd v1, t9 // ac0 += tmp13 * c9
+ mult $ac1, v1, t8 // ac1 = tmp13 * c8
+ msub $ac1, v0, a1 // ac1 -= tmp12 * c10
+ addiu a0, a0, 2 // next column (stores below are relative to the advanced pointer)
+ extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
+ extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
+ shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
+ shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
+ sh s4, -2(a0) // column element 0
+ sh s5, 62(a0) // column element 4
+ sh s6, 30(a0) // column element 2
+ bgtz s8, 2b
+ sh s7, 94(a0) // (delay slot) column element 6
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ jr ra
+ nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/* Fast integer forward DCT, in place, on 64 halfword samples: pass 1 processes the 8 rows, pass 2 the 8 columns.
+ * a0 = data
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
+ li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
+ li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
+ li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
+
+ move v0, a0 // v0 = row pointer
+ addiu v1, v0, 128 // end address
+
+0: // pass 1: one row per iteration
+ lw t0, 0(v0) // tmp0 = 1|0
+ lw t1, 4(v0) // tmp1 = 3|2
+ lw t2, 8(v0) // tmp2 = 5|4
+ lw t3, 12(v0) // tmp3 = 7|6
+ packrl.ph t1, t1, t1 // tmp1 = 2|3
+ packrl.ph t3, t3, t3 // tmp3 = 6|7
+ subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
+ subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
+ addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
+ addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
+ addq.ph t8, t4, t6 // t8 = t1+t2|t0+t3 = t11|t10
+ subq.ph t9, t4, t6 // t9 = t1-t2|t0-t3 = t12|t13
+ sra t4, t8, 16 // t4 = t11
+ mult $0, $0 // ac0 = 0
+ dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
+ mult $ac1, $0, $0 // ac1 = 0
+ dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
+ dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98
+ mult $ac2, $0, $0 // ac2 = 0
+ dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
+ mult $ac3, $0, $0 // ac3 = 0
+ dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
+ precrq.ph.w t0, t5, t7 // t0 = t6|t5
+ addq.ph t2, t8, t4 // tmp2 = t10 + t11
+ subq.ph t3, t8, t4 // tmp3 = t10 - t11
+ extr.w t4, $ac0, 8 // t4 = z1 = MULTIPLY(tmp12 + tmp13, 181)
+ mult $0, $0 // ac0 = 0
+ dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
+ extr.w t0, $ac1, 8 // t0 = z5
+ extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
+ extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
+ extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
+ add t6, t1, t0 // t6 = z2
+ add t7, t7, t0 // t7 = z4
+ subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
+ addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
+ addq.ph t1, t0, t6 // t1 = z13 + z2
+ subq.ph t6, t0, t6 // t6 = z13 - z2
+ addq.ph t0, t8, t7 // t0 = z11 + z4
+ subq.ph t7, t8, t7 // t7 = z11 - z4
+ addq.ph t5, t4, t9 // t5 = tmp13 + z1
+ subq.ph t4, t9, t4 // t4 = tmp13 - z1
+ sh t2, 0(v0) // row element 0
+ sh t5, 4(v0) // row element 2
+ sh t3, 8(v0) // row element 4
+ sh t4, 12(v0) // row element 6
+ sh t1, 10(v0) // row element 5
+ sh t6, 6(v0) // row element 3
+ sh t0, 2(v0) // row element 1
+ sh t7, 14(v0) // row element 7
+ addiu v0, 16 // next row
+ bne v1, v0, 0b
+ nop
+ move v0, a0 // v0 = column pointer
+ addiu v1, v0, 16 // end address
+
+1: // pass 2: one column per iteration
+ lh t0, 0(v0) // 0
+ lh t1, 16(v0) // 8
+ lh t2, 32(v0) // 16
+ lh t3, 48(v0) // 24
+ lh t4, 64(v0) // 32
+ lh t5, 80(v0) // 40
+ lh t6, 96(v0) // 48
+ lh t7, 112(v0) // 56
+ add t8, t0, t7 // t8 = tmp0
+ sub t7, t0, t7 // t7 = tmp7
+ add t0, t1, t6 // t0 = tmp1
+ sub t1, t1, t6 // t1 = tmp6
+ add t6, t2, t5 // t6 = tmp2
+ sub t5, t2, t5 // t5 = tmp5
+ add t2, t3, t4 // t2 = tmp3
+ sub t3, t3, t4 // t3 = tmp4
+ add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
+ sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
+ sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
+ ins t8, s0, 16, 16 // t8 = tmp12|tmp13
+ add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
+ mult $0, $0 // ac0 = 0
+ dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
+ add s0, t4, t2 // s0 = tmp10+tmp11
+ sub t4, t4, t2 // t4 = tmp10-tmp11
+ sh s0, 0(v0) // column element 0
+ sh t4, 64(v0) // column element 4
+ extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
+ addq.ph t4, t8, t2 // t4 = tmp13 + z1
+ subq.ph t8, t8, t2 // t8 = tmp13 - z1
+ sh t4, 32(v0) // column element 2
+ sh t8, 96(v0) // column element 6
+ add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
+ add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
+ add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
+ andi t4, a1, 0xffff // t4 = 334
+ mul s0, t1, t4
+ sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
+ ins t1, t3, 16, 16 // t1 = tmp10|tmp12
+ mult $0, $0 // ac0 = 0
+ mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
+ extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
+ add t2, t7, t8 // t2 = tmp7 + z5
+ sub t7, t7, t8 // t7 = tmp7 - z5
+ andi t4, a2, 0xffff // t4 = 139
+ mul t8, t3, t4
+ sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
+ andi t4, s1, 0xffff // t4 = 181
+ mul t6, t0, t4
+ sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
+ add t0, t6, t8 // t0 = z3 + z2
+ sub t1, t6, t8 // t1 = z3 - z2
+ add t3, t6, s0 // t3 = z3 + z4
+ sub t4, t6, s0 // t4 = z3 - z4
+ sub t5, t2, t1 // t5 = dataptr[5]
+ sub t6, t7, t0 // t6 = dataptr[3]
+ add t3, t2, t3 // t3 = dataptr[1]
+ add t4, t7, t4 // t4 = dataptr[7]
+ sh t5, 80(v0)
+ sh t6, 48(v0)
+ sh t3, 16(v0)
+ sh t4, 112(v0)
+ addiu v0, 2 // next column
+ bne v0, v1, 1b
+ nop
+
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ *
+ * For every 16-bit workspace element x this routine stores, as a halfword
+ * in coef_block,
+ *   sign(x) * ((((|x| + C) & 0xffff) * (M & 0xffff)) >> (S + 16))
+ * where M, C and S are the halfwords located 0, 128 and 384 bytes from the
+ * matching position in divisors (">>" is the arithmetic shift done by srav
+ * on the low 32 bits of the product).
+ *
+ * Two elements are handled per loop iteration.  The loop is software
+ * pipelined: operands of the next pair are loaded while the current pair is
+ * being finished, so the last pair is completed by the code after the loop.
+ * 31 loop iterations (124 / 4) plus that tail cover 64 elements.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+ addiu v0, a2, 124 // v0 = workspace + 124 bytes (last element pair; loop sentinel)
+ lh t0, 0(a2) // t0 = x0 (first workspace element)
+ lh t1, 0(a1) // t1 = M0 (multiplier for element 0)
+ lh t2, 128(a1) // t2 = C0 (addend for element 0)
+ sra t3, t0, 15 // t3 = 0 if x0 >= 0, -1 if x0 < 0
+ sll t3, t3, 1 // t3 = 0 or -2
+ addiu t3, t3, 1 // t3 = +1 or -1 = sign(x0)
+ mul t0, t0, t3 // t0 = |x0|
+ lh t4, 384(a1) // t4 = S0 (shift for element 0)
+ lh t5, 130(a1) // t5 = C1 (addend for element 1)
+ lh t6, 2(a2) // t6 = x1 (second workspace element)
+ lh t7, 2(a1) // t7 = M1 (multiplier for element 1)
+ lh t8, 386(a1) // t8 = S1 (shift for element 1)
+
+1:
+ andi t1, 0xffff // treat multiplier M0 as unsigned 16-bit
+ add t9, t0, t2 // t9 = |x0| + C0
+ andi t9, 0xffff // keep low 16 bits of the sum
+ mul v1, t9, t1 // v1 = (|x0| + C0) * M0
+ sra s0, t6, 15 // s0 = 0 or -1 from sign of x1
+ sll s0, s0, 1
+ addiu s0, s0, 1 // s0 = sign(x1) = +1 or -1
+ addiu t9, t4, 16 // t9 = S0 + 16
+ srav v1, v1, t9 // v1 >>= (S0 + 16)
+ mul v1, v1, t3 // restore the sign of x0
+ mul t6, t6, s0 // t6 = |x1|
+ andi t7, 0xffff // treat multiplier M1 as unsigned 16-bit
+ addiu a2, a2, 4 // advance workspace by two halfwords
+ addiu a1, a1, 4 // advance divisors by two halfwords
+ add s1, t6, t5 // s1 = |x1| + C1
+ andi s1, 0xffff
+ sh v1, 0(a0) // store first quantized coefficient of the pair
+
+ mul s2, s1, t7 // s2 = (|x1| + C1) * M1
+ addiu s1, t8, 16 // s1 = S1 + 16
+ srav s2, s2, s1 // s2 >>= (S1 + 16)
+ mul s2, s2, s0 // restore the sign of x1
+ lh t0, 0(a2) // preload next pair: x0
+ lh t1, 0(a1) // next M0
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1 // t3 = sign(next x0)
+ mul t0, t0, t3 // t0 = |next x0|
+ lh t2, 128(a1) // next C0
+ lh t4, 384(a1) // next S0
+ lh t5, 130(a1) // next C1
+ lh t8, 386(a1) // next S1
+ lh t6, 2(a2) // next x1
+ lh t7, 2(a1) // next M1
+ sh s2, 2(a0) // store second quantized coefficient of the pair
+ lh t0, 0(a2) // NOTE(review): reloads the same halfword as above (a2 is unchanged) and
+ sra t3, t0, 15 // repeats the identical sign/abs sequence; looks redundant unless
+ sll t3, t3, 1 // the store through a0 just above can alias workspace - confirm
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ bne a2, v0, 1b // loop until a2 reaches the last pair
+ addiu a0, a0, 4 // advance coef_block (presumably the branch delay slot; assumes noreorder is set outside this view)
+
+ andi t1, 0xffff // tail: finish the last pair with the same steps as the loop body
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3
+ mul t6, t6, s0
+ andi t7, 0xffff
+ sh v1, 0(a0)
+ add s1, t6, t5
+ andi s1, 0xffff
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ addiu a2, a2, 4 // a2 and a1 are not read again in this function
+ addiu a1, a1, 4
+ srav s2, s2, s1
+ mul s2, s2, s0
+ sh s2, 2(a0)
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+ j ra
+ nop
+
+END(jsimd_quantize_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ *
+ * workspace and divisors are read as single-precision floats (lwc1);
+ * coef_block is written as halfwords.  Each output is
+ *   (int)(workspace[i] * divisors[i] + 16384.5) - 16384
+ * where the float-to-int conversion is trunc.w.s.  Adding 16384.5 before
+ * the conversion and subtracting 16384 afterwards keeps the converted
+ * value positive for the expected input range, so the truncation rounds
+ * to nearest.
+ *
+ * Eight values are produced per loop iteration; t0 counts 63, 55, ..., 7
+ * and the loop ends when it goes negative (8 iterations, 64 values).
+ */
+ .set at
+
+ li t1, 0x46800100 // integer representation 16384.5
+ mtc1 t1, f0 // f0 = 16384.5f (constant addend)
+ li t0, 63 // loop counter, decremented by 8 per iteration
+0:
+ lwc1 f2, 0(a2) // workspace[0..3] in f2/f4/f6/f8, divisors[0..3] in f10/f12/f14/f16
+ lwc1 f10, 0(a1)
+ lwc1 f4, 4(a2)
+ lwc1 f12, 4(a1)
+ lwc1 f6, 8(a2)
+ lwc1 f14, 8(a1)
+ lwc1 f8, 12(a2)
+ lwc1 f16, 12(a1)
+ madd.s f2, f0, f2, f10 // f2 = f2 * f10 + 16384.5
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ lwc1 f10, 16(a1) // start loading divisors[4..7]
+ lwc1 f12, 20(a1)
+ trunc.w.s f2, f2 // convert to 32-bit integer, truncating
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ lwc1 f14, 24(a1)
+ lwc1 f16, 28(a1)
+ mfc1 t1, f2 // move the four integer results to t1..t4
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ lwc1 f2, 16(a2) // workspace[4..7]
+ lwc1 f4, 20(a2)
+ lwc1 f6, 24(a2)
+ lwc1 f8, 28(a2)
+ madd.s f2, f0, f2, f10
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ addiu t1, t1, -16384 // remove the 16384 bias from results 0..3
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ sh t1, 0(a0) // store coefficients 0..3
+ sh t2, 2(a0)
+ sh t3, 4(a0)
+ sh t4, 6(a0)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ addiu t0, t0, -8 // eight values consumed
+ addiu a2, a2, 32 // advance workspace by 8 floats
+ addiu a1, a1, 32 // advance divisors by 8 floats
+ addiu t1, t1, -16384 // remove the 16384 bias from results 4..7
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ sh t1, 8(a0) // store coefficients 4..7
+ sh t2, 10(a0)
+ sh t3, 12(a0)
+ sh t4, 14(a0)
+ bgez t0, 0b // repeat while the counter is still non-negative
+ addiu a0, a0, 16 // advance coef_block by 8 halfwords (presumably the branch delay slot; assumes noreorder is set outside this view)
+
+ j ra
+ nop
+
+END(jsimd_quantize_float_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ *
+ * Reduced-size inverse DCT producing a 2x2 block of output samples.
+ *
+ * Pass 1 is fully unrolled over the columns at halfword offsets 0, 1, 3, 5
+ * and 7 of coef_block.  For each column it uses rows 0, 1, 3, 5 and 7
+ * (byte offsets +0, +16, +48, +80, +112), multiplies each coefficient by
+ * the matching dct_table entry, and writes two 32-bit results to a 40-byte
+ * scratch area on the stack: upper row at bytes 0..19, lower row at bytes
+ * 20..39.  The final add/sub of one column is interleaved with the loads
+ * of the next column.
+ *
+ * Pass 2 combines the five values of each scratch row into two output
+ * samples, written to output_buf[0] + output_col and
+ * output_buf[1] + output_col.
+ *
+ * The odd-part multipliers are 29692, -10426, 6967 and -5906 (applied to
+ * rows/columns 1, 3, 5 and 7 respectively).
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ addiu sp, sp, -40 // reserve the 2 x 5 word scratch area
+ move v0, sp // v0 = scratch base
+ addiu s2, zero, 29692 // scalar copies of the four multipliers (used in pass 2)
+ addiu s3, zero, -10426
+ addiu s4, zero, 6967
+ addiu s5, zero, -5906
+ lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
+ lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
+ lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
+ lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
+ mul t4, t5, t0 // t4 = dequantized row 0 (column 0)
+ lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
+ lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
+ mul t6, t6, t1 // t6 = dequantized row 3
+ mul t5, t5, t0 // t5 = dequantized row 1
+ lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
+ lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
+ lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
+ lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
+ mul t7, t7, t2 // t7 = dequantized row 5
+ mult zero, zero // ac0 = 0
+ mul t8, t8, t3 // t8 = dequantized row 7
+ li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
+ li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
+ ins t6, t5, 16, 16 // t6 = t5|t6
+ sll t4, t4, 15 // tmp10 = row 0 << 15
+ dpa.w.ph $ac0, t6, s0 // ac0 += row1*29692 + row3*(-10426)
+ lh t1, 2(a1) // begin column 1: row 0 coefficient
+ lh t6, 2(a0)
+ ins t8, t7, 16, 16 // t8 = t7|t8
+ dpa.w.ph $ac0, t8, s1 // ac0 += row5*6967 + row7*(-5906)
+ mflo t0, $ac0 // t0 = tmp0 (odd part) of column 0
+ mul t5, t6, t1 // t5 = dequantized row 0 of column 1
+ lh t1, 18(a1) // column 1, row 1
+ lh t6, 18(a0)
+ lh t2, 50(a1) // column 1, row 3
+ lh t7, 50(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0 // column 0: tmp10 - tmp0
+ mul t7, t7, t2
+ addu t0, t4, t0 // column 0: tmp10 + tmp0
+ shra_r.w t0, t0, 13 // rounded descale by 13 bits
+ lh t1, 82(a1) // column 1, row 5
+ lh t2, 82(a0)
+ lh t3, 114(a1) // column 1, row 7
+ lh t4, 114(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 0(v0) // scratch upper row, slot 0 (column 0)
+ sw t8, 20(v0) // scratch lower row, slot 0
+ sll t4, t5, 15 // column 1: tmp10
+ ins t7, t6, 16, 16 // t7 = row1|row3
+ mult zero, zero // ac0 = 0
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16 // t3 = row5|row7
+ lh t1, 6(a1) // begin column 3: row 0 coefficient
+ lh t6, 6(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 // t0 = tmp0 of column 1
+ mul t5, t6, t1
+ lh t1, 22(a1)
+ lh t6, 22(a0)
+ lh t2, 54(a1)
+ lh t7, 54(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 86(a1)
+ lh t2, 86(a0)
+ lh t3, 118(a1)
+ lh t4, 118(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 4(v0) // scratch upper row, slot 1 (column 1)
+ sw t8, 24(v0) // scratch lower row, slot 1
+ sll t4, t5, 15 // column 3: tmp10
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 10(a1) // begin column 5: row 0 coefficient
+ lh t6, 10(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 // t0 = tmp0 of column 3
+ mul t5, t6, t1
+ lh t1, 26(a1)
+ lh t6, 26(a0)
+ lh t2, 58(a1)
+ lh t7, 58(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 90(a1)
+ lh t2, 90(a0)
+ lh t3, 122(a1)
+ lh t4, 122(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 8(v0) // scratch upper row, slot 2 (column 3)
+ sw t8, 28(v0) // scratch lower row, slot 2
+ sll t4, t5, 15 // column 5: tmp10
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 14(a1) // begin column 7: row 0 coefficient
+ lh t6, 14(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 // t0 = tmp0 of column 5
+ mul t5, t6, t1
+ lh t1, 30(a1)
+ lh t6, 30(a0)
+ lh t2, 62(a1)
+ lh t7, 62(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 94(a1)
+ lh t2, 94(a0)
+ lh t3, 126(a1)
+ lh t4, 126(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 12(v0) // scratch upper row, slot 3 (column 5)
+ sw t8, 32(v0) // scratch lower row, slot 3
+ sll t4, t5, 15 // column 7: tmp10
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 // t0 = tmp0 of column 7
+ lw t9, 0(a2) // pass 2 begins: t9 = output_buf[0]
+ lw t3, 0(v0) // upper row, slot 0
+ lw t7, 4(v0) // upper row, slot 1
+ lw t1, 8(v0) // upper row, slot 2
+ addu t9, t9, a3 // t9 = output_buf[0] + output_col
+ sll t3, t3, 15 // upper row tmp10 = slot 0 << 15
+ subu t8, t4, t0 // finish column 7 of pass 1
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ shra_r.w t8, t8, 13
+ sw t0, 16(v0) // scratch upper row, slot 4 (column 7)
+ sw t8, 36(v0) // scratch lower row, slot 4
+ lw t5, 12(v0) // upper row, slot 3
+ lw t6, 16(v0) // upper row, slot 4
+ mult t7, s2 // ac0 = slot1*29692
+ madd t1, s3 // ac0 += slot2*(-10426)
+ madd t5, s4 // ac0 += slot3*6967
+ madd t6, s5 // ac0 += slot4*(-5906)
+ lw t5, 24(v0) // lower row, slot 1
+ lw t7, 28(v0) // lower row, slot 2
+ mflo t0, $ac0 // t0 = tmp0 of the upper row
+ lw t8, 32(v0) // lower row, slot 3
+ lw t2, 36(v0) // lower row, slot 4
+ mult $ac1, t5, s2 // same sum of products for the lower row, in ac1
+ madd $ac1, t7, s3
+ madd $ac1, t8, s4
+ madd $ac1, t2, s5
+ addu t1, t3, t0 // upper row: tmp10 + tmp0
+ subu t6, t3, t0 // upper row: tmp10 - tmp0
+ shra_r.w t1, t1, 20 // rounded descale by 20 bits
+ shra_r.w t6, t6, 20
+ mflo t4, $ac1 // t4 = tmp0 of the lower row
+ shll_s.w t1, t1, 24 // saturating shift left + arithmetic shift right:
+ shll_s.w t6, t6, 24 // clamps the value to the signed 8-bit range
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128 // re-centre to 0..255
+ addiu t6, t6, 128
+ lw t0, 20(v0) // lower row, slot 0
+ sb t1, 0(t9) // upper output row, sample 0
+ sb t6, 1(t9) // upper output row, sample 1
+ sll t0, t0, 15 // lower row tmp10
+ lw t9, 4(a2) // t9 = output_buf[1]
+ addu t1, t0, t4
+ subu t6, t0, t4
+ addu t9, t9, a3 // t9 = output_buf[1] + output_col
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ shll_s.w t1, t1, 24
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ sb t1, 0(t9) // lower output row, sample 0
+ sb t6, 1(t9) // lower output row, sample 1
+ addiu sp, sp, 40 // release the scratch area
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes
+ *
+ * Reduced-size inverse DCT producing a 4x4 block of output samples.
+ *
+ * Pass 1 processes columns 0..3 in the first loop and columns 5..7 in the
+ * second loop (column 4 is never read or written).  For each column it
+ * dequantizes rows 0, 1, 2, 3, 5, 6 and 7 and stores four 32-bit results
+ * into the workspace, one per workspace row (row stride 32 bytes).
+ *
+ * Pass 2 is unrolled four times, once per workspace row, and writes four
+ * samples to output_buf[row] + output_col.
+ *
+ * s0..s3 each hold two signed 16-bit multipliers for dpa.w.ph:
+ *   s0 = 11893 | -1730    (applied to row5 | row7)
+ *   s1 =  8697 | -17799   (applied to row1 | row3)
+ *   s2 = -4926 | -4176    (applied to row5 | row7)
+ *   s3 = 20995 |  7373    (applied to row1 | row3)
+ * ac0 accumulates with s0/s1 (called temp1 below) and ac1 with s2/s3
+ * (called temp2 below).
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw v1, 48(sp) // v1 = workspace (the 16(sp) argument, now 32 bytes further up after the save)
+ move t0, a1 // t0 = inptr
+ move t1, v1 // t1 = wsptr
+ li t9, 4 // first loop handles 4 columns
+ li s0, 0x2e75f93e // 11893 | -1730
+ li s1, 0x21f9ba79 // 8697 | -17799
+ li s2, 0xecc2efb0 // -4926 | -4176
+ li s3, 0x52031ccd // 20995 | 7373
+
+0:
+ lh s6, 32(t0) // inptr[DCTSIZE*2]
+ lh t6, 32(a0) // quantptr[DCTSIZE*2]
+ lh s7, 96(t0) // inptr[DCTSIZE*6]
+ lh t7, 96(a0) // quantptr[DCTSIZE*6]
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+ lh s4, 0(t0) // inptr[DCTSIZE*0]
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+ lh s5, 0(a0) // quantptr[0]
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
+ mul t6, s6, t6 // t6 = z2 * 15137
+ lh t5, 112(t0) // inptr[DCTSIZE*7]
+ mul t7, s7, t7 // t7 = z3 * 6270
+ lh s4, 112(a0) // quantptr[DCTSIZE*7]
+ lh v0, 80(t0) // inptr[DCTSIZE*5]
+ lh s5, 80(a0) // quantptr[DCTSIZE*5]
+ lh s6, 48(a0) // quantptr[DCTSIZE*3]
+ sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
+ lh s7, 16(a0) // quantptr[DCTSIZE*1]
+ lh t8, 16(t0) // inptr[DCTSIZE*1]
+ subu t6, t6, t7 // tmp2 = z2 * 15137 - z3 * 6270
+ lh t7, 48(t0) // inptr[DCTSIZE*3]
+ mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+ mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+ mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+ mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+ addu t3, t2, t6 // tmp10 = tmp0 + tmp2
+ subu t4, t2, t6 // tmp12 = tmp0 - tmp2
+ mult $ac0, zero, zero // ac0 = 0
+ mult $ac1, zero, zero // ac1 = 0
+ ins t5, v0, 16, 16 // t5 = row5|row7
+ ins t7, t8, 16, 16 // t7 = row1|row3
+ addiu t9, t9, -1 // one column done
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1 // ac0 = temp1
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3 // ac1 = temp2
+ mflo s4, $ac0
+ mflo s5, $ac1
+ addiu a0, a0, 2 // next column of the quantization table
+ addiu t1, t1, 4 // next workspace column (stores below use offsets relative to the new t1)
+ addiu t0, t0, 2 // next column of the coefficient block
+ addu t6, t4, s4
+ subu t5, t4, s4
+ addu s6, t3, s5
+ subu s7, t3, s5
+ shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
+ shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
+ shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
+ shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
+ sw t6, 28(t1) // workspace row 1 of this column
+ sw t5, 60(t1) // workspace row 2
+ sw s6, -4(t1) // workspace row 0
+ bgtz t9, 0b
+ sw s7, 92(t1) // workspace row 3 (presumably the branch delay slot; assumes noreorder is set outside this view)
+ // second loop: three more columns (5, 6, 7); the +2 byte offsets skip column 4
+ li t9, 3
+1:
+ lh s6, 34(t0) // inptr[DCTSIZE*2]
+ lh t6, 34(a0) // quantptr[DCTSIZE*2]
+ lh s7, 98(t0) // inptr[DCTSIZE*6]
+ lh t7, 98(a0) // quantptr[DCTSIZE*6]
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+ lh s4, 2(t0) // inptr[DCTSIZE*0]
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+ lh s5, 2(a0) // quantptr[DCTSIZE*0]
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
+ mul v0, s6, t6 // v0 = z2 * 15137
+ lh t5, 114(t0) // inptr[DCTSIZE*7]
+ mul t7, s7, t7 // t7 = z3 * 6270
+ lh s4, 114(a0) // quantptr[DCTSIZE*7]
+ lh s5, 82(a0) // quantptr[DCTSIZE*5]
+ lh t6, 82(t0) // inptr[DCTSIZE*5]
+ sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
+ lh s6, 50(a0) // quantptr[DCTSIZE*3]
+ lh t8, 18(t0) // inptr[DCTSIZE*1]
+ subu v0, v0, t7 // tmp2 = z2 * 15137 - z3 * 6270
+ lh t7, 50(t0) // inptr[DCTSIZE*3]
+ lh s7, 18(a0) // quantptr[DCTSIZE*1]
+ mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+ mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+ mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+ mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+ addu t3, t2, v0 // tmp10 = tmp0 + tmp2
+ subu t4, t2, v0 // tmp12 = tmp0 - tmp2
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, t6, 16, 16 // t5 = row5|row7
+ ins t7, t8, 16, 16 // t7 = row1|row3
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ mflo t5, $ac0 // t5 = temp1
+ mflo t6, $ac1 // t6 = temp2
+ addiu t9, t9, -1
+ addiu t0, t0, 2
+ addiu a0, a0, 2
+ addiu t1, t1, 4
+ addu s5, t4, t5
+ subu s4, t4, t5
+ addu s6, t3, t6
+ subu s7, t3, t6
+ shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
+ shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
+ shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
+ shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
+ sw s5, 32(t1) // workspace row 1 of this column
+ sw s4, 64(t1) // workspace row 2
+ sw s6, 0(t1) // workspace row 0
+ bgtz t9, 1b
+ sw s7, 96(t1) // workspace row 3
+ move t1, v1 // pass 2, output row 0: t1 = workspace base
+ li s4, 15137
+ lw s6, 8(t1) // wsptr[2]
+ li s5, 6270
+ lw s7, 24(t1) // wsptr[6]
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+ lw t2, 0(t1) // wsptr[0]
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865); subtracted below
+ lh t5, 28(t1) // wsptr[7] (lh reads one halfword of the 32-bit slot; NOTE(review): which half depends on endianness - presumably little-endian only, confirm)
+ lh t6, 20(t1) // wsptr[5]
+ lh t7, 12(t1) // wsptr[3]
+ lh t8, 4(t1) // wsptr[1]
+ ins t5, t6, 16, 16 // t5 = ws5|ws7
+ ins t7, t8, 16, 16 // t7 = ws1|ws3
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+ mflo s6, $ac0 // s6 = temp1
+ // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+ subu s4, s4, s5
+ addu t3, t2, s4 // tmp10 = tmp0 + tmp2
+ mflo s7, $ac1 // s7 = temp2
+ subu t4, t2, s4 // tmp12 = tmp0 - tmp2
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
+ sll s4, t9, 2 // NOTE(review): result unused - s4 is overwritten by the next "li s4" before being read
+ lw v0, 0(a2) // output_buf[ctr]
+ shll_s.w t5, t5, 24 // saturating shift left + arithmetic shift right:
+ shll_s.w t6, t6, 24 // clamps each value to the signed 8-bit range
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
+ addiu t5, t5, 128 // re-centre to 0..255
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0) // outptr[0] = tmp10 + temp2
+ sb t7, 1(v0) // outptr[1] = tmp12 + temp1
+ sb t8, 2(v0) // outptr[2] = tmp12 - temp1
+ sb t6, 3(v0) // outptr[3] = tmp10 - temp2
+ // output row 1 (workspace row at byte offset 32)
+ li s4, 15137
+ lw s6, 40(t1) // wsptr[2]
+ li s5, 6270
+ lw s7, 56(t1) // wsptr[6]
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+ lw t2, 32(t1) // wsptr[0]
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865); subtracted below
+ lh t5, 60(t1) // wsptr[7]
+ lh t6, 52(t1) // wsptr[5]
+ lh t7, 44(t1) // wsptr[3]
+ lh t8, 36(t1) // wsptr[1]
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+ mflo s6, $ac0
+ // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+ subu s4, s4, s5
+ addu t3, t2, s4 // tmp10 = tmp0 + tmp2
+ mflo s7, $ac1
+ subu t4, t2, s4 // tmp12 = tmp0 - tmp2
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
+ sll s4, t9, 2 // NOTE(review): result unused - s4 is overwritten by the next "li s4"
+ lw v0, 4(a2) // output_buf[ctr]
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ // output row 2 (workspace row at byte offset 64)
+ li s4, 15137
+ lw s6, 72(t1) // wsptr[2]
+ li s5, 6270
+ lw s7, 88(t1) // wsptr[6]
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+ lw t2, 64(t1) // wsptr[0]
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865); subtracted below
+ lh t5, 92(t1) // wsptr[7]
+ lh t6, 84(t1) // wsptr[5]
+ lh t7, 76(t1) // wsptr[3]
+ lh t8, 68(t1) // wsptr[1]
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+ mflo s6, $ac0
+ // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+ subu s4, s4, s5
+ addu t3, t2, s4 // tmp10 = tmp0 + tmp2
+ mflo s7, $ac1
+ subu t4, t2, s4 // tmp12 = tmp0 - tmp2
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
+ sll s4, t9, 2 // NOTE(review): result unused - s4 is overwritten by the next "li s4"
+ lw v0, 8(a2) // output_buf[ctr]
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ li s4, 15137 // output row 3 (workspace row at byte offset 96)
+ lw s6, 104(t1) // wsptr[2]
+ li s5, 6270
+ lw s7, 120(t1) // wsptr[6]
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+ lw t2, 96(t1) // wsptr[0]
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865); subtracted below
+ lh t5, 124(t1) // wsptr[7]
+ lh t6, 116(t1) // wsptr[5]
+ lh t7, 108(t1) // wsptr[3]
+ lh t8, 100(t1) // wsptr[1]
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+ mflo s6, $ac0
+ // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+ subu s4, s4, s5
+ addu t3, t2, s4 // tmp10 = tmp0 + tmp2
+ mflo s7, $ac1
+ subu t4, t2, s4 // tmp12 = tmp0 - tmp2
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
+ sll s4, t9, 2 // NOTE(review): result unused - s4 is not read again before it is restored
+ lw v0, 12(a2) // output_buf[ctr]
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ *
+ * Inverse DCT producing a 6x6 block of output samples.
+ *
+ * Pass 1 walks six columns of coef_block.  For each column it dequantizes
+ * rows 0..5 (byte offsets +0, +16, ... +80) and stores six 32-bit results
+ * into a 144-byte work array on the stack (6 rows x 6 words, row stride
+ * 24 bytes).
+ *
+ * Pass 2 walks the six rows of the work array and writes six samples per
+ * row to output_buf[row] + output_col.
+ *
+ * Multipliers kept in registers for both passes:
+ *   t9 = 5793, s0 = 10033, s1 = 2998
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -144 // reserve the 6 x 6 word work array
+ move v0, sp // v0 = work array write pointer (one column per iteration)
+ addiu v1, v0, 24 // v1 = end of the first work-array row; pass 1 loop sentinel
+ addiu t9, zero, 5793
+ addiu s0, zero, 10033
+ addiu s1, zero, 2998
+
+1:
+ lh s2, 0(a0) // q0 = quantptr[ 0]
+ lh s3, 32(a0) // q1 = quantptr[16]
+ lh s4, 64(a0) // q2 = quantptr[32]
+ lh t2, 64(a1) // tmp2 = inptr[32]
+ lh t1, 32(a1) // tmp1 = inptr[16]
+ lh t0, 0(a1) // tmp0 = inptr[ 0]
+ mul t2, t2, s4 // tmp2 = tmp2 * q2
+ mul t1, t1, s3 // tmp1 = tmp1 * q1
+ mul t0, t0, s2 // tmp0 = tmp0 * q0
+ lh t6, 16(a1) // z1 = inptr[ 8]
+ lh t8, 80(a1) // z3 = inptr[40]
+ lh t7, 48(a1) // z2 = inptr[24]
+ lh s2, 16(a0) // q0 = quantptr[ 8]
+ lh s4, 80(a0) // q2 = quantptr[40]
+ lh s3, 48(a0) // q1 = quantptr[24]
+ mul t2, t2, t9 // tmp2 = tmp2 * 5793
+ mul t1, t1, s0 // tmp1 = tmp1 * 10033
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
+ mul t6, t6, s2 // z1 = z1 * q0
+ mul t8, t8, s4 // z3 = z3 * q2
+ mul t7, t7, s3 // z2 = z2 * q1
+ addu t3, t0, t2 // tmp10 = tmp0 + tmp2
+ sll t2, t2, 1 // tmp2 = tmp2 << 1
+ subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
+ subu t5, t3, t1 // tmp12 = tmp10 - tmp1
+ addu t3, t3, t1 // tmp10 = tmp10 + tmp1
+ addu t1, t6, t8 // tmp1 = z1 + z3
+ mul t1, t1, s1 // tmp1 = tmp1 * 2998
+ shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
+ subu t2, t6, t8 // tmp2 = z1 - z3
+ subu t2, t2, t7 // tmp2 = tmp2 - z2
+ sll t2, t2, 2 // tmp2 = tmp2 << 2
+ addu t0, t6, t7 // tmp0 = z1 + z2
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
+ subu s2, t8, t7 // q0 = z3 - z2
+ sll s2, s2, 13 // q0 = q0 << 13
+ addu t0, t0, t1 // tmp0 = tmp0 + tmp1
+ addu t1, s2, t1 // tmp1 = q0 + tmp1
+ addu s2, t4, t2 // q0 = tmp11 + tmp2
+ subu s3, t4, t2 // q1 = tmp11 - tmp2
+ addu t6, t3, t0 // z1 = tmp10 + tmp0
+ subu t7, t3, t0 // z2 = tmp10 - tmp0
+ addu t4, t5, t1 // tmp11 = tmp12 + tmp1
+ subu t5, t5, t1 // tmp12 = tmp12 - tmp1
+ shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
+ shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
+ shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
+ shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
+ sw s2, 24(v0) // work row 1
+ sw s3, 96(v0) // work row 4
+ sw t6, 0(v0) // work row 0
+ sw t7, 120(v0) // work row 5
+ sw t4, 48(v0) // work row 2
+ sw t5, 72(v0) // work row 3
+ addiu v0, v0, 4 // next work-array column
+ addiu a1, a1, 2 // next coefficient column
+ bne v0, v1, 1b
+ addiu a0, a0, 2 // next quantization column (presumably the branch delay slot; assumes noreorder is set outside this view)
+
+ /* Pass 2: process 6 rows from work array, store into output array. */
+ move v0, sp // v0 = current work-array row
+ addiu v1, v0, 144 // v1 = end of the work array; pass 2 loop sentinel
+
+2:
+ lw t0, 0(v0) // wsptr[0]
+ lw t2, 16(v0) // wsptr[4]
+ lw s5, 0(a2) // s5 = output_buf[row]
+ addiu t0, t0, 16 // rounding term: 16 << 13 is half of the final 18-bit descale
+ sll t0, t0, 13 // tmp0 = (wsptr[0] + 16) << 13
+ mul t3, t2, t9 // tmp2 = wsptr[4] * 5793
+ lw t6, 4(v0) // z1 = wsptr[1]
+ lw t8, 20(v0) // z3 = wsptr[5]
+ lw t7, 12(v0) // z2 = wsptr[3]
+ addu s5, s5, a3 // outptr = output_buf[row] + output_col
+ addu s6, t6, t8 // z1 + z3
+ mul s6, s6, s1 // tmp1 = (z1 + z3) * 2998
+ addu t1, t0, t3 // tmp10 = tmp0 + tmp2
+ subu t4, t0, t3
+ subu t4, t4, t3 // tmp11 = tmp0 - 2 * tmp2
+ lw t3, 8(v0) // wsptr[2]
+ mul t0, t3, s0 // wsptr[2] * 10033
+ addu s7, t6, t7
+ sll s7, s7, 13
+ addu s7, s6, s7 // odd tmp0 = tmp1 + ((z1 + z2) << 13)
+ subu t2, t8, t7
+ sll t2, t2, 13
+ addu t2, s6, t2 // odd tmp2 = tmp1 + ((z3 - z2) << 13)
+ subu s6, t6, t7
+ subu s6, s6, t8
+ sll s6, s6, 13 // odd tmp1 = (z1 - z2 - z3) << 13
+ addu t3, t1, t0 // even: tmp10 + wsptr[2] * 10033
+ subu t5, t1, t0 // even: tmp10 - wsptr[2] * 10033
+ addu t6, t3, s7 // sample 0
+ subu t3, t3, s7 // sample 5
+ addu t7, t4, s6 // sample 1
+ subu t4, t4, s6 // sample 4
+ addu t8, t5, t2 // sample 2
+ subu t5, t5, t2 // sample 3
+ shll_s.w t6, t6, 6 // saturating shift left by 6, then arithmetic shift right by 24:
+ shll_s.w t3, t3, 6 // net descale by 18 bits with clamping to the signed 8-bit range
+ shll_s.w t7, t7, 6
+ shll_s.w t4, t4, 6
+ shll_s.w t8, t8, 6
+ shll_s.w t5, t5, 6
+ sra t6, t6, 24
+ addiu t6, t6, 128 // re-centre to 0..255
+ sra t3, t3, 24
+ addiu t3, t3, 128
+ sb t6, 0(s5)
+ sra t7, t7, 24
+ addiu t7, t7, 128
+ sb t3, 5(s5)
+ sra t4, t4, 24
+ addiu t4, t4, 128
+ sb t7, 1(s5)
+ sra t8, t8, 24
+ addiu t8, t8, 128
+ sb t4, 4(s5)
+ addiu v0, v0, 24 // next work-array row
+ sra t5, t5, 24
+ addiu t5, t5, 128
+ sb t8, 2(s5)
+ addiu a2, a2, 4 // next output row pointer
+ bne v0, v1, 2b
+ sb t5, 3(s5) // last store of the row (presumably the branch delay slot)
+
+ addiu sp, sp, 144 // release the work array
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace
+ *
+ * First (column) pass of the 12x12 inverse DCT.  Loops over 8 columns
+ * (a3 counts down from 8).  For each column it dequantizes the 8 input
+ * rows (byte offsets +0, +16, ... +112), and stores 12 32-bit results into
+ * the workspace, one per workspace row (row stride 32 bytes, offsets
+ * 0, 32, ... 352).  Results are shifted right by 11 bits; rounding comes
+ * from the 1024 added to the row-0 term.
+ *
+ * The inline names (z1..z4, tmp10..tmp15, tmp20..tmp25) are the author's
+ * labels for intermediate values; several registers are reused under more
+ * than one label.
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 8 // column counter
+
+1:
+ // odd part
+ lh t0, 48(a1) // inptr row 3
+ lh t1, 48(a0) // quantptr row 3
+ lh t2, 16(a1) // inptr row 1
+ lh t3, 16(a0) // quantptr row 1
+ lh t4, 80(a1) // inptr row 5
+ lh t5, 80(a0) // quantptr row 5
+ lh t6, 112(a1) // inptr row 7
+ lh t7, 112(a0) // quantptr row 7
+ mul t0, t0, t1 // z2
+ mul t1, t2, t3 // z1
+ mul t2, t4, t5 // z3
+ mul t3, t6, t7 // z4
+ li t4, 10703 // FIX(1.306562965)
+ li t5, 4433 // FIX_0_541196100
+ li t6, 7053 // FIX(0.860918669)
+ mul t4, t0, t4 // tmp11
+ mul t5, t0, t5 // -tmp14
+ addu t7, t1, t2 // tmp10
+ addu t8, t7, t3 // tmp10 + z4
+ mul t6, t6, t8 // tmp15
+ li t8, 2139 // FIX(0.261052384)
+ mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
+ li t7, 2295 // FIX(0.280143716)
+ mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
+ addu t9, t2, t3 // z3 + z4
+ li s0, 8565 // FIX(1.045510580)
+ mul t9, t9, s0 // -tmp13
+ li s0, 12112 // FIX(1.478575242)
+ mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
+ li s1, 12998 // FIX(1.586706681)
+ mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
+ li s2, 5540 // FIX(0.676326758)
+ mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
+ li s3, 16244 // FIX(1.982889723)
+ mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
+ subu t1, t1, t3 // z1-=z4
+ subu t0, t0, t2 // z2-=z3
+ addu t2, t0, t1 // z1+z2
+ li t3, 4433 // FIX_0_541196100
+ mul t2, t2, t3 // z3
+ li t3, 6270 // FIX_0_765366865
+ mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
+ li t3, 15137 // FIX_1_847759065
+ mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
+ addu t8, t6, t8 // tmp12
+ addu t3, t8, t4 // tmp12 + tmp11
+ addu t3, t3, t7 // tmp10
+ subu t8, t8, t9 // tmp12 + tmp13
+ addu s0, t5, s0
+ subu t8, t8, s0 // tmp12
+ subu t9, t6, t9
+ subu s1, s1, t4
+ addu t9, t9, s1 // tmp13
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 // tmp15
+ // even part start
+ lh t4, 64(a1) // inptr row 4
+ lh t5, 64(a0) // quantptr row 4
+ lh t7, 32(a1) // inptr row 2
+ lh s0, 32(a0) // quantptr row 2
+ lh s1, 0(a1) // inptr row 0
+ lh s2, 0(a0) // quantptr row 0
+ lh s3, 96(a1) // inptr row 6
+ lh v0, 96(a0) // quantptr row 6
+ mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
+ mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
+ mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
+ mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
+ // odd part end
+ addu t1, t2, t1 // tmp11
+ subu t0, t2, t0 // tmp14
+ // update counter and pointers
+ addiu a3, a3, -1
+ addiu a0, a0, 2
+ addiu a1, a1, 2
+ // even part rest
+ li s1, 10033
+ li s2, 11190
+ mul t4, t4, s1 // z4 = row 4 * 10033
+ mul s1, t5, s2 // row 2 * 11190
+ sll t5, t5, 13 // z1
+ sll t7, t7, 13
+ addiu t7, t7, 1024 // z3
+ sll s0, s0, 13 // z2
+ addu s2, t7, t4 // tmp10
+ subu t4, t7, t4 // tmp11
+ subu s3, t5, s0 // tmp12
+ addu t2, t7, s3 // tmp21
+ subu s3, t7, s3 // tmp24
+ addu t7, s1, s0 // tmp12
+ addu v0, s2, t7 // tmp20
+ subu s2, s2, t7 // tmp25
+ subu s1, s1, t5 // z4 - z1
+ subu s1, s1, s0 // tmp12
+ addu s0, t4, s1 // tmp22
+ subu t4, t4, s1 // tmp23
+ // final output stage
+ addu t5, v0, t3 // tmp20 + tmp10
+ subu v0, v0, t3 // tmp20 - tmp10
+ addu t3, t2, t1 // tmp21 + tmp11
+ subu t2, t2, t1 // tmp21 - tmp11
+ addu t1, s0, t8 // tmp22 + tmp12
+ subu s0, s0, t8 // tmp22 - tmp12
+ addu t8, t4, t9 // tmp23 + tmp13
+ subu t4, t4, t9 // tmp23 - tmp13
+ addu t9, s3, t0 // tmp24 + tmp14
+ subu s3, s3, t0 // tmp24 - tmp14
+ addu t0, s2, t6 // tmp25 + tmp15
+ subu s2, s2, t6 // tmp25 - tmp15
+ sra t5, t5, 11
+ sra t3, t3, 11
+ sra t1, t1, 11
+ sra t8, t8, 11
+ sra t9, t9, 11
+ sra t0, t0, 11
+ sra s2, s2, 11
+ sra s3, s3, 11
+ sra t4, t4, 11
+ sra s0, s0, 11
+ sra t2, t2, 11
+ sra v0, v0, 11
+ sw t5, 0(a2) // workspace row 0
+ sw t3, 32(a2) // workspace row 1
+ sw t1, 64(a2) // workspace row 2
+ sw t8, 96(a2) // workspace row 3
+ sw t9, 128(a2) // workspace row 4
+ sw t0, 160(a2) // workspace row 5
+ sw s2, 192(a2) // workspace row 6
+ sw s3, 224(a2) // workspace row 7
+ sw t4, 256(a2) // workspace row 8
+ sw s0, 288(a2) // workspace row 9
+ sw t2, 320(a2) // workspace row 10
+ sw v0, 352(a2) // workspace row 11
+ bgtz a3, 1b
+ addiu a2, a2, 4 // next workspace column (presumably the branch delay slot; assumes noreorder is set outside this view)
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace
+ * a1 = output
+ *
+ * Second (row) pass of the 12x12 inverse DCT.  Loops over 12 workspace
+ * rows (a3 counts down from 12; row stride 32 bytes).  For each row it
+ * reads eight 32-bit values, computes 12 samples and stores them as bytes
+ * through the pointer loaded from 0(a1); a1 then advances by 4 to the next
+ * row pointer.  No column offset is added to that pointer here.
+ *
+ * Each sample is shifted left by 6 in total (the last 2 bits with
+ * saturation), shifted right by 24, and offset by 0x80 before the byte
+ * store; 0x10 is added to wsptr[0] beforehand as the rounding term.
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 12 // row counter
+
+1:
+ // Odd part
+ lw t0, 12(a0) // z2 = wsptr[3]
+ lw t1, 4(a0) // z1 = wsptr[1]
+ lw t2, 20(a0) // z3 = wsptr[5]
+ lw t3, 28(a0) // z4 = wsptr[7]
+ li t4, 10703 // FIX(1.306562965)
+ li t5, 4433 // FIX_0_541196100
+ mul t4, t0, t4 // tmp11
+ mul t5, t0, t5 // -tmp14
+ addu t6, t1, t2 // tmp10
+ li t7, 2139 // FIX(0.261052384)
+ mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
+ addu t6, t6, t3 // tmp10 + z4
+ li t8, 7053 // FIX(0.860918669)
+ mul t6, t6, t8 // tmp15
+ li t8, 2295 // FIX(0.280143716)
+ mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
+ addu t9, t2, t3 // z3 + z4
+ li s0, 8565 // FIX(1.045510580)
+ mul t9, t9, s0 // -tmp13
+ li s0, 12112 // FIX(1.478575242)
+ mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
+ li s1, 12998 // FIX(1.586706681)
+ mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
+ li s2, 5540 // FIX(0.676326758)
+ mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
+ li s3, 16244 // FIX(1.982889723)
+ mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
+ subu t1, t1, t3 // z1 -= z4
+ subu t0, t0, t2 // z2 -= z3
+ addu t2, t1, t0 // z1 + z2
+ li t3, 4433 // FIX_0_541196100
+ mul t2, t2, t3 // z3
+ li t3, 6270 // FIX_0_765366865
+ mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
+ li t3, 15137 // FIX_1_847759065
+ mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
+ addu t3, t6, t7 // tmp12
+ addu t7, t3, t4
+ addu t7, t7, t8 // tmp10
+ subu t3, t3, t9
+ subu t3, t3, t5
+ subu t3, t3, s0 // tmp12
+ subu t9, t6, t9
+ subu t9, t9, t4
+ addu t9, t9, s1 // tmp13
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 // tmp15
+ addu t1, t2, t1 // tmp11
+ subu t0, t2, t0 // tmp14
+ // even part
+ lw t2, 16(a0) // z4
+ lw t4, 8(a0) // z1
+ lw t5, 0(a0) // z3
+ lw t8, 24(a0) // z2
+ li s0, 10033 // FIX(1.224744871)
+ li s1, 11190 // FIX(1.366025404)
+ mul t2, t2, s0 // z4
+ mul s0, t4, s1 // z1 * 11190
+ addiu t5, t5, 0x10 // rounding term added before the shift
+ sll t5, t5, 13 // z3
+ sll t4, t4, 13 // z1
+ sll t8, t8, 13 // z2
+ subu s1, t4, t8 // tmp12
+ addu s2, t5, t2 // tmp10
+ subu t2, t5, t2 // tmp11
+ addu s3, t5, s1 // tmp21
+ subu s1, t5, s1 // tmp24
+ addu t5, s0, t8 // tmp12
+ addu v0, s2, t5 // tmp20
+ subu t5, s2, t5 // tmp25
+ subu t4, s0, t4
+ subu t4, t4, t8 // tmp12
+ addu t8, t2, t4 // tmp22
+ subu t2, t2, t4 // tmp23
+ // increment counter and pointers
+ addiu a3, a3, -1
+ addiu a0, a0, 32
+ // Final stage
+ addu t4, v0, t7 // sample 0 = tmp20 + tmp10
+ subu v0, v0, t7 // sample 11 = tmp20 - tmp10
+ addu t7, s3, t1 // sample 1 = tmp21 + tmp11
+ subu s3, s3, t1 // sample 10 = tmp21 - tmp11
+ addu t1, t8, t3 // sample 2 = tmp22 + tmp12
+ subu t8, t8, t3 // sample 9 = tmp22 - tmp12
+ addu t3, t2, t9 // sample 3 = tmp23 + tmp13
+ subu t2, t2, t9 // sample 8 = tmp23 - tmp13
+ addu t9, s1, t0 // sample 4 = tmp24 + tmp14
+ subu s1, s1, t0 // sample 7 = tmp24 - tmp14
+ addu t0, t5, t6 // sample 5 = tmp25 + tmp15
+ subu t5, t5, t6 // sample 6 = tmp25 - tmp15
+ sll t4, t4, 4 // plain shift left by 4 ...
+ sll t7, t7, 4
+ sll t1, t1, 4
+ sll t3, t3, 4
+ sll t9, t9, 4
+ sll t0, t0, 4
+ sll t5, t5, 4
+ sll s1, s1, 4
+ sll t2, t2, 4
+ sll t8, t8, 4
+ sll s3, s3, 4
+ sll v0, v0, 4
+ shll_s.w t4, t4, 2 // ... then saturating shift left by 2 (6 bits in total)
+ shll_s.w t7, t7, 2
+ shll_s.w t1, t1, 2
+ shll_s.w t3, t3, 2
+ shll_s.w t9, t9, 2
+ shll_s.w t0, t0, 2
+ shll_s.w t5, t5, 2
+ shll_s.w s1, s1, 2
+ shll_s.w t2, t2, 2
+ shll_s.w t8, t8, 2
+ shll_s.w s3, s3, 2
+ shll_s.w v0, v0, 2
+ srl t4, t4, 24 // keep the top byte (logical shift; only the low 8 bits are stored below)
+ srl t7, t7, 24
+ srl t1, t1, 24
+ srl t3, t3, 24
+ srl t9, t9, 24
+ srl t0, t0, 24
+ srl t5, t5, 24
+ srl s1, s1, 24
+ srl t2, t2, 24
+ srl t8, t8, 24
+ srl s3, s3, 24
+ srl v0, v0, 24
+ lw t6, 0(a1) // t6 = output row pointer
+ addiu t4, t4, 0x80 // add 0x80 to re-centre; sb keeps the low byte
+ addiu t7, t7, 0x80
+ addiu t1, t1, 0x80
+ addiu t3, t3, 0x80
+ addiu t9, t9, 0x80
+ addiu t0, t0, 0x80
+ addiu t5, t5, 0x80
+ addiu s1, s1, 0x80
+ addiu t2, t2, 0x80
+ addiu t8, t8, 0x80
+ addiu s3, s3, 0x80
+ addiu v0, v0, 0x80
+ sb t4, 0(t6)
+ sb t7, 1(t6)
+ sb t1, 2(t6)
+ sb t3, 3(t6)
+ sb t9, 4(t6)
+ sb t0, 5(t6)
+ sb t5, 6(t6)
+ sb s1, 7(t6)
+ sb t2, 8(t6)
+ sb t8, 9(t6)
+ sb s3, 10(t6)
+ sb v0, 11(t6)
+ bgtz a3, 1b
+ addiu a1, a1, 4 // next output row pointer (presumably the branch delay slot; assumes noreorder is set outside this view)
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ jr ra
+ nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/* Reads 8 rows of 8 unsigned bytes, subtracts 128 from each and stores them as 16-bit halfwords.
+ * a0 = sample_data: table of 4-byte row pointers, entries at offsets 0..28 are read
+ * a1 = start_col: byte offset added to each row pointer before loading
+ * a2 = workspace: output, 128 bytes written (16 bytes per row)
+ */
+ lw t0, 0(a0) /* t0 = row pointer 0 */
+ li t7, 0xff80ff80 /* 0xff80 (-128 as a 16-bit value) in both halfwords */
+ addu t0, t0, a1 /* t0 = row 0 + start_col */
+ ulw t1, 0(t0) /* unaligned load, row 0 bytes 0-3 */
+ ulw t2, 4(t0) /* unaligned load, row 0 bytes 4-7 */
+ preceu.ph.qbr t3, t1 /* zero-extend the two low-order bytes of t1 to halfwords */
+ preceu.ph.qbl t4, t1 /* zero-extend the two high-order bytes of t1 to halfwords */
+ lw t0, 4(a0) /* t0 = row pointer 1, fetched while row 0 is still being processed */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7 /* per-halfword add of 0xff80, i.e. subtract 128 from each sample */
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0) /* row 1 bytes; t1/t2 are free because row 0 is already unpacked */
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 0(a2) /* row 0 results: workspace bytes 0-15 */
+ usw t4, 4(a2)
+ preceu.ph.qbr t3, t1 /* start unpacking row 1 */
+ preceu.ph.qbl t4, t1
+ usw t5, 8(a2)
+ usw t6, 12(a2)
+
+ lw t0, 8(a0) /* t0 = row pointer 2 */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 16(a2) /* row 1 results: workspace bytes 16-31 */
+ usw t4, 20(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 24(a2)
+ usw t6, 28(a2)
+
+ lw t0, 12(a0) /* t0 = row pointer 3 */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 32(a2) /* row 2 results: workspace bytes 32-47 */
+ usw t4, 36(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 40(a2)
+ usw t6, 44(a2)
+
+ lw t0, 16(a0) /* t0 = row pointer 4 */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 48(a2) /* row 3 results: workspace bytes 48-63 */
+ usw t4, 52(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 56(a2)
+ usw t6, 60(a2)
+
+ lw t0, 20(a0) /* t0 = row pointer 5 */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 64(a2) /* row 4 results: workspace bytes 64-79 */
+ usw t4, 68(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 72(a2)
+ usw t6, 76(a2)
+
+ lw t0, 24(a0) /* t0 = row pointer 6 */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 80(a2) /* row 5 results: workspace bytes 80-95 */
+ usw t4, 84(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 88(a2)
+ usw t6, 92(a2)
+
+ lw t0, 28(a0) /* t0 = row pointer 7 (last row) */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 96(a2) /* row 6 results: workspace bytes 96-111 */
+ usw t4, 100(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 104(a2)
+ usw t6, 108(a2)
+ preceu.ph.qbr t5, t2 /* no further row to fetch: finish row 7 */
+ preceu.ph.qbl t6, t2
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 112(a2) /* row 7 results: workspace bytes 112-127 */
+ usw t4, 116(a2)
+ usw t5, 120(a2)
+ usw t6, 124(a2)
+
+ j ra /* return to the caller */
+ nop /* branch delay slot (noreorder is in effect) */
+
+END(jsimd_convsamp_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/* Reads 8 rows of 8 unsigned bytes, subtracts 128 from each and stores them as single-precision floats.
+ * a0 = sample_data: table of 4-byte row pointers, entries at offsets 0..28 are read
+ * a1 = start_col: byte offset added to each row pointer before loading
+ * a2 = workspace: output, 256 bytes written (8 floats = 32 bytes per row)
+ */
+ .set at /* re-enable assembler use of AT (the LEAF macros set noat) */
+
+ lw t0, 0(a0) /* t0 = row pointer 0 */
+ addu t0, t0, a1 /* t0 = row 0 + start_col */
+ lbu t1, 0(t0) /* load the 8 samples of the row as unsigned bytes */
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128 /* subtract 128 from each sample */
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2 /* move the integers into floating-point registers */
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2 /* convert 32-bit integer to single-precision float */
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 4(a0) /* t0 = row pointer 1, fetched before row 0 is stored */
+ swc1 f2, 0(a2) /* row 0 results: workspace bytes 0-31 */
+ swc1 f4, 4(a2)
+ swc1 f6, 8(a2)
+ addu t0, t0, a1 /* t0 = row 1 + start_col */
+ swc1 f8, 12(a2)
+ swc1 f10, 16(a2)
+ swc1 f12, 20(a2)
+ swc1 f14, 24(a2)
+ swc1 f16, 28(a2)
+ // elemr 1: row 1, same load / subtract 128 / convert / store sequence as row 0
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 8(a0) /* t0 = row pointer 2 */
+ swc1 f2, 32(a2) /* row 1 results: workspace bytes 32-63 */
+ swc1 f4, 36(a2)
+ swc1 f6, 40(a2)
+ addu t0, t0, a1
+ swc1 f8, 44(a2)
+ swc1 f10, 48(a2)
+ swc1 f12, 52(a2)
+ swc1 f14, 56(a2)
+ swc1 f16, 60(a2)
+ // elemr 2: row 2
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 12(a0) /* t0 = row pointer 3 */
+ swc1 f2, 64(a2) /* row 2 results: workspace bytes 64-95 */
+ swc1 f4, 68(a2)
+ swc1 f6, 72(a2)
+ addu t0, t0, a1
+ swc1 f8, 76(a2)
+ swc1 f10, 80(a2)
+ swc1 f12, 84(a2)
+ swc1 f14, 88(a2)
+ swc1 f16, 92(a2)
+ // elemr 3: row 3
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 16(a0) /* t0 = row pointer 4 */
+ swc1 f2, 96(a2) /* row 3 results: workspace bytes 96-127 */
+ swc1 f4, 100(a2)
+ swc1 f6, 104(a2)
+ addu t0, t0, a1
+ swc1 f8, 108(a2)
+ swc1 f10, 112(a2)
+ swc1 f12, 116(a2)
+ swc1 f14, 120(a2)
+ swc1 f16, 124(a2)
+ // elemr 4: row 4
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 20(a0) /* t0 = row pointer 5 */
+ swc1 f2, 128(a2) /* row 4 results: workspace bytes 128-159 */
+ swc1 f4, 132(a2)
+ swc1 f6, 136(a2)
+ addu t0, t0, a1
+ swc1 f8, 140(a2)
+ swc1 f10, 144(a2)
+ swc1 f12, 148(a2)
+ swc1 f14, 152(a2)
+ swc1 f16, 156(a2)
+ // elemr 5: row 5
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 24(a0) /* t0 = row pointer 6 */
+ swc1 f2, 160(a2) /* row 5 results: workspace bytes 160-191 */
+ swc1 f4, 164(a2)
+ swc1 f6, 168(a2)
+ addu t0, t0, a1
+ swc1 f8, 172(a2)
+ swc1 f10, 176(a2)
+ swc1 f12, 180(a2)
+ swc1 f14, 184(a2)
+ swc1 f16, 188(a2)
+ // elemr 6: row 6
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 28(a0) /* t0 = row pointer 7 (last row) */
+ swc1 f2, 192(a2) /* row 6 results: workspace bytes 192-223 */
+ swc1 f4, 196(a2)
+ swc1 f6, 200(a2)
+ addu t0, t0, a1
+ swc1 f8, 204(a2)
+ swc1 f10, 208(a2)
+ swc1 f12, 212(a2)
+ swc1 f14, 216(a2)
+ swc1 f16, 220(a2)
+ // elemr 7: row 7 (no further row pointer is fetched)
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ swc1 f2, 224(a2) /* row 7 results: workspace bytes 224-255 */
+ swc1 f4, 228(a2)
+ swc1 f6, 232(a2)
+ swc1 f8, 236(a2)
+ swc1 f10, 240(a2)
+ swc1 f12, 244(a2)
+ swc1 f14, 248(a2)
+ swc1 f16, 252(a2)
+
+ j ra /* return to the caller */
+ nop /* branch delay slot (noreorder is in effect) */
+
+END(jsimd_convsamp_float_dspr2)
+
+/*****************************************************************************/
diff --git a/simd/mips/jsimd_dspr2_asm.h b/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000..12cfda4
--- /dev/null
+++ b/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
+ * Darko Laus (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0 /* symbolic names for the general-purpose registers $0-$31 */
+#define AT $1
+#define v0 $2
+#define v1 $3
+#define a0 $4
+#define a1 $5
+#define a2 $6
+#define a3 $7
+#define t0 $8
+#define t1 $9
+#define t2 $10
+#define t3 $11
+#define t4 $12
+#define t5 $13
+#define t6 $14
+#define t7 $15
+#define s0 $16
+#define s1 $17
+#define s2 $18
+#define s3 $19
+#define s4 $20
+#define s5 $21
+#define s6 $22
+#define s7 $23
+#define t8 $24
+#define t9 $25
+#define k0 $26
+#define k1 $27
+#define gp $28
+#define sp $29
+#define fp $30
+#define s8 $30 /* second name for $30: same register as fp */
+#define ra $31
+
+#define f0 $f0 /* f0-f31: names for the floating-point registers $f0-$f31 */
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#ifdef __ELF__ /* the .hidden directive is only emitted when targeting ELF */
+#define HIDDEN_SYMBOL(symbol) .hidden symbol; /* mark symbol with hidden visibility */
+#else
+#define HIDDEN_SYMBOL(symbol) /* expands to nothing on non-ELF targets */
+#endif
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2: exports the symbol (hidden on ELF), emits its label with a zero-size frame, saves the assembler options with .set push and then selects mips32r2, noreorder and noat. Close the routine with END, which pops the options.
+ */
+#define LEAF_MIPS32R2(symbol) \
+ .globl symbol; \
+ HIDDEN_SYMBOL(symbol) \
+ .align 2; \
+ .type symbol, @function; \
+ .ent symbol, 0; \
+symbol: \
+ .frame sp, 0, ra; \
+ .set push; \
+ .set arch = mips32r2; \
+ .set noreorder; \
+ .set noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2: everything LEAF_MIPS32R2 does, followed by .set dspr2 so that DSPr2 instructions are accepted in the routine body.
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+ .set dspr2;
+
+/*
+ * END - mark end of function: restores the assembler options saved by the .set push in LEAF_MIPS32R2 / LEAF_DSPR2, closes the .ent block and records the symbol size (current location minus the function label).
+ */
+#define END(function) \
+ .set pop; \
+ .end function; \
+ .size function, .-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * registers to/from the stack. Storing the registers takes regs_num * 4
+ * bytes, but since the MIPS ABI allows usage of the first 16 bytes of the
+ * stack frame (this space is reserved for the input arguments of the
+ * function, which are already held in a0-a3), the offset only has to be at
+ * least regs_num * 4 - 16 bytes. A smaller offset stops assembly with an error.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves a set of registers on the stack. The maximum number of registers
+ * that can be saved is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is the number of bytes subtracted from the stack pointer (sp)
+ * before the registers are stored, in order to provide enough space on the
+ * stack (offset must be a non-negative multiple of 4, and must be big enough,
+ * as described by CHECK_STACK_OFFSET macro). This macro is intended to be
+ * used in combination with RESTORE_REGS_FROM_STACK macro. Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) /* negative, or remainder of division by 4 is non-zero */
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, -\stack_offset /* move sp down by stack_offset bytes */
+.endif
+ sw \r1, 0(sp) /* r1 is mandatory and always stored */
+.if \r2 != 0 /* r2-r14 default to 0 and are skipped when left out */
+ sw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ sw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ sw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset /* from the fifth register on, the offset is validated first */
+ sw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ sw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ sw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ sw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ sw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ sw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ sw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ sw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ sw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ sw \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores a set of registers from the stack. The maximum number of registers
+ * that can be restored is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is the number of bytes that are added to the stack pointer (sp)
+ * after the registers are restored (offset must be a non-negative multiple of
+ * 4, and must be big enough, as described by CHECK_STACK_OFFSET macro). This
+ * macro is intended to be used in combination with SAVE_REGS_ON_STACK macro.
+ * Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) /* negative, or remainder of division by 4 is non-zero */
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+ lw \r1, 0(sp) /* r1 is mandatory and always reloaded */
+.if \r2 != 0 /* r2-r14 default to 0 and are skipped when left out */
+ lw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ lw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ lw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset /* from the fifth register on, the offset is validated first */
+ lw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ lw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ lw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ lw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ lw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ lw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ lw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ lw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ lw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ lw \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, \stack_offset /* move sp back up by stack_offset bytes */
+.endif
+.endm