ARM32 NEON SIMD implementation of Huffman encoding
Full-color compression speedups relative to libjpeg-turbo 1.4.2:
800 MHz ARM Cortex-A9, iOS, 32-bit: 26-44% (avg. 32%)
Refer to #42 and #47 for discussion.
This commit also removes the unnecessary
    if (simd_support & JSIMD_ARM_NEON)
statements from the jsimd* algorithm functions. Since the jsimd_can*()
functions check for the existence of NEON, the corresponding algorithm
functions will never be called if NEON isn't available. Removing those
if statements improved performance across the board by a couple of
percent.
Based on:
https://github.com/mayeut/libjpeg-turbo/commit/fc023c880ce1d6c908fb78ccc25f5d5fd910ccc5
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index e715291..635cbd7 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -2,8 +2,8 @@
* jsimd_arm.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
- * Copyright 2015 Matthieu Darbois
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -228,8 +228,7 @@
break;
}
- if (simd_support & JSIMD_ARM_NEON)
- neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
@@ -274,8 +273,7 @@
break;
}
- if (simd_support & JSIMD_ARM_NEON)
- neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
GLOBAL(void)
@@ -283,9 +281,8 @@
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
- output_buf, num_rows);
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
}
GLOBAL(int)
@@ -387,10 +384,9 @@
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
}
GLOBAL(int)
@@ -458,8 +454,7 @@
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM * workspace)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_convsamp_neon(sample_data, start_col, workspace);
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
}
GLOBAL(void)
@@ -509,8 +504,7 @@
GLOBAL(void)
jsimd_fdct_ifast (DCTELEM * data)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_fdct_ifast_neon(data);
+ jsimd_fdct_ifast_neon(data);
}
GLOBAL(void)
@@ -549,8 +543,7 @@
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
DCTELEM * workspace)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_quantize_neon(coef_block, divisors, workspace);
+ jsimd_quantize_neon(coef_block, divisors, workspace);
}
GLOBAL(void)
@@ -610,9 +603,8 @@
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
}
GLOBAL(void)
@@ -620,9 +612,8 @@
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
}
GLOBAL(int)
@@ -686,9 +677,8 @@
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
}
GLOBAL(void)
@@ -696,9 +686,8 @@
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
- if (simd_support & JSIMD_ARM_NEON)
- jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
}
GLOBAL(void)
@@ -711,6 +700,16 @@
GLOBAL(int)
jsimd_can_huff_encode_one_block (void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ARM_NEON)
+ return 1;
+
return 0;
}
@@ -719,5 +718,6 @@
int last_dc_val, c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
- return NULL;
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
}