blob: de6fadddf5b071fc02a9200729f365d7fce316c1 [file] [log] [blame]
hbono@chromium.org98626972011-08-03 03:13:08 +00001Index: jdmarker.c
hbono@chromium.org920a8a92010-11-22 09:17:38 +00002===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00003--- jdmarker.c (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00004+++ jdmarker.c (working copy)
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00005@@ -910,7 +910,7 @@
hbono@chromium.org538d9fd2011-08-15 06:52:21 +00006 }
7
8 if (cinfo->marker->discarded_bytes != 0) {
9- WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
10+ TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
11 cinfo->marker->discarded_bytes = 0;
12 }
13
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +000014@@ -944,7 +944,144 @@
hbono@chromium.org98626972011-08-03 03:13:08 +000015 return TRUE;
16 }
hbono@chromium.org920a8a92010-11-22 09:17:38 +000017
hbono@chromium.org98626972011-08-03 03:13:08 +000018+#ifdef MOTION_JPEG_SUPPORTED
hbono@chromium.org920a8a92010-11-22 09:17:38 +000019
hbono@chromium.org98626972011-08-03 03:13:08 +000020+/* The default Huffman tables used by motion JPEG frames. When a motion JPEG
21+ * frame does not have DHT tables, we should use the huffman tables suggested by
22+ * the JPEG standard. Each of these tables represents a member of the JHUFF_TBL
23+ * struct so we can just copy it to the corresponding JHUFF_TBL member.
24+ */
25+/* DC table 0 */
26+LOCAL(const unsigned char) mjpg_dc0_bits[] = {
27+ 0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01,
28+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
29+};
30+
31+LOCAL(const unsigned char) mjpg_dc0_huffval[] = {
32+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
33+ 0x08, 0x09, 0x0A, 0x0B
34+};
35+
36+/* DC table 1 */
37+LOCAL(const unsigned char) mjpg_dc1_bits[] = {
38+ 0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
39+ 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00
40+};
41+
42+LOCAL(const unsigned char) mjpg_dc1_huffval[] = {
43+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
44+ 0x08, 0x09, 0x0A, 0x0B
45+};
46+
47+/* AC table 0 */
48+LOCAL(const unsigned char) mjpg_ac0_bits[] = {
49+ 0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03,
50+ 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D
51+};
52+
53+LOCAL(const unsigned char) mjpg_ac0_huffval[] = {
54+ 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
55+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
56+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08,
57+ 0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0,
58+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
59+ 0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28,
60+ 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
61+ 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
62+ 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
63+ 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
64+ 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
65+ 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
66+ 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
67+ 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
68+ 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
69+ 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
70+ 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4,
71+ 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2,
72+ 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
73+ 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
74+ 0xF9, 0xFA
75+};
76+
77+/* AC table 1 */
78+LOCAL(const unsigned char) mjpg_ac1_bits[] = {
79+ 0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04,
80+ 0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77
81+};
82+
83+LOCAL(const unsigned char) mjpg_ac1_huffval[] = {
84+ 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
85+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
86+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
87+ 0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0,
88+ 0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34,
89+ 0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26,
90+ 0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38,
91+ 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
92+ 0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
93+ 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
94+ 0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
95+ 0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
96+ 0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96,
97+ 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
98+ 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4,
99+ 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3,
100+ 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2,
101+ 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
102+ 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
103+ 0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
104+ 0xF9, 0xFA
105+};
106+
107+/* Loads the default Huffman tables used by motion JPEG frames. This function
108+ * just copies the huffman tables suggested in the JPEG standard into each
109+ * table slot that has not been loaded yet.
110+ */
111+LOCAL(void)
112+mjpg_load_huff_tables (j_decompress_ptr cinfo)
113+{
114+ JHUFF_TBL *htblptr;
115+
116+ if (! cinfo->dc_huff_tbl_ptrs[0]) {
117+ htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
118+ MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
119+ MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits));
120+ MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval));
121+ cinfo->dc_huff_tbl_ptrs[0] = htblptr;
122+ }
123+
124+ if (! cinfo->dc_huff_tbl_ptrs[1]) {
125+ htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
126+ MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
127+ MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits));
128+ MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval));
129+ cinfo->dc_huff_tbl_ptrs[1] = htblptr;
130+ }
131+
132+ if (! cinfo->ac_huff_tbl_ptrs[0]) {
133+ htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
134+ MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
135+ MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits));
136+ MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval));
137+ cinfo->ac_huff_tbl_ptrs[0] = htblptr;
138+ }
139+
140+ if (! cinfo->ac_huff_tbl_ptrs[1]) {
141+ htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
142+ MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
143+ MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits));
144+ MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval));
145+ cinfo->ac_huff_tbl_ptrs[1] = htblptr;
146+ }
147+}
148+
149+#else
150+
151+#define mjpg_load_huff_tables(cinfo)
152+
153+#endif /* MOTION_JPEG_SUPPORTED */
154+
155+
hbono@chromium.org920a8a92010-11-22 09:17:38 +0000156 /*
hbono@chromium.org98626972011-08-03 03:13:08 +0000157 * Read markers until SOS or EOI.
158 *
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000159@@ -1013,6 +1150,7 @@
hbono@chromium.org98626972011-08-03 03:13:08 +0000160 break;
161
162 case M_SOS:
163+ mjpg_load_huff_tables(cinfo);
164 if (! get_sos(cinfo))
165 return JPEG_SUSPENDED;
166 cinfo->unread_marker = 0; /* processed the marker */
hbono@chromium.org5c4dda92011-08-03 05:14:47 +0000167Index: jmorecfg.h
168===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000169--- jmorecfg.h (revision 829)
hbono@chromium.org5c4dda92011-08-03 05:14:47 +0000170+++ jmorecfg.h (working copy)
171@@ -153,14 +153,18 @@
172 /* INT16 must hold at least the values -32768..32767. */
173
174 #ifndef XMD_H /* X11/xmd.h correctly defines INT16 */
175+#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT16 */
176 typedef short INT16;
177 #endif
178+#endif
179
180 /* INT32 must hold at least signed 32-bit values. */
181
182 #ifndef XMD_H /* X11/xmd.h correctly defines INT32 */
183+#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT32 */
184 typedef long INT32;
185 #endif
186+#endif
187
188 /* Datatype used for image dimensions. The JPEG standard only supports
189 * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
190@@ -210,11 +214,13 @@
191 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
192 */
193
194+#ifndef FAR
195 #ifdef NEED_FAR_POINTERS
196 #define FAR far
197 #else
198 #define FAR
199 #endif
200+#endif
201
202
203 /*
hbono@chromium.org920a8a92010-11-22 09:17:38 +0000204Index: jpeglib.h
205===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000206--- jpeglib.h (revision 829)
hbono@chromium.org920a8a92010-11-22 09:17:38 +0000207+++ jpeglib.h (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +0000208@@ -15,6 +15,10 @@
209 #ifndef JPEGLIB_H
210 #define JPEGLIB_H
hbono@chromium.org920a8a92010-11-22 09:17:38 +0000211
212+/* Begin chromium edits */
213+#include "jpeglibmangler.h"
214+/* End chromium edits */
215+
216 /*
217 * First we include the configuration files that record how this
218 * installation of the JPEG library is set up. jconfig.h can be
219Index: jpeglibmangler.h
220===================================================================
221--- jpeglibmangler.h (revision 0)
hbono@chromium.org98626972011-08-03 03:13:08 +0000222+++ jpeglibmangler.h (revision 0)
hbono@chromium.org313e0252011-04-28 09:03:50 +0000223@@ -0,0 +1,113 @@
hbono@chromium.org920a8a92010-11-22 09:17:38 +0000224+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
225+// Use of this source code is governed by a BSD-style license that can be
226+// found in the LICENSE file.
227+
228+#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
229+#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
230+
231+// Mangle all externally visible function names so we can build our own libjpeg
232+// without system libraries trying to use it.
233+
234+#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl
235+#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table
236+#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl
237+#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer
238+#define jpeg_huff_decode chromium_jpeg_huff_decode
239+#define jpeg_fdct_islow chromium_jpeg_fdct_islow
240+#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast
241+#define jpeg_fdct_float chromium_jpeg_fdct_float
242+#define jpeg_idct_islow chromium_jpeg_idct_islow
243+#define jpeg_idct_ifast chromium_jpeg_idct_ifast
244+#define jpeg_idct_float chromium_jpeg_idct_float
245+#define jpeg_idct_4x4 chromium_jpeg_idct_4x4
246+#define jpeg_idct_2x2 chromium_jpeg_idct_2x2
247+#define jpeg_idct_1x1 chromium_jpeg_idct_1x1
248+#define jinit_compress_master chromium_jinit_compress_master
249+#define jinit_c_master_control chromium_jinit_c_master_control
250+#define jinit_c_main_controller chromium_jinit_c_main_controller
251+#define jinit_c_prep_controller chromium_jinit_c_prep_controller
252+#define jinit_c_coef_controller chromium_jinit_c_coef_controller
253+#define jinit_color_converter chromium_jinit_color_converter
254+#define jinit_downsampler chromium_jinit_downsampler
255+#define jinit_forward_dct chromium_jinit_forward_dct
256+#define jinit_huff_encoder chromium_jinit_huff_encoder
257+#define jinit_phuff_encoder chromium_jinit_phuff_encoder
258+#define jinit_marker_writer chromium_jinit_marker_writer
259+#define jinit_master_decompress chromium_jinit_master_decompress
260+#define jinit_d_main_controller chromium_jinit_d_main_controller
261+#define jinit_d_coef_controller chromium_jinit_d_coef_controller
262+#define jinit_d_post_controller chromium_jinit_d_post_controller
263+#define jinit_input_controller chromium_jinit_input_controller
264+#define jinit_marker_reader chromium_jinit_marker_reader
265+#define jinit_huff_decoder chromium_jinit_huff_decoder
266+#define jinit_phuff_decoder chromium_jinit_phuff_decoder
267+#define jinit_inverse_dct chromium_jinit_inverse_dct
268+#define jinit_upsampler chromium_jinit_upsampler
269+#define jinit_color_deconverter chromium_jinit_color_deconverter
270+#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer
271+#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer
272+#define jinit_merged_upsampler chromium_jinit_merged_upsampler
273+#define jinit_memory_mgr chromium_jinit_memory_mgr
274+#define jdiv_round_up chromium_jdiv_round_up
275+#define jround_up chromium_jround_up
276+#define jcopy_sample_rows chromium_jcopy_sample_rows
277+#define jcopy_block_row chromium_jcopy_block_row
278+#define jzero_far chromium_jzero_far
279+#define jpeg_std_error chromium_jpeg_std_error
280+#define jpeg_CreateCompress chromium_jpeg_CreateCompress
281+#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress
282+#define jpeg_destroy_compress chromium_jpeg_destroy_compress
283+#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress
284+#define jpeg_stdio_dest chromium_jpeg_stdio_dest
285+#define jpeg_stdio_src chromium_jpeg_stdio_src
286+#define jpeg_set_defaults chromium_jpeg_set_defaults
287+#define jpeg_set_colorspace chromium_jpeg_set_colorspace
288+#define jpeg_default_colorspace chromium_jpeg_default_colorspace
289+#define jpeg_set_quality chromium_jpeg_set_quality
290+#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality
291+#define jpeg_add_quant_table chromium_jpeg_add_quant_table
292+#define jpeg_quality_scaling chromium_jpeg_quality_scaling
293+#define jpeg_simple_progression chromium_jpeg_simple_progression
294+#define jpeg_suppress_tables chromium_jpeg_suppress_tables
295+#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table
296+#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table
297+#define jpeg_start_compress chromium_jpeg_start_compress
298+#define jpeg_write_scanlines chromium_jpeg_write_scanlines
299+#define jpeg_finish_compress chromium_jpeg_finish_compress
300+#define jpeg_write_raw_data chromium_jpeg_write_raw_data
301+#define jpeg_write_marker chromium_jpeg_write_marker
302+#define jpeg_write_m_header chromium_jpeg_write_m_header
303+#define jpeg_write_m_byte chromium_jpeg_write_m_byte
304+#define jpeg_write_tables chromium_jpeg_write_tables
305+#define jpeg_read_header chromium_jpeg_read_header
306+#define jpeg_start_decompress chromium_jpeg_start_decompress
307+#define jpeg_read_scanlines chromium_jpeg_read_scanlines
308+#define jpeg_finish_decompress chromium_jpeg_finish_decompress
309+#define jpeg_read_raw_data chromium_jpeg_read_raw_data
310+#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans
311+#define jpeg_start_output chromium_jpeg_start_output
312+#define jpeg_finish_output chromium_jpeg_finish_output
313+#define jpeg_input_complete chromium_jpeg_input_complete
314+#define jpeg_new_colormap chromium_jpeg_new_colormap
315+#define jpeg_consume_input chromium_jpeg_consume_input
316+#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions
317+#define jpeg_save_markers chromium_jpeg_save_markers
318+#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor
319+#define jpeg_read_coefficients chromium_jpeg_read_coefficients
320+#define jpeg_write_coefficients chromium_jpeg_write_coefficients
321+#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters
322+#define jpeg_abort_compress chromium_jpeg_abort_compress
323+#define jpeg_abort_decompress chromium_jpeg_abort_decompress
324+#define jpeg_abort chromium_jpeg_abort
325+#define jpeg_destroy chromium_jpeg_destroy
326+#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart
327+#define jpeg_get_small chromium_jpeg_get_small
328+#define jpeg_free_small chromium_jpeg_free_small
329+#define jpeg_get_large chromium_jpeg_get_large
330+#define jpeg_free_large chromium_jpeg_free_large
331+#define jpeg_mem_available chromium_jpeg_mem_available
332+#define jpeg_open_backing_store chromium_jpeg_open_backing_store
333+#define jpeg_mem_init chromium_jpeg_mem_init
334+#define jpeg_mem_term chromium_jpeg_mem_term
335+
336+#endif // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
hbono@chromium.org98626972011-08-03 03:13:08 +0000337Index: simd/jcgrass2-64.asm
hbono@chromium.org68635482011-02-07 06:02:41 +0000338===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000339--- simd/jcgrass2-64.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +0000340+++ simd/jcgrass2-64.asm (working copy)
341@@ -30,7 +30,7 @@
342 SECTION SEG_CONST
hbono@chromium.org321292e2011-02-17 04:45:42 +0000343
344 alignz 16
hbono@chromium.org98626972011-08-03 03:13:08 +0000345- global EXTN(jconst_rgb_gray_convert_sse2)
346+ global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
hbono@chromium.org321292e2011-02-17 04:45:42 +0000347
hbono@chromium.org98626972011-08-03 03:13:08 +0000348 EXTN(jconst_rgb_gray_convert_sse2):
hbono@chromium.org321292e2011-02-17 04:45:42 +0000349
hbono@chromium.org68635482011-02-07 06:02:41 +0000350Index: simd/jiss2fst.asm
351===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000352--- simd/jiss2fst.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000353+++ simd/jiss2fst.asm (working copy)
354@@ -59,7 +59,7 @@
355 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
356
357 alignz 16
358- global EXTN(jconst_idct_ifast_sse2)
359+ global EXTN(jconst_idct_ifast_sse2) PRIVATE
360
361 EXTN(jconst_idct_ifast_sse2):
362
363@@ -92,7 +92,7 @@
364 %define WK_NUM 2
365
366 align 16
367- global EXTN(jsimd_idct_ifast_sse2)
368+ global EXTN(jsimd_idct_ifast_sse2) PRIVATE
369
370 EXTN(jsimd_idct_ifast_sse2):
371 push ebp
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000372Index: simd/jcclrss2-64.asm
373===================================================================
374--- simd/jcclrss2-64.asm (revision 829)
375+++ simd/jcclrss2-64.asm (working copy)
376@@ -37,7 +37,7 @@
377
378 align 16
379
380- global EXTN(jsimd_rgb_ycc_convert_sse2)
381+ global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
382
383 EXTN(jsimd_rgb_ycc_convert_sse2):
384 push rbp
hbono@chromium.org68635482011-02-07 06:02:41 +0000385Index: simd/jiss2red-64.asm
386===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000387--- simd/jiss2red-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000388+++ simd/jiss2red-64.asm (working copy)
389@@ -73,7 +73,7 @@
390 SECTION SEG_CONST
391
392 alignz 16
393- global EXTN(jconst_idct_red_sse2)
394+ global EXTN(jconst_idct_red_sse2) PRIVATE
395
396 EXTN(jconst_idct_red_sse2):
397
398@@ -114,7 +114,7 @@
399 %define WK_NUM 2
400
401 align 16
402- global EXTN(jsimd_idct_4x4_sse2)
403+ global EXTN(jsimd_idct_4x4_sse2) PRIVATE
404
405 EXTN(jsimd_idct_4x4_sse2):
406 push rbp
407@@ -413,7 +413,7 @@
408 ; r13 = JDIMENSION output_col
409
410 align 16
411- global EXTN(jsimd_idct_2x2_sse2)
412+ global EXTN(jsimd_idct_2x2_sse2) PRIVATE
413
414 EXTN(jsimd_idct_2x2_sse2):
415 push rbp
hbono@chromium.org321292e2011-02-17 04:45:42 +0000416Index: simd/ji3dnflt.asm
417===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000418--- simd/ji3dnflt.asm (revision 829)
hbono@chromium.org321292e2011-02-17 04:45:42 +0000419+++ simd/ji3dnflt.asm (working copy)
420@@ -27,7 +27,7 @@
421 SECTION SEG_CONST
422
423 alignz 16
424- global EXTN(jconst_idct_float_3dnow)
425+ global EXTN(jconst_idct_float_3dnow) PRIVATE
426
427 EXTN(jconst_idct_float_3dnow):
428
429@@ -63,7 +63,7 @@
430 ; FAST_FLOAT workspace[DCTSIZE2]
431
432 align 16
433- global EXTN(jsimd_idct_float_3dnow)
434+ global EXTN(jsimd_idct_float_3dnow) PRIVATE
435
436 EXTN(jsimd_idct_float_3dnow):
437 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +0000438Index: simd/jsimdcpu.asm
439===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000440--- simd/jsimdcpu.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000441+++ simd/jsimdcpu.asm (working copy)
442@@ -29,7 +29,7 @@
443 ;
444
445 align 16
446- global EXTN(jpeg_simd_cpu_support)
447+ global EXTN(jpeg_simd_cpu_support) PRIVATE
448
449 EXTN(jpeg_simd_cpu_support):
450 push ebx
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000451Index: simd/jdmerss2-64.asm
452===================================================================
453--- simd/jdmerss2-64.asm (revision 829)
454+++ simd/jdmerss2-64.asm (working copy)
455@@ -35,7 +35,7 @@
456 SECTION SEG_CONST
457
458 alignz 16
459- global EXTN(jconst_merged_upsample_sse2)
460+ global EXTN(jconst_merged_upsample_sse2) PRIVATE
461
462 EXTN(jconst_merged_upsample_sse2):
463
hbono@chromium.org68635482011-02-07 06:02:41 +0000464Index: simd/jdsammmx.asm
465===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000466--- simd/jdsammmx.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000467+++ simd/jdsammmx.asm (working copy)
468@@ -22,7 +22,7 @@
469 SECTION SEG_CONST
470
471 alignz 16
472- global EXTN(jconst_fancy_upsample_mmx)
473+ global EXTN(jconst_fancy_upsample_mmx) PRIVATE
474
475 EXTN(jconst_fancy_upsample_mmx):
476
477@@ -58,7 +58,7 @@
478 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
479
480 align 16
481- global EXTN(jsimd_h2v1_fancy_upsample_mmx)
482+ global EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
483
484 EXTN(jsimd_h2v1_fancy_upsample_mmx):
485 push ebp
486@@ -216,7 +216,7 @@
487 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
488
489 align 16
490- global EXTN(jsimd_h2v2_fancy_upsample_mmx)
491+ global EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
492
493 EXTN(jsimd_h2v2_fancy_upsample_mmx):
494 push ebp
495@@ -542,7 +542,7 @@
496 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
497
498 align 16
499- global EXTN(jsimd_h2v1_upsample_mmx)
500+ global EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
501
502 EXTN(jsimd_h2v1_upsample_mmx):
503 push ebp
504@@ -643,7 +643,7 @@
505 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
506
507 align 16
508- global EXTN(jsimd_h2v2_upsample_mmx)
509+ global EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
510
511 EXTN(jsimd_h2v2_upsample_mmx):
512 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +0000513Index: simd/jdmrgmmx.asm
514===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000515--- simd/jdmrgmmx.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000516+++ simd/jdmrgmmx.asm (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +0000517@@ -40,7 +40,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000518 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
519
520 align 16
521- global EXTN(jsimd_h2v1_merged_upsample_mmx)
522+ global EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE
523
524 EXTN(jsimd_h2v1_merged_upsample_mmx):
525 push ebp
hbono@chromium.org98626972011-08-03 03:13:08 +0000526@@ -409,7 +409,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000527 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
528
529 align 16
530- global EXTN(jsimd_h2v2_merged_upsample_mmx)
531+ global EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE
532
533 EXTN(jsimd_h2v2_merged_upsample_mmx):
534 push ebp
535Index: simd/jdsamss2.asm
536===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000537--- simd/jdsamss2.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000538+++ simd/jdsamss2.asm (working copy)
539@@ -22,7 +22,7 @@
540 SECTION SEG_CONST
541
542 alignz 16
543- global EXTN(jconst_fancy_upsample_sse2)
544+ global EXTN(jconst_fancy_upsample_sse2) PRIVATE
545
546 EXTN(jconst_fancy_upsample_sse2):
547
548@@ -58,7 +58,7 @@
549 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
550
551 align 16
552- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
553+ global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
554
555 EXTN(jsimd_h2v1_fancy_upsample_sse2):
556 push ebp
557@@ -214,7 +214,7 @@
558 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
559
560 align 16
561- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
562+ global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
563
564 EXTN(jsimd_h2v2_fancy_upsample_sse2):
565 push ebp
566@@ -538,7 +538,7 @@
567 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
568
569 align 16
570- global EXTN(jsimd_h2v1_upsample_sse2)
571+ global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
572
573 EXTN(jsimd_h2v1_upsample_sse2):
574 push ebp
575@@ -637,7 +637,7 @@
576 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
577
578 align 16
579- global EXTN(jsimd_h2v2_upsample_sse2)
580+ global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
581
582 EXTN(jsimd_h2v2_upsample_sse2):
583 push ebp
584Index: simd/jiss2flt-64.asm
585===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000586--- simd/jiss2flt-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000587+++ simd/jiss2flt-64.asm (working copy)
588@@ -38,7 +38,7 @@
589 SECTION SEG_CONST
590
591 alignz 16
592- global EXTN(jconst_idct_float_sse2)
593+ global EXTN(jconst_idct_float_sse2) PRIVATE
594
595 EXTN(jconst_idct_float_sse2):
596
597@@ -74,7 +74,7 @@
598 ; FAST_FLOAT workspace[DCTSIZE2]
599
600 align 16
601- global EXTN(jsimd_idct_float_sse2)
602+ global EXTN(jsimd_idct_float_sse2) PRIVATE
603
604 EXTN(jsimd_idct_float_sse2):
605 push rbp
606Index: simd/jfss2int-64.asm
607===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000608--- simd/jfss2int-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000609+++ simd/jfss2int-64.asm (working copy)
610@@ -67,7 +67,7 @@
611 SECTION SEG_CONST
612
613 alignz 16
614- global EXTN(jconst_fdct_islow_sse2)
615+ global EXTN(jconst_fdct_islow_sse2) PRIVATE
616
617 EXTN(jconst_fdct_islow_sse2):
618
619@@ -101,7 +101,7 @@
620 %define WK_NUM 6
621
622 align 16
623- global EXTN(jsimd_fdct_islow_sse2)
624+ global EXTN(jsimd_fdct_islow_sse2) PRIVATE
625
626 EXTN(jsimd_fdct_islow_sse2):
627 push rbp
628Index: simd/jcqnts2f.asm
629===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000630--- simd/jcqnts2f.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000631+++ simd/jcqnts2f.asm (working copy)
632@@ -35,7 +35,7 @@
633 %define workspace ebp+16 ; FAST_FLOAT * workspace
634
635 align 16
636- global EXTN(jsimd_convsamp_float_sse2)
637+ global EXTN(jsimd_convsamp_float_sse2) PRIVATE
638
639 EXTN(jsimd_convsamp_float_sse2):
640 push ebp
641@@ -115,7 +115,7 @@
642 %define workspace ebp+16 ; FAST_FLOAT * workspace
643
644 align 16
645- global EXTN(jsimd_quantize_float_sse2)
646+ global EXTN(jsimd_quantize_float_sse2) PRIVATE
647
648 EXTN(jsimd_quantize_float_sse2):
649 push ebp
650Index: simd/jdmrgss2.asm
651===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000652--- simd/jdmrgss2.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000653+++ simd/jdmrgss2.asm (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +0000654@@ -40,7 +40,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000655 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
656
657 align 16
658- global EXTN(jsimd_h2v1_merged_upsample_sse2)
659+ global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
660
661 EXTN(jsimd_h2v1_merged_upsample_sse2):
662 push ebp
hbono@chromium.org0ec930e2012-01-18 07:01:04 +0000663@@ -560,7 +560,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000664 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
665
666 align 16
667- global EXTN(jsimd_h2v2_merged_upsample_sse2)
668+ global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
669
670 EXTN(jsimd_h2v2_merged_upsample_sse2):
671 push ebp
672Index: simd/jfmmxint.asm
673===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000674--- simd/jfmmxint.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000675+++ simd/jfmmxint.asm (working copy)
676@@ -66,7 +66,7 @@
677 SECTION SEG_CONST
678
679 alignz 16
680- global EXTN(jconst_fdct_islow_mmx)
681+ global EXTN(jconst_fdct_islow_mmx) PRIVATE
682
683 EXTN(jconst_fdct_islow_mmx):
684
685@@ -101,7 +101,7 @@
686 %define WK_NUM 2
687
688 align 16
689- global EXTN(jsimd_fdct_islow_mmx)
690+ global EXTN(jsimd_fdct_islow_mmx) PRIVATE
691
692 EXTN(jsimd_fdct_islow_mmx):
693 push ebp
hbono@chromium.org98626972011-08-03 03:13:08 +0000694Index: simd/jcgryss2-64.asm
695===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000696--- simd/jcgryss2-64.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +0000697+++ simd/jcgryss2-64.asm (working copy)
698@@ -37,7 +37,7 @@
699
700 align 16
701
702- global EXTN(jsimd_rgb_gray_convert_sse2)
703+ global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
704
705 EXTN(jsimd_rgb_gray_convert_sse2):
706 push rbp
hbono@chromium.org68635482011-02-07 06:02:41 +0000707Index: simd/jcqnts2i.asm
708===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000709--- simd/jcqnts2i.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000710+++ simd/jcqnts2i.asm (working copy)
711@@ -35,7 +35,7 @@
712 %define workspace ebp+16 ; DCTELEM * workspace
713
714 align 16
715- global EXTN(jsimd_convsamp_sse2)
716+ global EXTN(jsimd_convsamp_sse2) PRIVATE
717
718 EXTN(jsimd_convsamp_sse2):
719 push ebp
720@@ -117,7 +117,7 @@
721 %define workspace ebp+16 ; DCTELEM * workspace
722
723 align 16
724- global EXTN(jsimd_quantize_sse2)
725+ global EXTN(jsimd_quantize_sse2) PRIVATE
726
727 EXTN(jsimd_quantize_sse2):
728 push ebp
729Index: simd/jiss2fst-64.asm
730===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000731--- simd/jiss2fst-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000732+++ simd/jiss2fst-64.asm (working copy)
733@@ -60,7 +60,7 @@
734 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
735
736 alignz 16
737- global EXTN(jconst_idct_ifast_sse2)
738+ global EXTN(jconst_idct_ifast_sse2) PRIVATE
739
740 EXTN(jconst_idct_ifast_sse2):
741
742@@ -93,7 +93,7 @@
743 %define WK_NUM 2
744
745 align 16
746- global EXTN(jsimd_idct_ifast_sse2)
747+ global EXTN(jsimd_idct_ifast_sse2) PRIVATE
748
749 EXTN(jsimd_idct_ifast_sse2):
750 push rbp
hbono@chromium.org68635482011-02-07 06:02:41 +0000751Index: simd/jiss2flt.asm
752===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000753--- simd/jiss2flt.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000754+++ simd/jiss2flt.asm (working copy)
755@@ -37,7 +37,7 @@
756 SECTION SEG_CONST
757
758 alignz 16
759- global EXTN(jconst_idct_float_sse2)
760+ global EXTN(jconst_idct_float_sse2) PRIVATE
761
762 EXTN(jconst_idct_float_sse2):
763
764@@ -73,7 +73,7 @@
765 ; FAST_FLOAT workspace[DCTSIZE2]
766
767 align 16
768- global EXTN(jsimd_idct_float_sse2)
769+ global EXTN(jsimd_idct_float_sse2) PRIVATE
770
771 EXTN(jsimd_idct_float_sse2):
772 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +0000773Index: simd/jiss2int.asm
774===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000775--- simd/jiss2int.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000776+++ simd/jiss2int.asm (working copy)
777@@ -66,7 +66,7 @@
778 SECTION SEG_CONST
779
780 alignz 16
781- global EXTN(jconst_idct_islow_sse2)
782+ global EXTN(jconst_idct_islow_sse2) PRIVATE
783
784 EXTN(jconst_idct_islow_sse2):
785
786@@ -105,7 +105,7 @@
787 %define WK_NUM 12
788
789 align 16
790- global EXTN(jsimd_idct_islow_sse2)
791+ global EXTN(jsimd_idct_islow_sse2) PRIVATE
792
793 EXTN(jsimd_idct_islow_sse2):
794 push ebp
795Index: simd/jfsseflt-64.asm
796===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000797--- simd/jfsseflt-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000798+++ simd/jfsseflt-64.asm (working copy)
799@@ -38,7 +38,7 @@
800 SECTION SEG_CONST
801
802 alignz 16
803- global EXTN(jconst_fdct_float_sse)
804+ global EXTN(jconst_fdct_float_sse) PRIVATE
805
806 EXTN(jconst_fdct_float_sse):
807
808@@ -65,7 +65,7 @@
809 %define WK_NUM 2
810
811 align 16
812- global EXTN(jsimd_fdct_float_sse)
813+ global EXTN(jsimd_fdct_float_sse) PRIVATE
814
815 EXTN(jsimd_fdct_float_sse):
816 push rbp
817Index: simd/jccolss2-64.asm
818===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000819--- simd/jccolss2-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000820+++ simd/jccolss2-64.asm (working copy)
821@@ -34,7 +34,7 @@
822 SECTION SEG_CONST
823
824 alignz 16
825- global EXTN(jconst_rgb_ycc_convert_sse2)
826+ global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
827
828 EXTN(jconst_rgb_ycc_convert_sse2):
829
hbono@chromium.org68635482011-02-07 06:02:41 +0000830Index: simd/jcsamss2-64.asm
831===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000832--- simd/jcsamss2-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000833+++ simd/jcsamss2-64.asm (working copy)
834@@ -41,7 +41,7 @@
835 ; r15 = JSAMPARRAY output_data
836
837 align 16
838- global EXTN(jsimd_h2v1_downsample_sse2)
839+ global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
840
841 EXTN(jsimd_h2v1_downsample_sse2):
842 push rbp
843@@ -185,7 +185,7 @@
844 ; r15 = JSAMPARRAY output_data
845
846 align 16
847- global EXTN(jsimd_h2v2_downsample_sse2)
848+ global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
849
850 EXTN(jsimd_h2v2_downsample_sse2):
851 push rbp
hbono@chromium.org68635482011-02-07 06:02:41 +0000852Index: simd/jdclrss2-64.asm
853===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000854--- simd/jdclrss2-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000855+++ simd/jdclrss2-64.asm (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +0000856@@ -39,7 +39,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000857 %define WK_NUM 2
858
859 align 16
860- global EXTN(jsimd_ycc_rgb_convert_sse2)
861+ global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
862
863 EXTN(jsimd_ycc_rgb_convert_sse2):
864 push rbp
hbono@chromium.org98626972011-08-03 03:13:08 +0000865Index: simd/jdcolmmx.asm
866===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000867--- simd/jdcolmmx.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +0000868+++ simd/jdcolmmx.asm (working copy)
869@@ -35,7 +35,7 @@
870 SECTION SEG_CONST
871
872 alignz 16
873- global EXTN(jconst_ycc_rgb_convert_mmx)
874+ global EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE
875
876 EXTN(jconst_ycc_rgb_convert_mmx):
877
hbono@chromium.org321292e2011-02-17 04:45:42 +0000878Index: simd/jcclrmmx.asm
879===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000880--- simd/jcclrmmx.asm (revision 829)
hbono@chromium.org321292e2011-02-17 04:45:42 +0000881+++ simd/jcclrmmx.asm (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +0000882@@ -40,7 +40,7 @@
hbono@chromium.org321292e2011-02-17 04:45:42 +0000883 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
884
885 align 16
886- global EXTN(jsimd_rgb_ycc_convert_mmx)
887+ global EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
888
889 EXTN(jsimd_rgb_ycc_convert_mmx):
890 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +0000891Index: simd/jfsseflt.asm
892===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000893--- simd/jfsseflt.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000894+++ simd/jfsseflt.asm (working copy)
895@@ -37,7 +37,7 @@
896 SECTION SEG_CONST
897
898 alignz 16
899- global EXTN(jconst_fdct_float_sse)
900+ global EXTN(jconst_fdct_float_sse) PRIVATE
901
902 EXTN(jconst_fdct_float_sse):
903
904@@ -65,7 +65,7 @@
905 %define WK_NUM 2
906
907 align 16
908- global EXTN(jsimd_fdct_float_sse)
909+ global EXTN(jsimd_fdct_float_sse) PRIVATE
910
911 EXTN(jsimd_fdct_float_sse):
912 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +0000913Index: simd/jdmrgss2-64.asm
914===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000915--- simd/jdmrgss2-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000916+++ simd/jdmrgss2-64.asm (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +0000917@@ -39,7 +39,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000918 %define WK_NUM 3
919
920 align 16
921- global EXTN(jsimd_h2v1_merged_upsample_sse2)
922+ global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
923
924 EXTN(jsimd_h2v1_merged_upsample_sse2):
925 push rbp
hbono@chromium.org0ec930e2012-01-18 07:01:04 +0000926@@ -543,7 +543,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +0000927 ; r13 = JSAMPARRAY output_buf
928
929 align 16
930- global EXTN(jsimd_h2v2_merged_upsample_sse2)
931+ global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
932
933 EXTN(jsimd_h2v2_merged_upsample_sse2):
934 push rbp
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000935Index: simd/jdcolss2.asm
hbono@chromium.org321292e2011-02-17 04:45:42 +0000936===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000937--- simd/jdcolss2.asm (revision 829)
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000938+++ simd/jdcolss2.asm (working copy)
939@@ -35,7 +35,7 @@
940 SECTION SEG_CONST
hbono@chromium.org321292e2011-02-17 04:45:42 +0000941
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000942 alignz 16
943- global EXTN(jconst_ycc_rgb_convert_sse2)
944+ global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
hbono@chromium.org321292e2011-02-17 04:45:42 +0000945
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000946 EXTN(jconst_ycc_rgb_convert_sse2):
hbono@chromium.org321292e2011-02-17 04:45:42 +0000947
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000948Index: simd/jdmermmx.asm
949===================================================================
950--- simd/jdmermmx.asm (revision 829)
951+++ simd/jdmermmx.asm (working copy)
952@@ -35,7 +35,7 @@
953 SECTION SEG_CONST
954
955 alignz 16
956- global EXTN(jconst_merged_upsample_mmx)
957+ global EXTN(jconst_merged_upsample_mmx) PRIVATE
958
959 EXTN(jconst_merged_upsample_mmx):
960
961Index: simd/jcclrss2.asm
962===================================================================
963--- simd/jcclrss2.asm (revision 829)
964+++ simd/jcclrss2.asm (working copy)
965@@ -38,7 +38,7 @@
966
967 align 16
968
969- global EXTN(jsimd_rgb_ycc_convert_sse2)
970+ global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
971
972 EXTN(jsimd_rgb_ycc_convert_sse2):
973 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +0000974Index: simd/jiss2red.asm
975===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +0000976--- simd/jiss2red.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +0000977+++ simd/jiss2red.asm (working copy)
978@@ -72,7 +72,7 @@
979 SECTION SEG_CONST
980
981 alignz 16
982- global EXTN(jconst_idct_red_sse2)
983+ global EXTN(jconst_idct_red_sse2) PRIVATE
984
985 EXTN(jconst_idct_red_sse2):
986
987@@ -113,7 +113,7 @@
988 %define WK_NUM 2
989
990 align 16
991- global EXTN(jsimd_idct_4x4_sse2)
992+ global EXTN(jsimd_idct_4x4_sse2) PRIVATE
993
994 EXTN(jsimd_idct_4x4_sse2):
995 push ebp
996@@ -424,7 +424,7 @@
997 %define output_col(b) (b)+20 ; JDIMENSION output_col
998
999 align 16
1000- global EXTN(jsimd_idct_2x2_sse2)
1001+ global EXTN(jsimd_idct_2x2_sse2) PRIVATE
1002
1003 EXTN(jsimd_idct_2x2_sse2):
1004 push ebp
hbono@chromium.org98626972011-08-03 03:13:08 +00001005Index: simd/jdmerss2.asm
1006===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001007--- simd/jdmerss2.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001008+++ simd/jdmerss2.asm (working copy)
1009@@ -35,7 +35,7 @@
1010 SECTION SEG_CONST
1011
1012 alignz 16
1013- global EXTN(jconst_merged_upsample_sse2)
1014+ global EXTN(jconst_merged_upsample_sse2) PRIVATE
1015
1016 EXTN(jconst_merged_upsample_sse2):
1017
1018Index: simd/jfss2fst-64.asm
1019===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001020--- simd/jfss2fst-64.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001021+++ simd/jfss2fst-64.asm (working copy)
1022@@ -53,7 +53,7 @@
1023 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1024
1025 alignz 16
1026- global EXTN(jconst_fdct_ifast_sse2)
1027+ global EXTN(jconst_fdct_ifast_sse2) PRIVATE
1028
1029 EXTN(jconst_fdct_ifast_sse2):
1030
1031@@ -80,7 +80,7 @@
1032 %define WK_NUM 2
1033
1034 align 16
1035- global EXTN(jsimd_fdct_ifast_sse2)
1036+ global EXTN(jsimd_fdct_ifast_sse2) PRIVATE
1037
1038 EXTN(jsimd_fdct_ifast_sse2):
1039 push rbp
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001040Index: simd/jcqntmmx.asm
1041===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001042--- simd/jcqntmmx.asm (revision 829)
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001043+++ simd/jcqntmmx.asm (working copy)
1044@@ -35,7 +35,7 @@
1045 %define workspace ebp+16 ; DCTELEM * workspace
1046
1047 align 16
1048- global EXTN(jsimd_convsamp_mmx)
1049+ global EXTN(jsimd_convsamp_mmx) PRIVATE
1050
1051 EXTN(jsimd_convsamp_mmx):
1052 push ebp
1053@@ -140,7 +140,7 @@
1054 %define workspace ebp+16 ; DCTELEM * workspace
1055
1056 align 16
1057- global EXTN(jsimd_quantize_mmx)
1058+ global EXTN(jsimd_quantize_mmx) PRIVATE
1059
1060 EXTN(jsimd_quantize_mmx):
1061 push ebp
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001062Index: simd/jimmxfst.asm
1063===================================================================
1064--- simd/jimmxfst.asm (revision 829)
1065+++ simd/jimmxfst.asm (working copy)
1066@@ -59,7 +59,7 @@
1067 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1068
1069 alignz 16
1070- global EXTN(jconst_idct_ifast_mmx)
1071+ global EXTN(jconst_idct_ifast_mmx) PRIVATE
1072
1073 EXTN(jconst_idct_ifast_mmx):
1074
1075@@ -94,7 +94,7 @@
1076 ; JCOEF workspace[DCTSIZE2]
1077
1078 align 16
1079- global EXTN(jsimd_idct_ifast_mmx)
1080+ global EXTN(jsimd_idct_ifast_mmx) PRIVATE
1081
1082 EXTN(jsimd_idct_ifast_mmx):
1083 push ebp
hbono@chromium.org98626972011-08-03 03:13:08 +00001084Index: simd/jfss2fst.asm
1085===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001086--- simd/jfss2fst.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001087+++ simd/jfss2fst.asm (working copy)
1088@@ -52,7 +52,7 @@
1089 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1090
1091 alignz 16
1092- global EXTN(jconst_fdct_ifast_sse2)
1093+ global EXTN(jconst_fdct_ifast_sse2) PRIVATE
1094
1095 EXTN(jconst_fdct_ifast_sse2):
1096
1097@@ -80,7 +80,7 @@
1098 %define WK_NUM 2
1099
1100 align 16
1101- global EXTN(jsimd_fdct_ifast_sse2)
1102+ global EXTN(jsimd_fdct_ifast_sse2) PRIVATE
1103
1104 EXTN(jsimd_fdct_ifast_sse2):
1105 push ebp
1106Index: simd/jcgrammx.asm
1107===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001108--- simd/jcgrammx.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001109+++ simd/jcgrammx.asm (working copy)
1110@@ -33,7 +33,7 @@
1111 SECTION SEG_CONST
1112
1113 alignz 16
1114- global EXTN(jconst_rgb_gray_convert_mmx)
1115+ global EXTN(jconst_rgb_gray_convert_mmx) PRIVATE
1116
1117 EXTN(jconst_rgb_gray_convert_mmx):
1118
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001119Index: simd/jdcolss2-64.asm
1120===================================================================
1121--- simd/jdcolss2-64.asm (revision 829)
1122+++ simd/jdcolss2-64.asm (working copy)
1123@@ -35,7 +35,7 @@
1124 SECTION SEG_CONST
1125
1126 alignz 16
1127- global EXTN(jconst_ycc_rgb_convert_sse2)
1128+ global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
1129
1130 EXTN(jconst_ycc_rgb_convert_sse2):
1131
hbono@chromium.org98626972011-08-03 03:13:08 +00001132Index: simd/jf3dnflt.asm
1133===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001134--- simd/jf3dnflt.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001135+++ simd/jf3dnflt.asm (working copy)
1136@@ -27,7 +27,7 @@
1137 SECTION SEG_CONST
1138
1139 alignz 16
1140- global EXTN(jconst_fdct_float_3dnow)
1141+ global EXTN(jconst_fdct_float_3dnow) PRIVATE
1142
1143 EXTN(jconst_fdct_float_3dnow):
1144
1145@@ -55,7 +55,7 @@
1146 %define WK_NUM 2
1147
1148 align 16
1149- global EXTN(jsimd_fdct_float_3dnow)
1150+ global EXTN(jsimd_fdct_float_3dnow) PRIVATE
1151
1152 EXTN(jsimd_fdct_float_3dnow):
1153 push ebp
1154Index: simd/jdsamss2-64.asm
1155===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001156--- simd/jdsamss2-64.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001157+++ simd/jdsamss2-64.asm (working copy)
1158@@ -23,7 +23,7 @@
1159 SECTION SEG_CONST
1160
1161 alignz 16
1162- global EXTN(jconst_fancy_upsample_sse2)
1163+ global EXTN(jconst_fancy_upsample_sse2) PRIVATE
1164
1165 EXTN(jconst_fancy_upsample_sse2):
1166
1167@@ -59,7 +59,7 @@
1168 ; r13 = JSAMPARRAY * output_data_ptr
1169
1170 align 16
1171- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
1172+ global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
1173
1174 EXTN(jsimd_h2v1_fancy_upsample_sse2):
1175 push rbp
1176@@ -201,7 +201,7 @@
1177 %define WK_NUM 4
1178
1179 align 16
1180- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
1181+ global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
1182
1183 EXTN(jsimd_h2v2_fancy_upsample_sse2):
1184 push rbp
1185@@ -498,7 +498,7 @@
1186 ; r13 = JSAMPARRAY * output_data_ptr
1187
1188 align 16
1189- global EXTN(jsimd_h2v1_upsample_sse2)
1190+ global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
1191
1192 EXTN(jsimd_h2v1_upsample_sse2):
1193 push rbp
1194@@ -587,7 +587,7 @@
1195 ; r13 = JSAMPARRAY * output_data_ptr
1196
1197 align 16
1198- global EXTN(jsimd_h2v2_upsample_sse2)
1199+ global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
1200
1201 EXTN(jsimd_h2v2_upsample_sse2):
1202 push rbp
1203Index: simd/jcgrass2.asm
1204===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001205--- simd/jcgrass2.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001206+++ simd/jcgrass2.asm (working copy)
1207@@ -30,7 +30,7 @@
1208 SECTION SEG_CONST
1209
1210 alignz 16
1211- global EXTN(jconst_rgb_gray_convert_sse2)
1212+ global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
1213
1214 EXTN(jconst_rgb_gray_convert_sse2):
1215
1216Index: simd/jcsammmx.asm
1217===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001218--- simd/jcsammmx.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001219+++ simd/jcsammmx.asm (working copy)
1220@@ -40,7 +40,7 @@
1221 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1222
1223 align 16
1224- global EXTN(jsimd_h2v1_downsample_mmx)
1225+ global EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
1226
1227 EXTN(jsimd_h2v1_downsample_mmx):
1228 push ebp
1229@@ -182,7 +182,7 @@
1230 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1231
1232 align 16
1233- global EXTN(jsimd_h2v2_downsample_mmx)
1234+ global EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
1235
1236 EXTN(jsimd_h2v2_downsample_mmx):
1237 push ebp
thakis@chromium.org7a076b52014-08-18 22:56:54 +00001238Index: simd/jsimd_arm_neon.S
1239===================================================================
1240--- simd/jsimd_arm_neon.S (revision 272637)
1241+++ simd/jsimd_arm_neon.S (working copy)
1242@@ -41,11 +41,9 @@
1243 /* Supplementary macro for setting function attributes */
1244 .macro asm_function fname
1245 #ifdef __APPLE__
1246- .func _\fname
1247 .globl _\fname
1248 _\fname:
1249 #else
1250- .func \fname
1251 .global \fname
1252 #ifdef __ELF__
1253 .hidden \fname
1254@@ -670,7 +668,6 @@
1255 .unreq ROW6R
1256 .unreq ROW7L
1257 .unreq ROW7R
1258-.endfunc
1259
1260
1261 /*****************************************************************************/
1262@@ -895,7 +892,6 @@
1263 .unreq TMP2
1264 .unreq TMP3
1265 .unreq TMP4
1266-.endfunc
1267
1268
1269 /*****************************************************************************/
1270@@ -1108,7 +1104,6 @@
1271 .unreq TMP2
1272 .unreq TMP3
1273 .unreq TMP4
1274-.endfunc
1275
1276 .purgem idct_helper
1277
1278@@ -1263,7 +1258,6 @@
1279 .unreq OUTPUT_COL
1280 .unreq TMP1
1281 .unreq TMP2
1282-.endfunc
1283
1284 .purgem idct_helper
1285
1286@@ -1547,7 +1541,6 @@
1287 .unreq U
1288 .unreq V
1289 .unreq N
1290-.endfunc
1291
1292 .purgem do_yuv_to_rgb
1293 .purgem do_yuv_to_rgb_stage1
1294@@ -1858,7 +1851,6 @@
1295 .unreq U
1296 .unreq V
1297 .unreq N
1298-.endfunc
1299
1300 .purgem do_rgb_to_yuv
1301 .purgem do_rgb_to_yuv_stage1
1302@@ -1940,7 +1932,6 @@
1303 .unreq TMP2
1304 .unreq TMP3
1305 .unreq TMP4
1306-.endfunc
1307
1308
1309 /*****************************************************************************/
1310@@ -2064,7 +2055,6 @@
1311
1312 .unreq DATA
1313 .unreq TMP
1314-.endfunc
1315
1316
1317 /*****************************************************************************/
1318@@ -2166,7 +2156,6 @@
1319 .unreq CORRECTION
1320 .unreq SHIFT
1321 .unreq LOOP_COUNT
1322-.endfunc
1323
1324
1325 /*****************************************************************************/
1326@@ -2401,7 +2390,6 @@
1327 .unreq WIDTH
1328 .unreq TMP
1329
1330-.endfunc
1331
1332 .purgem upsample16
1333 .purgem upsample32
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001334Index: simd/jsimd_i386.c
1335===================================================================
1336--- simd/jsimd_i386.c (revision 829)
1337+++ simd/jsimd_i386.c (working copy)
1338@@ -61,6 +61,7 @@
1339 simd_support &= JSIMD_SSE2;
1340 }
1341
1342+#ifndef JPEG_DECODE_ONLY
1343 GLOBAL(int)
1344 jsimd_can_rgb_ycc (void)
1345 {
1346@@ -82,6 +83,7 @@
1347
1348 return 0;
1349 }
1350+#endif
1351
1352 GLOBAL(int)
1353 jsimd_can_rgb_gray (void)
1354@@ -127,6 +129,7 @@
1355 return 0;
1356 }
1357
1358+#ifndef JPEG_DECODE_ONLY
1359 GLOBAL(void)
1360 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
1361 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
1362@@ -179,6 +182,7 @@
1363 mmxfct(cinfo->image_width, input_buf,
1364 output_buf, output_row, num_rows);
1365 }
1366+#endif
1367
1368 GLOBAL(void)
1369 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
1370@@ -286,6 +290,7 @@
1371 input_row, output_buf, num_rows);
1372 }
1373
1374+#ifndef JPEG_DECODE_ONLY
1375 GLOBAL(int)
1376 jsimd_can_h2v2_downsample (void)
1377 {
1378@@ -351,6 +356,7 @@
1379 compptr->v_samp_factor, compptr->width_in_blocks,
1380 input_data, output_data);
1381 }
1382+#endif
1383
1384 GLOBAL(int)
1385 jsimd_can_h2v2_upsample (void)
1386@@ -636,6 +642,7 @@
1387 in_row_group_ctr, output_buf);
1388 }
1389
1390+#ifndef JPEG_DECODE_ONLY
1391 GLOBAL(int)
1392 jsimd_can_convsamp (void)
1393 {
1394@@ -855,6 +862,7 @@
1395 else if (simd_support & JSIMD_3DNOW)
1396 jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
1397 }
1398+#endif
1399
1400 GLOBAL(int)
1401 jsimd_can_idct_2x2 (void)
1402@@ -1045,4 +1053,3 @@
1403 jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
1404 output_buf, output_col);
1405 }
1406-
hbono@chromium.org98626972011-08-03 03:13:08 +00001407Index: simd/jcqnts2f-64.asm
1408===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001409--- simd/jcqnts2f-64.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001410+++ simd/jcqnts2f-64.asm (working copy)
1411@@ -36,7 +36,7 @@
1412 ; r12 = FAST_FLOAT * workspace
1413
1414 align 16
1415- global EXTN(jsimd_convsamp_float_sse2)
1416+ global EXTN(jsimd_convsamp_float_sse2) PRIVATE
1417
1418 EXTN(jsimd_convsamp_float_sse2):
1419 push rbp
1420@@ -110,7 +110,7 @@
1421 ; r12 = FAST_FLOAT * workspace
1422
1423 align 16
1424- global EXTN(jsimd_quantize_float_sse2)
1425+ global EXTN(jsimd_quantize_float_sse2) PRIVATE
1426
1427 EXTN(jsimd_quantize_float_sse2):
1428 push rbp
1429Index: simd/jcqnt3dn.asm
1430===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001431--- simd/jcqnt3dn.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001432+++ simd/jcqnt3dn.asm (working copy)
1433@@ -35,7 +35,7 @@
1434 %define workspace ebp+16 ; FAST_FLOAT * workspace
1435
1436 align 16
1437- global EXTN(jsimd_convsamp_float_3dnow)
1438+ global EXTN(jsimd_convsamp_float_3dnow) PRIVATE
1439
1440 EXTN(jsimd_convsamp_float_3dnow):
1441 push ebp
1442@@ -138,7 +138,7 @@
1443 %define workspace ebp+16 ; FAST_FLOAT * workspace
1444
1445 align 16
1446- global EXTN(jsimd_quantize_float_3dnow)
1447+ global EXTN(jsimd_quantize_float_3dnow) PRIVATE
1448
1449 EXTN(jsimd_quantize_float_3dnow):
1450 push ebp
1451Index: simd/jcsamss2.asm
1452===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001453--- simd/jcsamss2.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001454+++ simd/jcsamss2.asm (working copy)
1455@@ -40,7 +40,7 @@
1456 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1457
1458 align 16
1459- global EXTN(jsimd_h2v1_downsample_sse2)
1460+ global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
1461
1462 EXTN(jsimd_h2v1_downsample_sse2):
1463 push ebp
1464@@ -195,7 +195,7 @@
1465 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1466
1467 align 16
1468- global EXTN(jsimd_h2v2_downsample_sse2)
1469+ global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
1470
1471 EXTN(jsimd_h2v2_downsample_sse2):
1472 push ebp
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001473Index: simd/jsimd_x86_64.c
1474===================================================================
1475--- simd/jsimd_x86_64.c (revision 829)
1476+++ simd/jsimd_x86_64.c (working copy)
1477@@ -29,6 +29,7 @@
1478
1479 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
1480
1481+#ifndef JPEG_DECODE_ONLY
1482 GLOBAL(int)
1483 jsimd_can_rgb_ycc (void)
1484 {
1485@@ -45,6 +46,7 @@
1486
1487 return 1;
1488 }
1489+#endif
1490
1491 GLOBAL(int)
1492 jsimd_can_rgb_gray (void)
1493@@ -80,6 +82,7 @@
1494 return 1;
1495 }
1496
1497+#ifndef JPEG_DECODE_ONLY
1498 GLOBAL(void)
1499 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
1500 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
1501@@ -118,6 +121,7 @@
1502
1503 sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
1504 }
1505+#endif
1506
1507 GLOBAL(void)
1508 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
1509@@ -197,6 +201,7 @@
1510 sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
1511 }
1512
1513+#ifndef JPEG_DECODE_ONLY
1514 GLOBAL(int)
1515 jsimd_can_h2v2_downsample (void)
1516 {
1517@@ -242,6 +247,7 @@
1518 compptr->width_in_blocks,
1519 input_data, output_data);
1520 }
1521+#endif
1522
1523 GLOBAL(int)
1524 jsimd_can_h2v2_upsample (void)
1525@@ -451,6 +457,7 @@
1526 sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
1527 }
1528
1529+#ifndef JPEG_DECODE_ONLY
1530 GLOBAL(int)
1531 jsimd_can_convsamp (void)
1532 {
1533@@ -601,6 +608,7 @@
1534 {
1535 jsimd_quantize_float_sse2(coef_block, divisors, workspace);
1536 }
1537+#endif
1538
1539 GLOBAL(int)
1540 jsimd_can_idct_2x2 (void)
1541@@ -750,4 +758,3 @@
1542 jsimd_idct_float_sse2(compptr->dct_table, coef_block,
1543 output_buf, output_col);
1544 }
1545-
hbono@chromium.org98626972011-08-03 03:13:08 +00001546Index: simd/jimmxint.asm
1547===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001548--- simd/jimmxint.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001549+++ simd/jimmxint.asm (working copy)
1550@@ -66,7 +66,7 @@
1551 SECTION SEG_CONST
1552
1553 alignz 16
1554- global EXTN(jconst_idct_islow_mmx)
1555+ global EXTN(jconst_idct_islow_mmx) PRIVATE
1556
1557 EXTN(jconst_idct_islow_mmx):
1558
1559@@ -107,7 +107,7 @@
1560 ; JCOEF workspace[DCTSIZE2]
1561
1562 align 16
1563- global EXTN(jsimd_idct_islow_mmx)
1564+ global EXTN(jsimd_idct_islow_mmx) PRIVATE
1565
1566 EXTN(jsimd_idct_islow_mmx):
1567 push ebp
1568Index: simd/jcgrymmx.asm
1569===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001570--- simd/jcgrymmx.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001571+++ simd/jcgrymmx.asm (working copy)
1572@@ -41,7 +41,7 @@
1573 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
1574
1575 align 16
1576- global EXTN(jsimd_rgb_gray_convert_mmx)
1577+ global EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE
1578
1579 EXTN(jsimd_rgb_gray_convert_mmx):
1580 push ebp
1581Index: simd/jfss2int.asm
1582===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001583--- simd/jfss2int.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001584+++ simd/jfss2int.asm (working copy)
1585@@ -66,7 +66,7 @@
1586 SECTION SEG_CONST
1587
1588 alignz 16
1589- global EXTN(jconst_fdct_islow_sse2)
1590+ global EXTN(jconst_fdct_islow_sse2) PRIVATE
1591
1592 EXTN(jconst_fdct_islow_sse2):
1593
1594@@ -101,7 +101,7 @@
1595 %define WK_NUM 6
1596
1597 align 16
1598- global EXTN(jsimd_fdct_islow_sse2)
1599+ global EXTN(jsimd_fdct_islow_sse2) PRIVATE
1600
1601 EXTN(jsimd_fdct_islow_sse2):
1602 push ebp
1603Index: simd/jcgryss2.asm
1604===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001605--- simd/jcgryss2.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001606+++ simd/jcgryss2.asm (working copy)
1607@@ -39,7 +39,7 @@
1608
1609 align 16
1610
1611- global EXTN(jsimd_rgb_gray_convert_sse2)
1612+ global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
1613
1614 EXTN(jsimd_rgb_gray_convert_sse2):
1615 push ebp
1616Index: simd/jccolmmx.asm
1617===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001618--- simd/jccolmmx.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001619+++ simd/jccolmmx.asm (working copy)
1620@@ -37,7 +37,7 @@
1621 SECTION SEG_CONST
1622
1623 alignz 16
1624- global EXTN(jconst_rgb_ycc_convert_mmx)
1625+ global EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE
1626
1627 EXTN(jconst_rgb_ycc_convert_mmx):
1628
1629Index: simd/jimmxred.asm
1630===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001631--- simd/jimmxred.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001632+++ simd/jimmxred.asm (working copy)
1633@@ -72,7 +72,7 @@
1634 SECTION SEG_CONST
1635
1636 alignz 16
1637- global EXTN(jconst_idct_red_mmx)
1638+ global EXTN(jconst_idct_red_mmx) PRIVATE
1639
1640 EXTN(jconst_idct_red_mmx):
1641
1642@@ -115,7 +115,7 @@
1643 ; JCOEF workspace[DCTSIZE2]
1644
1645 align 16
1646- global EXTN(jsimd_idct_4x4_mmx)
1647+ global EXTN(jsimd_idct_4x4_mmx) PRIVATE
1648
1649 EXTN(jsimd_idct_4x4_mmx):
1650 push ebp
1651@@ -503,7 +503,7 @@
1652 %define output_col(b) (b)+20 ; JDIMENSION output_col
1653
1654 align 16
1655- global EXTN(jsimd_idct_2x2_mmx)
1656+ global EXTN(jsimd_idct_2x2_mmx) PRIVATE
1657
1658 EXTN(jsimd_idct_2x2_mmx):
1659 push ebp
1660Index: simd/jsimdext.inc
1661===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001662--- simd/jsimdext.inc (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001663+++ simd/jsimdext.inc (working copy)
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001664@@ -73,6 +73,9 @@
1665 ; * *BSD family Unix using elf format
1666 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
1667
1668+; PIC is the default on Linux
1669+%define PIC
1670+
1671 ; mark stack as non-executable
1672 section .note.GNU-stack noalloc noexec nowrite progbits
1673
hbono@chromium.org0ec930e2012-01-18 07:01:04 +00001674@@ -375,4 +378,14 @@
hbono@chromium.org98626972011-08-03 03:13:08 +00001675 ;
1676 %include "jsimdcfg.inc"
1677
1678+; Begin chromium edits
1679+%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
1680+%define PRIVATE :private_extern
1681+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
1682+%define PRIVATE :hidden
1683+%else
1684+%define PRIVATE
1685+%endif
1686+; End chromium edits
1687+
1688 ; --------------------------------------------------------------------------
1689Index: simd/jdclrmmx.asm
1690===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001691--- simd/jdclrmmx.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001692+++ simd/jdclrmmx.asm (working copy)
1693@@ -40,7 +40,7 @@
1694 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
1695
1696 align 16
1697- global EXTN(jsimd_ycc_rgb_convert_mmx)
1698+ global EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
1699
1700 EXTN(jsimd_ycc_rgb_convert_mmx):
1701 push ebp
1702Index: simd/jccolss2.asm
1703===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001704--- simd/jccolss2.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001705+++ simd/jccolss2.asm (working copy)
1706@@ -34,7 +34,7 @@
1707 SECTION SEG_CONST
1708
1709 alignz 16
1710- global EXTN(jconst_rgb_ycc_convert_sse2)
1711+ global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
1712
1713 EXTN(jconst_rgb_ycc_convert_sse2):
1714
1715Index: simd/jisseflt.asm
1716===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001717--- simd/jisseflt.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001718+++ simd/jisseflt.asm (working copy)
1719@@ -37,7 +37,7 @@
1720 SECTION SEG_CONST
1721
1722 alignz 16
1723- global EXTN(jconst_idct_float_sse)
1724+ global EXTN(jconst_idct_float_sse) PRIVATE
1725
1726 EXTN(jconst_idct_float_sse):
1727
1728@@ -73,7 +73,7 @@
1729 ; FAST_FLOAT workspace[DCTSIZE2]
1730
1731 align 16
1732- global EXTN(jsimd_idct_float_sse)
1733+ global EXTN(jsimd_idct_float_sse) PRIVATE
1734
1735 EXTN(jsimd_idct_float_sse):
1736 push ebp
1737Index: simd/jcqnts2i-64.asm
1738===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001739--- simd/jcqnts2i-64.asm (revision 829)
hbono@chromium.org98626972011-08-03 03:13:08 +00001740+++ simd/jcqnts2i-64.asm (working copy)
1741@@ -36,7 +36,7 @@
1742 ; r12 = DCTELEM * workspace
1743
1744 align 16
1745- global EXTN(jsimd_convsamp_sse2)
1746+ global EXTN(jsimd_convsamp_sse2) PRIVATE
1747
1748 EXTN(jsimd_convsamp_sse2):
1749 push rbp
1750@@ -112,7 +112,7 @@
1751 ; r12 = DCTELEM * workspace
1752
1753 align 16
1754- global EXTN(jsimd_quantize_sse2)
1755+ global EXTN(jsimd_quantize_sse2) PRIVATE
1756
1757 EXTN(jsimd_quantize_sse2):
1758 push rbp
hbono@chromium.org68635482011-02-07 06:02:41 +00001759Index: simd/jdclrss2.asm
1760===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001761--- simd/jdclrss2.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +00001762+++ simd/jdclrss2.asm (working copy)
hbono@chromium.org98626972011-08-03 03:13:08 +00001763@@ -40,7 +40,7 @@
hbono@chromium.org68635482011-02-07 06:02:41 +00001764 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
1765
1766 align 16
1767- global EXTN(jsimd_ycc_rgb_convert_sse2)
1768+ global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
1769
1770 EXTN(jsimd_ycc_rgb_convert_sse2):
1771 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +00001772Index: simd/jcqntsse.asm
1773===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001774--- simd/jcqntsse.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +00001775+++ simd/jcqntsse.asm (working copy)
1776@@ -35,7 +35,7 @@
1777 %define workspace ebp+16 ; FAST_FLOAT * workspace
1778
1779 align 16
1780- global EXTN(jsimd_convsamp_float_sse)
1781+ global EXTN(jsimd_convsamp_float_sse) PRIVATE
1782
1783 EXTN(jsimd_convsamp_float_sse):
1784 push ebp
1785@@ -138,7 +138,7 @@
1786 %define workspace ebp+16 ; FAST_FLOAT * workspace
1787
1788 align 16
1789- global EXTN(jsimd_quantize_float_sse)
1790+ global EXTN(jsimd_quantize_float_sse) PRIVATE
1791
1792 EXTN(jsimd_quantize_float_sse):
1793 push ebp
hbono@chromium.org68635482011-02-07 06:02:41 +00001794Index: simd/jiss2int-64.asm
1795===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001796--- simd/jiss2int-64.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +00001797+++ simd/jiss2int-64.asm (working copy)
1798@@ -67,7 +67,7 @@
1799 SECTION SEG_CONST
1800
1801 alignz 16
1802- global EXTN(jconst_idct_islow_sse2)
1803+ global EXTN(jconst_idct_islow_sse2) PRIVATE
1804
1805 EXTN(jconst_idct_islow_sse2):
1806
1807@@ -106,7 +106,7 @@
1808 %define WK_NUM 12
1809
1810 align 16
1811- global EXTN(jsimd_idct_islow_sse2)
1812+ global EXTN(jsimd_idct_islow_sse2) PRIVATE
1813
1814 EXTN(jsimd_idct_islow_sse2):
1815 push rbp
1816Index: simd/jfmmxfst.asm
1817===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001818--- simd/jfmmxfst.asm (revision 829)
hbono@chromium.org68635482011-02-07 06:02:41 +00001819+++ simd/jfmmxfst.asm (working copy)
1820@@ -52,7 +52,7 @@
1821 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1822
1823 alignz 16
1824- global EXTN(jconst_fdct_ifast_mmx)
1825+ global EXTN(jconst_fdct_ifast_mmx) PRIVATE
1826
1827 EXTN(jconst_fdct_ifast_mmx):
1828
1829@@ -80,7 +80,7 @@
1830 %define WK_NUM 2
1831
1832 align 16
1833- global EXTN(jsimd_fdct_ifast_mmx)
1834+ global EXTN(jsimd_fdct_ifast_mmx) PRIVATE
1835
1836 EXTN(jsimd_fdct_ifast_mmx):
1837 push ebp
hbono@chromium.org538d9fd2011-08-15 06:52:21 +00001838Index: jdarith.c
1839===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001840--- jdarith.c (revision 829)
hbono@chromium.org538d9fd2011-08-15 06:52:21 +00001841+++ jdarith.c (working copy)
1842@@ -150,8 +150,8 @@
1843 */
1844 sv = *st;
1845 qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
1846- nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
1847- nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
1848+ nl = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
1849+ nm = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
1850
1851 /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
1852 temp = e->a - qe;
1853Index: jdhuff.c
1854===================================================================
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00001855--- jdhuff.c (revision 829)
hbono@chromium.org538d9fd2011-08-15 06:52:21 +00001856+++ jdhuff.c (working copy)
1857@@ -742,7 +742,7 @@
1858 * this module, since we'll just re-assign them on the next call.)
1859 */
1860
1861-#define BUFSIZE (DCTSIZE2 * 2)
1862+#define BUFSIZE (DCTSIZE2 * 2u)
1863
1864 METHODDEF(boolean)
1865 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
noel@chromium.org841fff82014-05-23 23:38:59 +00001866Index: jchuff.c
1867===================================================================
1868--- jchuff.c (revision 1219)
1869+++ jchuff.c (revision 1220)
1870@@ -22,8 +22,36 @@
1871 #include "jchuff.h" /* Declarations shared with jcphuff.c */
1872 #include <limits.h>
1873
1874+/*
1875+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
1876+ * used for bit counting rather than the lookup table. This will reduce the
1877+ * memory footprint by 64k, which is important for some mobile applications
1878+ * that create many isolated instances of libjpeg-turbo (web browsers, for
1879+ * instance.) This may improve performance on some mobile platforms as well.
1880+ * This feature is enabled by default only on ARM processors, because some x86
1881+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
1882+ * shown to have a significant performance impact even on the x86 chips that
1883+ * have a fast implementation of it. When building for ARMv6, you can
1884+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
1885+ * flags (this defines __thumb__).
1886+ */
1887+
1888+/* NOTE: Both GCC and Clang define __GNUC__ */
1889+#if defined __GNUC__ && defined __arm__
1890+#if !defined __thumb__ || defined __thumb2__
1891+#define USE_CLZ_INTRINSIC
1892+#endif
1893+#endif
1894+
1895+#ifdef USE_CLZ_INTRINSIC
1896+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
1897+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
1898+#else
1899 static unsigned char jpeg_nbits_table[65536];
1900 static int jpeg_nbits_table_init = 0;
1901+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
1902+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
1903+#endif
1904
1905 #ifndef min
1906 #define min(a,b) ((a)<(b)?(a):(b))
1907@@ -272,6 +300,7 @@
1908 dtbl->ehufsi[i] = huffsize[p];
1909 }
1910
1911+#ifndef USE_CLZ_INTRINSIC
1912 if(!jpeg_nbits_table_init) {
1913 for(i = 0; i < 65536; i++) {
1914 int nbits = 0, temp = i;
1915@@ -280,6 +309,7 @@
1916 }
1917 jpeg_nbits_table_init = 1;
1918 }
1919+#endif
1920 }
1921
1922
1923@@ -482,7 +512,7 @@
1924 temp2 += temp3;
1925
1926 /* Find the number of bits needed for the magnitude of the coefficient */
1927- nbits = jpeg_nbits_table[temp];
1928+ nbits = JPEG_NBITS(temp);
1929
1930 /* Emit the Huffman-coded symbol for the number of bits */
1931 code = dctbl->ehufco[nbits];
1932@@ -516,7 +546,7 @@
1933 temp ^= temp3; \
1934 temp -= temp3; \
1935 temp2 += temp3; \
1936- nbits = jpeg_nbits_table[temp]; \
1937+ nbits = JPEG_NBITS_NONZERO(temp); \
1938 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
1939 while (r > 15) { \
1940 EMIT_BITS(code_0xf0, size_0xf0) \
rmcilroy@chromium.org2ed53192014-08-29 12:32:19 +00001941Index: simd/jsimd_arm64.c
1942===================================================================
1943--- /dev/null
1944+++ simd/jsimd_arm64.c
1945@@ -0,0 +1,544 @@
1946+/*
1947+ * jsimd_arm64.c
1948+ *
1949+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
1950+ * Copyright 2009-2011, 2013-2014 D. R. Commander
1951+ *
1952+ * Based on the x86 SIMD extension for IJG JPEG library,
1953+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
1954+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
1955+ *
1956+ * This file contains the interface between the "normal" portions
1957+ * of the library and the SIMD implementations when running on a
1958+ * 64-bit ARM architecture.
1959+ */
1960+
1961+#define JPEG_INTERNALS
1962+#include "../jinclude.h"
1963+#include "../jpeglib.h"
1964+#include "../jsimd.h"
1965+#include "../jdct.h"
1966+#include "../jsimddct.h"
1967+#include "jsimd.h"
1968+
1969+#include <stdio.h>
1970+#include <string.h>
1971+#include <ctype.h>
1972+
1973+static unsigned int simd_support = ~0;
1974+
1975+/*
1976+ * Check what SIMD accelerations are supported.
1977+ *
1978+ * FIXME: This code is racy under a multi-threaded environment.
1979+ */
1980+
1981+/*
1982+ * ARMv8 architectures support NEON extensions by default.
1983+ * It is no longer optional as it was with ARMv7.
1984+ */
1985+
1986+
1987+LOCAL(void)
1988+init_simd (void)
1989+{
1990+ char *env = NULL;
1991+
1992+ if (simd_support != ~0U)
1993+ return;
1994+
1995+ simd_support = 0;
1996+
1997+ simd_support |= JSIMD_ARM_NEON;
1998+
1999+ /* Force different settings through environment variables */
2000+ env = getenv("JSIMD_FORCENEON");
2001+ if ((env != NULL) && (strcmp(env, "1") == 0))
2002+ simd_support &= JSIMD_ARM_NEON;
2003+ env = getenv("JSIMD_FORCENONE");
2004+ if ((env != NULL) && (strcmp(env, "1") == 0))
2005+ simd_support = 0;
2006+}
2007+
2008+GLOBAL(int)
2009+jsimd_can_rgb_ycc (void)
2010+{
2011+ init_simd();
2012+
2013+ return 0;
2014+}
2015+
2016+GLOBAL(int)
2017+jsimd_can_rgb_gray (void)
2018+{
2019+ init_simd();
2020+
2021+ return 0;
2022+}
2023+
2024+GLOBAL(int)
2025+jsimd_can_ycc_rgb (void)
2026+{
2027+ init_simd();
2028+
2029+ /* The code is optimised for these values only */
2030+ if (BITS_IN_JSAMPLE != 8)
2031+ return 0;
2032+ if (sizeof(JDIMENSION) != 4)
2033+ return 0;
2034+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
2035+ return 0;
2036+
2037+ if (simd_support & JSIMD_ARM_NEON)
2038+ return 1;
2039+
2040+ return 0;
2041+}
2042+
2043+GLOBAL(int)
2044+jsimd_can_ycc_rgb565 (void)
2045+{
2046+ init_simd();
2047+
2048+ /* The code is optimised for these values only */
2049+ if (BITS_IN_JSAMPLE != 8)
2050+ return 0;
2051+ if (sizeof(JDIMENSION) != 4)
2052+ return 0;
2053+
2054+ if (simd_support & JSIMD_ARM_NEON)
2055+ return 1;
2056+
2057+ return 0;
2058+}
2059+
2060+GLOBAL(void)
2061+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
2062+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2063+ JDIMENSION output_row, int num_rows)
2064+{
2065+}
2066+
2067+GLOBAL(void)
2068+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
2069+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2070+ JDIMENSION output_row, int num_rows)
2071+{
2072+}
2073+
2074+GLOBAL(void)
2075+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
2076+ JSAMPIMAGE input_buf, JDIMENSION input_row,
2077+ JSAMPARRAY output_buf, int num_rows)
2078+{
2079+ void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
2080+
2081+ switch(cinfo->out_color_space) {
2082+ case JCS_EXT_RGB:
2083+ neonfct=jsimd_ycc_extrgb_convert_neon;
2084+ break;
2085+ case JCS_EXT_RGBX:
2086+ case JCS_EXT_RGBA:
2087+ neonfct=jsimd_ycc_extrgbx_convert_neon;
2088+ break;
2089+ case JCS_EXT_BGR:
2090+ neonfct=jsimd_ycc_extbgr_convert_neon;
2091+ break;
2092+ case JCS_EXT_BGRX:
2093+ case JCS_EXT_BGRA:
2094+ neonfct=jsimd_ycc_extbgrx_convert_neon;
2095+ break;
2096+ case JCS_EXT_XBGR:
2097+ case JCS_EXT_ABGR:
2098+ neonfct=jsimd_ycc_extxbgr_convert_neon;
2099+ break;
2100+ case JCS_EXT_XRGB:
2101+ case JCS_EXT_ARGB:
2102+ neonfct=jsimd_ycc_extxrgb_convert_neon;
2103+ break;
2104+ default:
2105+ neonfct=jsimd_ycc_extrgb_convert_neon;
2106+ break;
2107+ }
2108+
2109+ if (simd_support & JSIMD_ARM_NEON)
2110+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
2111+}
2112+
2113+GLOBAL(void)
2114+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
2115+ JSAMPIMAGE input_buf, JDIMENSION input_row,
2116+ JSAMPARRAY output_buf, int num_rows)
2117+{
2118+ if (simd_support & JSIMD_ARM_NEON)
2119+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
2120+ output_buf, num_rows);
2121+}
2122+
2123+GLOBAL(int)
2124+jsimd_can_h2v2_downsample (void)
2125+{
2126+ init_simd();
2127+
2128+ return 0;
2129+}
2130+
2131+GLOBAL(int)
2132+jsimd_can_h2v1_downsample (void)
2133+{
2134+ init_simd();
2135+
2136+ return 0;
2137+}
2138+
2139+GLOBAL(void)
2140+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2141+ JSAMPARRAY input_data, JSAMPARRAY output_data)
2142+{
2143+}
2144+
2145+GLOBAL(void)
2146+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2147+ JSAMPARRAY input_data, JSAMPARRAY output_data)
2148+{
2149+}
2150+
2151+GLOBAL(int)
2152+jsimd_can_h2v2_upsample (void)
2153+{
2154+ init_simd();
2155+
2156+ return 0;
2157+}
2158+
2159+GLOBAL(int)
2160+jsimd_can_h2v1_upsample (void)
2161+{
2162+ init_simd();
2163+
2164+ return 0;
2165+}
2166+
2167+GLOBAL(void)
2168+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
2169+ jpeg_component_info * compptr,
2170+ JSAMPARRAY input_data,
2171+ JSAMPARRAY * output_data_ptr)
2172+{
2173+}
2174+
2175+GLOBAL(void)
2176+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
2177+ jpeg_component_info * compptr,
2178+ JSAMPARRAY input_data,
2179+ JSAMPARRAY * output_data_ptr)
2180+{
2181+}
2182+
2183+GLOBAL(int)
2184+jsimd_can_h2v2_fancy_upsample (void)
2185+{
2186+ init_simd();
2187+
2188+ return 0;
2189+}
2190+
2191+GLOBAL(int)
2192+jsimd_can_h2v1_fancy_upsample (void)
2193+{
2194+ init_simd();
2195+
2196+ return 0;
2197+}
2198+
2199+GLOBAL(void)
2200+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
2201+ jpeg_component_info * compptr,
2202+ JSAMPARRAY input_data,
2203+ JSAMPARRAY * output_data_ptr)
2204+{
2205+}
2206+
2207+GLOBAL(void)
2208+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
2209+ jpeg_component_info * compptr,
2210+ JSAMPARRAY input_data,
2211+ JSAMPARRAY * output_data_ptr)
2212+{
2213+}
2214+
2215+GLOBAL(int)
2216+jsimd_can_h2v2_merged_upsample (void)
2217+{
2218+ init_simd();
2219+
2220+ return 0;
2221+}
2222+
2223+GLOBAL(int)
2224+jsimd_can_h2v1_merged_upsample (void)
2225+{
2226+ init_simd();
2227+
2228+ return 0;
2229+}
2230+
2231+GLOBAL(void)
2232+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
2233+ JSAMPIMAGE input_buf,
2234+ JDIMENSION in_row_group_ctr,
2235+ JSAMPARRAY output_buf)
2236+{
2237+}
2238+
2239+GLOBAL(void)
2240+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
2241+ JSAMPIMAGE input_buf,
2242+ JDIMENSION in_row_group_ctr,
2243+ JSAMPARRAY output_buf)
2244+{
2245+}
2246+
2247+GLOBAL(int)
2248+jsimd_can_convsamp (void)
2249+{
2250+ init_simd();
2251+
2252+ return 0;
2253+}
2254+
2255+GLOBAL(int)
2256+jsimd_can_convsamp_float (void)
2257+{
2258+ init_simd();
2259+
2260+ return 0;
2261+}
2262+
2263+GLOBAL(void)
2264+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
2265+ DCTELEM * workspace)
2266+{
2267+}
2268+
2269+GLOBAL(void)
2270+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
2271+ FAST_FLOAT * workspace)
2272+{
2273+}
2274+
2275+GLOBAL(int)
2276+jsimd_can_fdct_islow (void)
2277+{
2278+ init_simd();
2279+
2280+ return 0;
2281+}
2282+
2283+GLOBAL(int)
2284+jsimd_can_fdct_ifast (void)
2285+{
2286+ init_simd();
2287+
2288+ return 0;
2289+}
2290+
2291+GLOBAL(int)
2292+jsimd_can_fdct_float (void)
2293+{
2294+ init_simd();
2295+
2296+ return 0;
2297+}
2298+
2299+GLOBAL(void)
2300+jsimd_fdct_islow (DCTELEM * data)
2301+{
2302+}
2303+
2304+GLOBAL(void)
2305+jsimd_fdct_ifast (DCTELEM * data)
2306+{
2307+}
2308+
2309+GLOBAL(void)
2310+jsimd_fdct_float (FAST_FLOAT * data)
2311+{
2312+}
2313+
2314+GLOBAL(int)
2315+jsimd_can_quantize (void)
2316+{
2317+ init_simd();
2318+
2319+ return 0;
2320+}
2321+
2322+GLOBAL(int)
2323+jsimd_can_quantize_float (void)
2324+{
2325+ init_simd();
2326+
2327+ return 0;
2328+}
2329+
2330+GLOBAL(void)
2331+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
2332+ DCTELEM * workspace)
2333+{
2334+}
2335+
2336+GLOBAL(void)
2337+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
2338+ FAST_FLOAT * workspace)
2339+{
2340+}
2341+
2342+GLOBAL(int)
2343+jsimd_can_idct_2x2 (void)
2344+{
2345+ init_simd();
2346+
2347+ /* The code is optimised for these values only */
2348+ if (DCTSIZE != 8)
2349+ return 0;
2350+ if (sizeof(JCOEF) != 2)
2351+ return 0;
2352+ if (BITS_IN_JSAMPLE != 8)
2353+ return 0;
2354+ if (sizeof(JDIMENSION) != 4)
2355+ return 0;
2356+ if (sizeof(ISLOW_MULT_TYPE) != 2)
2357+ return 0;
2358+
2359+ if (simd_support & JSIMD_ARM_NEON)
2360+ return 1;
2361+
2362+ return 0;
2363+}
2364+
2365+GLOBAL(int)
2366+jsimd_can_idct_4x4 (void)
2367+{
2368+ init_simd();
2369+
2370+ /* The code is optimised for these values only */
2371+ if (DCTSIZE != 8)
2372+ return 0;
2373+ if (sizeof(JCOEF) != 2)
2374+ return 0;
2375+ if (BITS_IN_JSAMPLE != 8)
2376+ return 0;
2377+ if (sizeof(JDIMENSION) != 4)
2378+ return 0;
2379+ if (sizeof(ISLOW_MULT_TYPE) != 2)
2380+ return 0;
2381+
2382+ if (simd_support & JSIMD_ARM_NEON)
2383+ return 1;
2384+
2385+ return 0;
2386+}
2387+
2388+GLOBAL(void)
2389+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2390+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
2391+ JDIMENSION output_col)
2392+{
2393+ if (simd_support & JSIMD_ARM_NEON)
2394+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
2395+ output_col);
2396+}
2397+
2398+GLOBAL(void)
2399+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2400+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
2401+ JDIMENSION output_col)
2402+{
2403+ if (simd_support & JSIMD_ARM_NEON)
2404+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
2405+ output_col);
2406+}
2407+
2408+GLOBAL(int)
2409+jsimd_can_idct_islow (void)
2410+{
2411+ init_simd();
2412+
2413+ /* The code is optimised for these values only */
2414+ if (DCTSIZE != 8)
2415+ return 0;
2416+ if (sizeof(JCOEF) != 2)
2417+ return 0;
2418+ if (BITS_IN_JSAMPLE != 8)
2419+ return 0;
2420+ if (sizeof(JDIMENSION) != 4)
2421+ return 0;
2422+ if (sizeof(ISLOW_MULT_TYPE) != 2)
2423+ return 0;
2424+
2425+ if (simd_support & JSIMD_ARM_NEON)
2426+ return 1;
2427+
2428+ return 0;
2429+}
2430+
2431+GLOBAL(int)
2432+jsimd_can_idct_ifast (void)
2433+{
2434+ init_simd();
2435+
2436+ /* The code is optimised for these values only */
2437+ if (DCTSIZE != 8)
2438+ return 0;
2439+ if (sizeof(JCOEF) != 2)
2440+ return 0;
2441+ if (BITS_IN_JSAMPLE != 8)
2442+ return 0;
2443+ if (sizeof(JDIMENSION) != 4)
2444+ return 0;
2445+ if (sizeof(IFAST_MULT_TYPE) != 2)
2446+ return 0;
2447+ if (IFAST_SCALE_BITS != 2)
2448+ return 0;
2449+
2450+ if (simd_support & JSIMD_ARM_NEON)
2451+ return 1;
2452+
2453+ return 0;
2454+}
2455+
2456+GLOBAL(int)
2457+jsimd_can_idct_float (void)
2458+{
2459+ init_simd();
2460+
2461+ return 0;
2462+}
2463+
2464+GLOBAL(void)
2465+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2466+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
2467+ JDIMENSION output_col)
2468+{
2469+ if (simd_support & JSIMD_ARM_NEON)
2470+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
2471+ output_col);
2472+}
2473+
2474+GLOBAL(void)
2475+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2476+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
2477+ JDIMENSION output_col)
2478+{
2479+ if (simd_support & JSIMD_ARM_NEON)
2480+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
2481+ output_col);
2482+}
2483+
2484+GLOBAL(void)
2485+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2486+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
2487+ JDIMENSION output_col)
2488+{
2489+}
2490Index: simd/jsimd_arm64_neon.S
2491new file mode 100644
2492===================================================================
2493--- /dev/null
2494+++ simd/jsimd_arm64_neon.S
2495@@ -0,0 +1,1861 @@
2496+/*
2497+ * ARMv8 NEON optimizations for libjpeg-turbo
2498+ *
2499+ * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
2500+ * All rights reserved.
2501+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
2502+ * Copyright (C) 2013-2014, Linaro Limited
2503+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
2504+ *
2505+ * This software is provided 'as-is', without any express or implied
2506+ * warranty. In no event will the authors be held liable for any damages
2507+ * arising from the use of this software.
2508+ *
2509+ * Permission is granted to anyone to use this software for any purpose,
2510+ * including commercial applications, and to alter it and redistribute it
2511+ * freely, subject to the following restrictions:
2512+ *
2513+ * 1. The origin of this software must not be misrepresented; you must not
2514+ * claim that you wrote the original software. If you use this software
2515+ * in a product, an acknowledgment in the product documentation would be
2516+ * appreciated but is not required.
2517+ * 2. Altered source versions must be plainly marked as such, and must not be
2518+ * misrepresented as being the original software.
2519+ * 3. This notice may not be removed or altered from any source distribution.
2520+ */
2521+
2522+#if defined(__linux__) && defined(__ELF__)
2523+.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
2524+#endif
2525+
2526+.text
2527+.arch armv8-a+fp+simd
2528+
2529+
2530+#define RESPECT_STRICT_ALIGNMENT 1
2531+
2532+
2533+/*****************************************************************************/
2534+
2535+/* Supplementary macro for setting function attributes */
2536+.macro asm_function fname
2537+#ifdef __APPLE__
2538+ .globl _\fname
2539+_\fname:
2540+#else
2541+ .global \fname
2542+#ifdef __ELF__
2543+ .hidden \fname
2544+ .type \fname, %function
2545+#endif
2546+\fname:
2547+#endif
2548+.endm
2549+
2550+/* Transpose elements of single 128 bit registers */
2551+.macro transpose_single x0,x1,xi,xilen,literal
2552+ ins \xi\xilen[0], \x0\xilen[0]
2553+ ins \x1\xilen[0], \x0\xilen[1]
2554+ trn1 \x0\literal, \x0\literal, \x1\literal
2555+ trn2 \x1\literal, \xi\literal, \x1\literal
2556+.endm
2557+
2558+/* Transpose elements of 2 different registers */
2559+.macro transpose x0,x1,xi,xilen,literal
2560+ mov \xi\xilen, \x0\xilen
2561+ trn1 \x0\literal, \x0\literal, \x1\literal
2562+ trn2 \x1\literal, \xi\literal, \x1\literal
2563+.endm
2564+
2565+/* Transpose a block of 4x4 coefficients in four 64-bit registers */
2566+.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
2567+ mov \xi\xilen, \x0\xilen
2568+ trn1 \x0\x0len, \x0\x0len, \x2\x2len
2569+ trn2 \x2\x2len, \xi\x0len, \x2\x2len
2570+ mov \xi\xilen, \x1\xilen
2571+ trn1 \x1\x1len, \x1\x1len, \x3\x3len
2572+ trn2 \x3\x3len, \xi\x1len, \x3\x3len
2573+.endm
2574+
2575+.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
2576+ mov \xi\xilen, \x0\xilen
2577+ trn1 \x0\x0len, \x0\x0len, \x1\x1len
2578+ trn2 \x1\x2len, \xi\x0len, \x1\x2len
2579+ mov \xi\xilen, \x2\xilen
2580+ trn1 \x2\x2len, \x2\x2len, \x3\x3len
2581+ trn2 \x3\x2len, \xi\x1len, \x3\x3len
2582+.endm
2583+
2584+.macro transpose_4x4 x0, x1, x2, x3,x5
2585+ transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
2586+ transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
2587+.endm
2588+
2589+
2590+#define CENTERJSAMPLE 128
2591+
2592+/*****************************************************************************/
2593+
2594+/*
2595+ * Perform dequantization and inverse DCT on one block of coefficients.
2596+ *
2597+ * GLOBAL(void)
2598+ * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
2599+ * JSAMPARRAY output_buf, JDIMENSION output_col)
2600+ */
2601+
2602+#define FIX_0_298631336 (2446)
2603+#define FIX_0_390180644 (3196)
2604+#define FIX_0_541196100 (4433)
2605+#define FIX_0_765366865 (6270)
2606+#define FIX_0_899976223 (7373)
2607+#define FIX_1_175875602 (9633)
2608+#define FIX_1_501321110 (12299)
2609+#define FIX_1_847759065 (15137)
2610+#define FIX_1_961570560 (16069)
2611+#define FIX_2_053119869 (16819)
2612+#define FIX_2_562915447 (20995)
2613+#define FIX_3_072711026 (25172)
2614+
2615+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
2616+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
2617+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
2618+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
2619+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
2620+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
2621+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
2622+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
2623+
2624+/*
2625+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
2626+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
2627+ */
2628+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
2629+{ \
2630+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
2631+ INT32 q1, q2, q3, q4, q5, q6, q7; \
2632+ INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
2633+ \
2634+ /* 1-D iDCT input data */ \
2635+ row0 = xrow0; \
2636+ row1 = xrow1; \
2637+ row2 = xrow2; \
2638+ row3 = xrow3; \
2639+ row4 = xrow4; \
2640+ row5 = xrow5; \
2641+ row6 = xrow6; \
2642+ row7 = xrow7; \
2643+ \
2644+ q5 = row7 + row3; \
2645+ q4 = row5 + row1; \
2646+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
2647+ MULTIPLY(q4, FIX_1_175875602); \
2648+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
2649+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
2650+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
2651+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
2652+ q4 = q6; \
2653+ q3 = ((INT32) row0 - (INT32) row4) << 13; \
2654+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
2655+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
2656+ /* now we can use q1 (reloadable constants have been used up) */ \
2657+ q1 = q3 + q2; \
2658+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
2659+ MULTIPLY(row1, -FIX_0_899976223); \
2660+ q5 = q7; \
2661+ q1 = q1 + q6; \
2662+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
2663+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
2664+ \
2665+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
2666+ tmp11_plus_tmp2 = q1; \
2667+ row1 = 0; \
2668+ \
2669+ q1 = q1 - q6; \
2670+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
2671+ MULTIPLY(row3, -FIX_2_562915447); \
2672+ q1 = q1 - q6; \
2673+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
2674+ MULTIPLY(row6, FIX_0_541196100); \
2675+ q3 = q3 - q2; \
2676+ \
2677+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
2678+ tmp11_minus_tmp2 = q1; \
2679+ \
2680+ q1 = ((INT32) row0 + (INT32) row4) << 13; \
2681+ q2 = q1 + q6; \
2682+ q1 = q1 - q6; \
2683+ \
2684+ /* pick up the results */ \
2685+ tmp0 = q4; \
2686+ tmp1 = q5; \
2687+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
2688+ tmp3 = q7; \
2689+ tmp10 = q2; \
2690+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
2691+ tmp12 = q3; \
2692+ tmp13 = q1; \
2693+}
2694+
2695+#define XFIX_0_899976223 v0.4h[0]
2696+#define XFIX_0_541196100 v0.4h[1]
2697+#define XFIX_2_562915447 v0.4h[2]
2698+#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
2699+#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
2700+#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
2701+#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
2702+#define XFIX_1_175875602 v1.4h[3]
2703+#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
2704+#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
2705+#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
2706+#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
2707+
2708+.balign 16
2709+jsimd_idct_islow_neon_consts:
2710+ .short FIX_0_899976223 /* d0[0] */
2711+ .short FIX_0_541196100 /* d0[1] */
2712+ .short FIX_2_562915447 /* d0[2] */
2713+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
2714+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
2715+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
2716+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
2717+ .short FIX_1_175875602 /* d1[3] */
2718+ /* reloadable constants */
2719+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
2720+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
2721+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
2722+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
2723+
2724+asm_function jsimd_idct_islow_neon
2725+
2726+ DCT_TABLE .req x0
2727+ COEF_BLOCK .req x1
2728+ OUTPUT_BUF .req x2
2729+ OUTPUT_COL .req x3
2730+ TMP1 .req x0
2731+ TMP2 .req x1
2732+ TMP3 .req x2
2733+ TMP4 .req x15
2734+
2735+ ROW0L .req v16
2736+ ROW0R .req v17
2737+ ROW1L .req v18
2738+ ROW1R .req v19
2739+ ROW2L .req v20
2740+ ROW2R .req v21
2741+ ROW3L .req v22
2742+ ROW3R .req v23
2743+ ROW4L .req v24
2744+ ROW4R .req v25
2745+ ROW5L .req v26
2746+ ROW5R .req v27
2747+ ROW6L .req v28
2748+ ROW6R .req v29
2749+ ROW7L .req v30
2750+ ROW7R .req v31
2751+ /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
2752+ sub sp, sp, 272
2753+ str x15, [sp], 16
2754+ adr x15, jsimd_idct_islow_neon_consts
2755+ st1 {v0.8b - v3.8b}, [sp], 32
2756+ st1 {v4.8b - v7.8b}, [sp], 32
2757+ st1 {v8.8b - v11.8b}, [sp], 32
2758+ st1 {v12.8b - v15.8b}, [sp], 32
2759+ st1 {v16.8b - v19.8b}, [sp], 32
2760+ st1 {v20.8b - v23.8b}, [sp], 32
2761+ st1 {v24.8b - v27.8b}, [sp], 32
2762+ st1 {v28.8b - v31.8b}, [sp], 32
2763+ ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
2764+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2765+ ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
2766+ mul v16.4h, v16.4h, v0.4h
2767+ mul v17.4h, v17.4h, v1.4h
2768+ ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
2769+ ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2770+ mul v18.4h, v18.4h, v2.4h
2771+ mul v19.4h, v19.4h, v3.4h
2772+ ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
2773+ ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
2774+ mul v20.4h, v20.4h, v4.4h
2775+ mul v21.4h, v21.4h, v5.4h
2776+ ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
2777+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2778+ mul v22.4h, v22.4h, v6.4h
2779+ mul v23.4h, v23.4h, v7.4h
2780+ ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
2781+ ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
2782+ mul v24.4h, v24.4h, v0.4h
2783+ mul v25.4h, v25.4h, v1.4h
2784+ ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
2785+ ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2786+ mul v28.4h, v28.4h, v4.4h
2787+ mul v29.4h, v29.4h, v5.4h
2788+ ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
2789+ mul v26.4h, v26.4h, v2.4h
2790+ mul v27.4h, v27.4h, v3.4h
2791+ ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
2792+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
2793+ add x15, x15, #16
2794+ mul v30.4h, v30.4h, v6.4h
2795+ mul v31.4h, v31.4h, v7.4h
2796+ ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
2797+ /* Go to the bottom of the stack */
2798+ sub sp, sp, 352
2799+ stp x4, x5, [sp], 16
2800+ st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
2801+ st1 {v12.4h - v15.4h}, [sp], 32
2802+ /* 1-D IDCT, pass 1, left 4x8 half */
2803+ add v4.4h, ROW7L.4h, ROW3L.4h
2804+ add v5.4h, ROW5L.4h, ROW1L.4h
2805+ smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
2806+ smlal v12.4s, v5.4h, XFIX_1_175875602
2807+ smull v14.4s, v4.4h, XFIX_1_175875602
2808+ /* Check for the zero coefficients in the right 4x8 half */
2809+ smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
2810+ ssubl v6.4s, ROW0L.4h, ROW4L.4h
2811+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
2812+ smull v4.4s, ROW2L.4h, XFIX_0_541196100
2813+ smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
2814+ orr x0, x4, x5
2815+ mov v8.16b, v12.16b
2816+ smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
2817+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
2818+ smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2819+ shl v6.4s, v6.4s, #13
2820+ orr x0, x0, x4
2821+ smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
2822+ orr x0, x0 , x5
2823+ add v2.4s, v6.4s, v4.4s
2824+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
2825+ mov v10.16b, v14.16b
2826+ add v2.4s, v2.4s, v12.4s
2827+ orr x0, x0, x4
2828+ smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
2829+ orr x0, x0, x5
2830+ smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2831+ rshrn ROW1L.4h, v2.4s, #11
2832+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
2833+ sub v2.4s, v2.4s, v12.4s
2834+ smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
2835+ orr x0, x0, x4
2836+ smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
2837+ orr x0, x0, x5
2838+ sub v2.4s, v2.4s, v12.4s
2839+ smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2840+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
2841+ smlal v12.4s, ROW6L.4h, XFIX_0_541196100
2842+ sub v6.4s, v6.4s, v4.4s
2843+ orr x0, x0, x4
2844+ rshrn ROW6L.4h, v2.4s, #11
2845+ orr x0, x0, x5
2846+ add v2.4s, v6.4s, v10.4s
2847+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
2848+ sub v6.4s, v6.4s, v10.4s
2849+ saddl v10.4s, ROW0L.4h, ROW4L.4h
2850+ orr x0, x0, x4
2851+ rshrn ROW2L.4h, v2.4s, #11
2852+ orr x0, x0, x5
2853+ rshrn ROW5L.4h, v6.4s, #11
2854+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
2855+ shl v10.4s, v10.4s, #13
2856+ smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
2857+ orr x0, x0, x4
2858+ add v4.4s, v10.4s, v12.4s
2859+ orr x0, x0, x5
2860+ cmp x0, #0 /* orrs instruction removed */
2861+ sub v2.4s, v10.4s, v12.4s
2862+ add v12.4s, v4.4s, v14.4s
2863+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
2864+ sub v4.4s, v4.4s, v14.4s
2865+ add v10.4s, v2.4s, v8.4s
2866+ orr x0, x4, x5
2867+ sub v6.4s, v2.4s, v8.4s
2868+ /* pop {x4, x5} */
2869+ sub sp, sp, 80
2870+ ldp x4, x5, [sp], 16
2871+ rshrn ROW7L.4h, v4.4s, #11
2872+ rshrn ROW3L.4h, v10.4s, #11
2873+ rshrn ROW0L.4h, v12.4s, #11
2874+ rshrn ROW4L.4h, v6.4s, #11
2875+
2876+ beq 3f /* Go to do some special handling for the sparse right 4x8 half */
2877+
2878+ /* 1-D IDCT, pass 1, right 4x8 half */
2879+ ld1 {v2.4h}, [x15] /* reload constants */
2880+ add v10.4h, ROW7R.4h, ROW3R.4h
2881+ add v8.4h, ROW5R.4h, ROW1R.4h
2882+ /* Transpose ROW6L <-> ROW7L (v3 available free register) */
2883+ transpose ROW6L, ROW7L, v3, .16b, .4h
2884+ smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
2885+ smlal v12.4s, v8.4h, XFIX_1_175875602
2886+ /* Transpose ROW2L <-> ROW3L (v3 available free register) */
2887+ transpose ROW2L, ROW3L, v3, .16b, .4h
2888+ smull v14.4s, v10.4h, XFIX_1_175875602
2889+ smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
2890+ /* Transpose ROW0L <-> ROW1L (v3 available free register) */
2891+ transpose ROW0L, ROW1L, v3, .16b, .4h
2892+ ssubl v6.4s, ROW0R.4h, ROW4R.4h
2893+ smull v4.4s, ROW2R.4h, XFIX_0_541196100
2894+ smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
2895+ /* Transpose ROW4L <-> ROW5L (v3 available free register) */
2896+ transpose ROW4L, ROW5L, v3, .16b, .4h
2897+ mov v8.16b, v12.16b
2898+ smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
2899+ smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
2900+ /* Transpose ROW1L <-> ROW3L (v3 available free register) */
2901+ transpose ROW1L, ROW3L, v3, .16b, .2s
2902+ shl v6.4s, v6.4s, #13
2903+ smlsl v8.4s, ROW1R.4h, XFIX_0_899976223
2904+ /* Transpose ROW4L <-> ROW6L (v3 available free register) */
2905+ transpose ROW4L, ROW6L, v3, .16b, .2s
2906+ add v2.4s, v6.4s, v4.4s
2907+ mov v10.16b, v14.16b
2908+ add v2.4s, v2.4s, v12.4s
2909+ /* Transpose ROW0L <-> ROW2L (v3 available free register) */
2910+ transpose ROW0L, ROW2L, v3, .16b, .2s
2911+ smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
2912+ smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
2913+ rshrn ROW1R.4h, v2.4s, #11
2914+ /* Transpose ROW5L <-> ROW7L (v3 available free register) */
2915+ transpose ROW5L, ROW7L, v3, .16b, .2s
2916+ sub v2.4s, v2.4s, v12.4s
2917+ smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
2918+ smlsl v10.4s, ROW3R.4h, XFIX_2_562915447
2919+ sub v2.4s, v2.4s, v12.4s
2920+ smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
2921+ smlal v12.4s, ROW6R.4h, XFIX_0_541196100
2922+ sub v6.4s, v6.4s, v4.4s
2923+ rshrn ROW6R.4h, v2.4s, #11
2924+ add v2.4s, v6.4s, v10.4s
2925+ sub v6.4s, v6.4s, v10.4s
2926+ saddl v10.4s, ROW0R.4h, ROW4R.4h
2927+ rshrn ROW2R.4h, v2.4s, #11
2928+ rshrn ROW5R.4h, v6.4s, #11
2929+ shl v10.4s, v10.4s, #13
2930+ smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
2931+ add v4.4s, v10.4s, v12.4s
2932+ sub v2.4s, v10.4s, v12.4s
2933+ add v12.4s, v4.4s, v14.4s
2934+ sub v4.4s, v4.4s, v14.4s
2935+ add v10.4s, v2.4s, v8.4s
2936+ sub v6.4s, v2.4s, v8.4s
2937+ rshrn ROW7R.4h, v4.4s, #11
2938+ rshrn ROW3R.4h, v10.4s, #11
2939+ rshrn ROW0R.4h, v12.4s, #11
2940+ rshrn ROW4R.4h, v6.4s, #11
2941+ /* Transpose right 4x8 half */
2942+ transpose ROW6R, ROW7R, v3, .16b, .4h
2943+ transpose ROW2R, ROW3R, v3, .16b, .4h
2944+ transpose ROW0R, ROW1R, v3, .16b, .4h
2945+ transpose ROW4R, ROW5R, v3, .16b, .4h
2946+ transpose ROW1R, ROW3R, v3, .16b, .2s
2947+ transpose ROW4R, ROW6R, v3, .16b, .2s
2948+ transpose ROW0R, ROW2R, v3, .16b, .2s
2949+ transpose ROW5R, ROW7R, v3, .16b, .2s
2950+
2951+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
2952+ ld1 {v2.4h}, [x15] /* reload constants */
2953+ smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
2954+ smlal v12.4s, ROW1L.4h, XFIX_1_175875602
2955+ smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
2956+ smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
2957+ smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
2958+ smlal v14.4s, ROW3L.4h, XFIX_1_175875602
2959+ smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
2960+ smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
2961+ ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
2962+ smull v4.4s, ROW2L.4h, XFIX_0_541196100
2963+ smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
2964+ mov v8.16b, v12.16b
2965+ smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
2966+ smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2967+ shl v6.4s, v6.4s, #13
2968+ smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
2969+ add v2.4s, v6.4s, v4.4s
2970+ mov v10.16b, v14.16b
2971+ add v2.4s, v2.4s, v12.4s
2972+ smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
2973+ smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2974+ shrn ROW1L.4h, v2.4s, #16
2975+ sub v2.4s, v2.4s, v12.4s
2976+ smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
2977+ smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
2978+ sub v2.4s, v2.4s, v12.4s
2979+ smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2980+ smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
2981+ sub v6.4s, v6.4s, v4.4s
2982+ shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
2983+ add v2.4s, v6.4s, v10.4s
2984+ sub v6.4s, v6.4s, v10.4s
2985+ saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
2986+ shrn ROW2L.4h, v2.4s, #16
2987+ shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
2988+ shl v10.4s, v10.4s, #13
2989+ smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
2990+ add v4.4s, v10.4s, v12.4s
2991+ sub v2.4s, v10.4s, v12.4s
2992+ add v12.4s, v4.4s, v14.4s
2993+ sub v4.4s, v4.4s, v14.4s
2994+ add v10.4s, v2.4s, v8.4s
2995+ sub v6.4s, v2.4s, v8.4s
2996+ shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
2997+ shrn ROW3L.4h, v10.4s, #16
2998+ shrn ROW0L.4h, v12.4s, #16
2999+ shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3000+ /* 1-D IDCT, pass 2, right 4x8 half */
3001+ ld1 {v2.4h}, [x15] /* reload constants */
3002+ smull v12.4s, ROW5R.4h, XFIX_1_175875602
3003+ smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
3004+ smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
3005+ smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
3006+ smull v14.4s, ROW7R.4h, XFIX_1_175875602
3007+ smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
3008+ smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
3009+ smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
3010+ ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3011+ smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
3012+ smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
3013+ mov v8.16b, v12.16b
3014+ smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
3015+ smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
3016+ shl v6.4s, v6.4s, #13
3017+ smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
3018+ add v2.4s, v6.4s, v4.4s
3019+ mov v10.16b, v14.16b
3020+ add v2.4s, v2.4s, v12.4s
3021+ smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
3022+ smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
3023+ shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3024+ sub v2.4s, v2.4s, v12.4s
3025+ smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
3026+ smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
3027+ sub v2.4s, v2.4s, v12.4s
3028+ smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
3029+ smlal v12.4s, ROW6R.4h, XFIX_0_541196100
3030+ sub v6.4s, v6.4s, v4.4s
3031+ shrn ROW6R.4h, v2.4s, #16
3032+ add v2.4s, v6.4s, v10.4s
3033+ sub v6.4s, v6.4s, v10.4s
3034+ saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3035+ shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3036+ shrn ROW5R.4h, v6.4s, #16
3037+ shl v10.4s, v10.4s, #13
3038+ smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
3039+ add v4.4s, v10.4s, v12.4s
3040+ sub v2.4s, v10.4s, v12.4s
3041+ add v12.4s, v4.4s, v14.4s
3042+ sub v4.4s, v4.4s, v14.4s
3043+ add v10.4s, v2.4s, v8.4s
3044+ sub v6.4s, v2.4s, v8.4s
3045+ shrn ROW7R.4h, v4.4s, #16
3046+ shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3047+ shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3048+ shrn ROW4R.4h, v6.4s, #16
3049+
3050+2: /* Descale to 8-bit and range limit */
3051+ ins v16.2d[1], v17.2d[0]
3052+ ins v18.2d[1], v19.2d[0]
3053+ ins v20.2d[1], v21.2d[0]
3054+ ins v22.2d[1], v23.2d[0]
3055+ sqrshrn v16.8b, v16.8h, #2
3056+ sqrshrn2 v16.16b, v18.8h, #2
3057+ sqrshrn v18.8b, v20.8h, #2
3058+ sqrshrn2 v18.16b, v22.8h, #2
3059+
3060+ /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
3061+ ld1 {v8.4h - v11.4h}, [sp], 32
3062+ ld1 {v12.4h - v15.4h}, [sp], 32
3063+ ins v24.2d[1], v25.2d[0]
3064+
3065+ sqrshrn v20.8b, v24.8h, #2
3066+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
3067+ /* trn1 v16.8h, v16.8h, v18.8h */
3068+ transpose v16, v18, v3, .16b, .8h
3069+ ins v26.2d[1], v27.2d[0]
3070+ ins v28.2d[1], v29.2d[0]
3071+ ins v30.2d[1], v31.2d[0]
3072+ sqrshrn2 v20.16b, v26.8h, #2
3073+ sqrshrn v22.8b, v28.8h, #2
3074+ movi v0.16b, #(CENTERJSAMPLE)
3075+ sqrshrn2 v22.16b, v30.8h, #2
3076+ transpose_single v16, v17, v3, .2d, .8b
3077+ transpose_single v18, v19, v3, .2d, .8b
3078+ add v16.8b, v16.8b, v0.8b
3079+ add v17.8b, v17.8b, v0.8b
3080+ add v18.8b, v18.8b, v0.8b
3081+ add v19.8b, v19.8b, v0.8b
3082+ transpose v20, v22, v3, .16b, .8h
3083+ /* Store results to the output buffer */
3084+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3085+ add TMP1, TMP1, OUTPUT_COL
3086+ add TMP2, TMP2, OUTPUT_COL
3087+ st1 {v16.8b}, [TMP1]
3088+ transpose_single v20, v21, v3, .2d, .8b
3089+ st1 {v17.8b}, [TMP2]
3090+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3091+ add TMP1, TMP1, OUTPUT_COL
3092+ add TMP2, TMP2, OUTPUT_COL
3093+ st1 {v18.8b}, [TMP1]
3094+ add v20.8b, v20.8b, v0.8b
3095+ add v21.8b, v21.8b, v0.8b
3096+ st1 {v19.8b}, [TMP2]
3097+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3098+ ldp TMP3, TMP4, [OUTPUT_BUF]
3099+ add TMP1, TMP1, OUTPUT_COL
3100+ add TMP2, TMP2, OUTPUT_COL
3101+ add TMP3, TMP3, OUTPUT_COL
3102+ add TMP4, TMP4, OUTPUT_COL
3103+ transpose_single v22, v23, v3, .2d, .8b
3104+ st1 {v20.8b}, [TMP1]
3105+ add v22.8b, v22.8b, v0.8b
3106+ add v23.8b, v23.8b, v0.8b
3107+ st1 {v21.8b}, [TMP2]
3108+ st1 {v22.8b}, [TMP3]
3109+ st1 {v23.8b}, [TMP4]
3110+ ldr x15, [sp], 16
3111+ ld1 {v0.8b - v3.8b}, [sp], 32
3112+ ld1 {v4.8b - v7.8b}, [sp], 32
3113+ ld1 {v8.8b - v11.8b}, [sp], 32
3114+ ld1 {v12.8b - v15.8b}, [sp], 32
3115+ ld1 {v16.8b - v19.8b}, [sp], 32
3116+ ld1 {v20.8b - v23.8b}, [sp], 32
3117+ ld1 {v24.8b - v27.8b}, [sp], 32
3118+ ld1 {v28.8b - v31.8b}, [sp], 32
3119+ blr x30
3120+
3121+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
3122+
3123+ /* Transpose left 4x8 half */
3124+ transpose ROW6L, ROW7L, v3, .16b, .4h
3125+ transpose ROW2L, ROW3L, v3, .16b, .4h
3126+ transpose ROW0L, ROW1L, v3, .16b, .4h
3127+ transpose ROW4L, ROW5L, v3, .16b, .4h
3128+ shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
3129+ transpose ROW1L, ROW3L, v3, .16b, .2s
3130+ transpose ROW4L, ROW6L, v3, .16b, .2s
3131+ transpose ROW0L, ROW2L, v3, .16b, .2s
3132+ transpose ROW5L, ROW7L, v3, .16b, .2s
3133+ cmp x0, #0
3134+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
3135+
3136+ /* Only row 0 is non-zero for the right 4x8 half */
3137+ dup ROW1R.4h, ROW0R.4h[1]
3138+ dup ROW2R.4h, ROW0R.4h[2]
3139+ dup ROW3R.4h, ROW0R.4h[3]
3140+ dup ROW4R.4h, ROW0R.4h[0]
3141+ dup ROW5R.4h, ROW0R.4h[1]
3142+ dup ROW6R.4h, ROW0R.4h[2]
3143+ dup ROW7R.4h, ROW0R.4h[3]
3144+ dup ROW0R.4h, ROW0R.4h[0]
3145+ b 1b /* Go to 'normal' second pass */
3146+
3147+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
3148+ ld1 {v2.4h}, [x15] /* reload constants */
3149+ smull v12.4s, ROW1L.4h, XFIX_1_175875602
3150+ smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
3151+ smull v14.4s, ROW3L.4h, XFIX_1_175875602
3152+ smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
3153+ smull v4.4s, ROW2L.4h, XFIX_0_541196100
3154+ sshll v6.4s, ROW0L.4h, #13
3155+ mov v8.16b, v12.16b
3156+ smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
3157+ smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
3158+ add v2.4s, v6.4s, v4.4s
3159+ mov v10.16b, v14.16b
3160+ smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
3161+ add v2.4s, v2.4s, v12.4s
3162+ add v12.4s, v12.4s, v12.4s
3163+ smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
3164+ shrn ROW1L.4h, v2.4s, #16
3165+ sub v2.4s, v2.4s, v12.4s
3166+ smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
3167+ sub v6.4s, v6.4s, v4.4s
3168+ shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3169+ add v2.4s, v6.4s, v10.4s
3170+ sub v6.4s, v6.4s, v10.4s
3171+ sshll v10.4s, ROW0L.4h, #13
3172+ shrn ROW2L.4h, v2.4s, #16
3173+ shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3174+ add v4.4s, v10.4s, v12.4s
3175+ sub v2.4s, v10.4s, v12.4s
3176+ add v12.4s, v4.4s, v14.4s
3177+ sub v4.4s, v4.4s, v14.4s
3178+ add v10.4s, v2.4s, v8.4s
3179+ sub v6.4s, v2.4s, v8.4s
3180+ shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3181+ shrn ROW3L.4h, v10.4s, #16
3182+ shrn ROW0L.4h, v12.4s, #16
3183+ shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3184+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
3185+ ld1 {v2.4h}, [x15] /* reload constants */
3186+ smull v12.4s, ROW5L.4h, XFIX_1_175875602
3187+ smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
3188+ smull v14.4s, ROW7L.4h, XFIX_1_175875602
3189+ smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
3190+ smull v4.4s, ROW6L.4h, XFIX_0_541196100
3191+ sshll v6.4s, ROW4L.4h, #13
3192+ mov v8.16b, v12.16b
3193+ smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
3194+ smlsl v8.4s, ROW5L.4h, XFIX_0_899976223
3195+ add v2.4s, v6.4s, v4.4s
3196+ mov v10.16b, v14.16b
3197+ smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
3198+ add v2.4s, v2.4s, v12.4s
3199+ add v12.4s, v12.4s, v12.4s
3200+ smlsl v10.4s, ROW7L.4h, XFIX_2_562915447
3201+ shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3202+ sub v2.4s, v2.4s, v12.4s
3203+ smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
3204+ sub v6.4s, v6.4s, v4.4s
3205+ shrn ROW6R.4h, v2.4s, #16
3206+ add v2.4s, v6.4s, v10.4s
3207+ sub v6.4s, v6.4s, v10.4s
3208+ sshll v10.4s, ROW4L.4h, #13
3209+ shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3210+ shrn ROW5R.4h, v6.4s, #16
3211+ add v4.4s, v10.4s, v12.4s
3212+ sub v2.4s, v10.4s, v12.4s
3213+ add v12.4s, v4.4s, v14.4s
3214+ sub v4.4s, v4.4s, v14.4s
3215+ add v10.4s, v2.4s, v8.4s
3216+ sub v6.4s, v2.4s, v8.4s
3217+ shrn ROW7R.4h, v4.4s, #16
3218+ shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3219+ shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3220+ shrn ROW4R.4h, v6.4s, #16
3221+ b 2b /* Go to epilogue */
3222+
3223+ .unreq DCT_TABLE
3224+ .unreq COEF_BLOCK
3225+ .unreq OUTPUT_BUF
3226+ .unreq OUTPUT_COL
3227+ .unreq TMP1
3228+ .unreq TMP2
3229+ .unreq TMP3
3230+ .unreq TMP4
3231+
3232+ .unreq ROW0L
3233+ .unreq ROW0R
3234+ .unreq ROW1L
3235+ .unreq ROW1R
3236+ .unreq ROW2L
3237+ .unreq ROW2R
3238+ .unreq ROW3L
3239+ .unreq ROW3R
3240+ .unreq ROW4L
3241+ .unreq ROW4R
3242+ .unreq ROW5L
3243+ .unreq ROW5R
3244+ .unreq ROW6L
3245+ .unreq ROW6R
3246+ .unreq ROW7L
3247+ .unreq ROW7R
3248+
3249+
3250+/*****************************************************************************/
3251+
3252+/*
3253+ * jsimd_idct_ifast_neon
3254+ *
3255+ * Fast, not so accurate integer implementation of the inverse DCT (Discrete
3256+ * Cosine Transform) for one 8x8 block. It uses the same calculations and
3257+ * produces exactly the same output as IJG's 'jpeg_idct_ifast' (jidctfst.c).
3258+ * In: x0 = DCT_TABLE (multipliers), x1 = COEF_BLOCK (64 x 16-bit), x2 = OUTPUT_BUF
3259+ * (8 row pointers), x3 = OUTPUT_COL (offset added to every row pointer).
3260+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
3261+ * But on NEON some extra additions are required because the SQDMULH (VQDMULH
3262+ * on ARMv7) instruction can't handle constants larger than 1. So expressions
3263+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
3264+ * which introduces an extra addition. Overall, there are 6 extra additions
3265+ * per 1-D IDCT pass, totalling 5 SQDMULH and 35 ADD/SUB instructions.
3266+ */
3267+
3268+#define XFIX_1_082392200 v0.4h[0]
3269+#define XFIX_1_414213562 v0.4h[1]
3270+#define XFIX_1_847759065 v0.4h[2]
3271+#define XFIX_2_613125930 v0.4h[3]
3272+
3273+.balign 16
3274+jsimd_idct_ifast_neon_consts:
3275+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: (277/256 - 1) in Q15 */
3276+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: (362/256 - 1) in Q15 */
3277+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: (473/256 - 1) in Q15 */
3278+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: (669/256 - 2) in Q15 */
3279+
3280+asm_function jsimd_idct_ifast_neon
3281+
3282+ DCT_TABLE .req x0
3283+ COEF_BLOCK .req x1
3284+ OUTPUT_BUF .req x2
3285+ OUTPUT_COL .req x3
3286+ TMP1 .req x0 /* shares x0 with DCT_TABLE; only used after the table is consumed */
3287+ TMP2 .req x1 /* shares x1 with COEF_BLOCK; only used after the block is loaded */
3288+ TMP3 .req x2
3289+ TMP4 .req x22
3290+ TMP5 .req x23
3291+
3292+ /* Load and dequantize coefficients into NEON registers
3293+ * with the following allocation:
3294+ * 0 1 2 3 | 4 5 6 7
3295+ * ---------+--------
3296+ * 0 | d16 | d17 ( v8.8h )
3297+ * 1 | d18 | d19 ( v9.8h )
3298+ * 2 | d20 | d21 ( v10.8h )
3299+ * 3 | d22 | d23 ( v11.8h )
3300+ * 4 | d24 | d25 ( v12.8h )
3301+ * 5 | d26 | d27 ( v13.8h )
3302+ * 6 | d28 | d29 ( v14.8h )
3303+ * 7 | d30 | d31 ( v15.8h )
3304+ */
3305+ /* Build a 176-byte frame: x22/x23 at [sp], low halves of v0-v19 above them */
3306+ stp x22, x23, [sp, #-176]! /* sp stays lowered so the save area is never below sp */
3307+ add x22, sp, #16 /* x22 = cursor into the NEON save area (x22 is already saved) */
3308+ adr x23, jsimd_idct_ifast_neon_consts
3309+ st1 {v0.8b - v3.8b}, [x22], 32
3310+ st1 {v4.8b - v7.8b}, [x22], 32
3311+ st1 {v8.8b - v11.8b}, [x22], 32
3312+ st1 {v12.8b - v15.8b}, [x22], 32
3313+ st1 {v16.8b - v19.8b}, [x22], 32
3314+ ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
3315+ ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
3316+ ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
3317+ mul v8.8h, v8.8h, v0.8h
3318+ ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
3319+ mul v9.8h, v9.8h, v1.8h
3320+ ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
3321+ mul v10.8h, v10.8h, v2.8h
3322+ ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
3323+ mul v11.8h, v11.8h, v3.8h
3324+ ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
3325+ mul v12.8h, v12.8h, v0.8h
3326+ ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
3327+ mul v14.8h, v14.8h, v2.8h
3328+ mul v13.8h, v13.8h, v1.8h
3329+ ld1 {v0.4h}, [x23] /* load constants */
3330+ mul v15.8h, v15.8h, v3.8h
3331+
3332+ /* 1-D IDCT, pass 1 */
3333+ sub v2.8h, v10.8h, v14.8h
3334+ add v14.8h, v10.8h, v14.8h
3335+ sub v1.8h, v11.8h, v13.8h
3336+ add v13.8h, v11.8h, v13.8h
3337+ sub v5.8h, v9.8h, v15.8h
3338+ add v15.8h, v9.8h, v15.8h
3339+ sqdmulh v4.8h, v2.8h, XFIX_1_414213562 /* fractional part only: x * 0.414... */
3340+ sqdmulh v6.8h, v1.8h, XFIX_2_613125930 /* fractional part only: x * 0.613... */
3341+ add v3.8h, v1.8h, v1.8h /* 2 * x, the integer part of x * 2.613... */
3342+ sub v1.8h, v5.8h, v1.8h
3343+ add v10.8h, v2.8h, v4.8h /* x + x * 0.414... = x * 1.414... */
3344+ sqdmulh v4.8h, v1.8h, XFIX_1_847759065
3345+ sub v2.8h, v15.8h, v13.8h
3346+ add v3.8h, v3.8h, v6.8h
3347+ sqdmulh v6.8h, v2.8h, XFIX_1_414213562
3348+ add v1.8h, v1.8h, v4.8h
3349+ sqdmulh v4.8h, v5.8h, XFIX_1_082392200
3350+ sub v10.8h, v10.8h, v14.8h
3351+ add v2.8h, v2.8h, v6.8h
3352+ sub v6.8h, v8.8h, v12.8h
3353+ add v12.8h, v8.8h, v12.8h
3354+ add v9.8h, v5.8h, v4.8h
3355+ add v5.8h, v6.8h, v10.8h
3356+ sub v10.8h, v6.8h, v10.8h
3357+ add v6.8h, v15.8h, v13.8h
3358+ add v8.8h, v12.8h, v14.8h
3359+ sub v3.8h, v6.8h, v3.8h
3360+ sub v12.8h, v12.8h, v14.8h
3361+ sub v3.8h, v3.8h, v1.8h
3362+ sub v1.8h, v9.8h, v1.8h
3363+ add v2.8h, v3.8h, v2.8h
3364+ sub v15.8h, v8.8h, v6.8h
3365+ add v1.8h, v1.8h, v2.8h
3366+ add v8.8h, v8.8h, v6.8h
3367+ add v14.8h, v5.8h, v3.8h
3368+ sub v9.8h, v5.8h, v3.8h
3369+ sub v13.8h, v10.8h, v2.8h
3370+ add v10.8h, v10.8h, v2.8h
3371+ /* Transpose q8-q9 */
3372+ mov v18.16b, v8.16b
3373+ trn1 v8.8h, v8.8h, v9.8h
3374+ trn2 v9.8h, v18.8h, v9.8h
3375+ sub v11.8h, v12.8h, v1.8h
3376+ /* Transpose q14-q15 */
3377+ mov v18.16b, v14.16b
3378+ trn1 v14.8h, v14.8h, v15.8h
3379+ trn2 v15.8h, v18.8h, v15.8h
3380+ add v12.8h, v12.8h, v1.8h
3381+ /* Transpose q10-q11 */
3382+ mov v18.16b, v10.16b
3383+ trn1 v10.8h, v10.8h, v11.8h
3384+ trn2 v11.8h, v18.8h, v11.8h
3385+ /* Transpose q12-q13 */
3386+ mov v18.16b, v12.16b
3387+ trn1 v12.8h, v12.8h, v13.8h
3388+ trn2 v13.8h, v18.8h, v13.8h
3389+ /* Transpose q9-q11 */
3390+ mov v18.16b, v9.16b
3391+ trn1 v9.4s, v9.4s, v11.4s
3392+ trn2 v11.4s, v18.4s, v11.4s
3393+ /* Transpose q12-q14 */
3394+ mov v18.16b, v12.16b
3395+ trn1 v12.4s, v12.4s, v14.4s
3396+ trn2 v14.4s, v18.4s, v14.4s
3397+ /* Transpose q8-q10 */
3398+ mov v18.16b, v8.16b
3399+ trn1 v8.4s, v8.4s, v10.4s
3400+ trn2 v10.4s, v18.4s, v10.4s
3401+ /* Transpose q13-q15 */
3402+ mov v18.16b, v13.16b
3403+ trn1 v13.4s, v13.4s, v15.4s
3404+ trn2 v15.4s, v18.4s, v15.4s
3405+ /* vswp v14.4h, v10-MSB.4h */
3406+ umov x22, v14.d[0]
3407+ ins v14.2d[0], v10.2d[1]
3408+ ins v10.2d[1], x22
3409+ /* vswp v13.4h, v9MSB.4h */
3410+
3411+ umov x22, v13.d[0]
3412+ ins v13.2d[0], v9.2d[1]
3413+ ins v9.2d[1], x22
3414+ /* 1-D IDCT, pass 2 */
3415+ sub v2.8h, v10.8h, v14.8h
3416+ /* vswp v15.4h, v11MSB.4h */
3417+ umov x22, v15.d[0]
3418+ ins v15.2d[0], v11.2d[1]
3419+ ins v11.2d[1], x22
3420+ add v14.8h, v10.8h, v14.8h
3421+ /* vswp v12.4h, v8-MSB.4h */
3422+ umov x22, v12.d[0]
3423+ ins v12.2d[0], v8.2d[1]
3424+ ins v8.2d[1], x22
3425+ sub v1.8h, v11.8h, v13.8h
3426+ add v13.8h, v11.8h, v13.8h
3427+ sub v5.8h, v9.8h, v15.8h
3428+ add v15.8h, v9.8h, v15.8h
3429+ sqdmulh v4.8h, v2.8h, XFIX_1_414213562
3430+ sqdmulh v6.8h, v1.8h, XFIX_2_613125930
3431+ add v3.8h, v1.8h, v1.8h
3432+ sub v1.8h, v5.8h, v1.8h
3433+ add v10.8h, v2.8h, v4.8h
3434+ sqdmulh v4.8h, v1.8h, XFIX_1_847759065
3435+ sub v2.8h, v15.8h, v13.8h
3436+ add v3.8h, v3.8h, v6.8h
3437+ sqdmulh v6.8h, v2.8h, XFIX_1_414213562
3438+ add v1.8h, v1.8h, v4.8h
3439+ sqdmulh v4.8h, v5.8h, XFIX_1_082392200
3440+ sub v10.8h, v10.8h, v14.8h
3441+ add v2.8h, v2.8h, v6.8h
3442+ sub v6.8h, v8.8h, v12.8h
3443+ add v12.8h, v8.8h, v12.8h
3444+ add v9.8h, v5.8h, v4.8h
3445+ add v5.8h, v6.8h, v10.8h
3446+ sub v10.8h, v6.8h, v10.8h
3447+ add v6.8h, v15.8h, v13.8h
3448+ add v8.8h, v12.8h, v14.8h
3449+ sub v3.8h, v6.8h, v3.8h
3450+ sub v12.8h, v12.8h, v14.8h
3451+ sub v3.8h, v3.8h, v1.8h
3452+ sub v1.8h, v9.8h, v1.8h
3453+ add v2.8h, v3.8h, v2.8h
3454+ sub v15.8h, v8.8h, v6.8h
3455+ add v1.8h, v1.8h, v2.8h
3456+ add v8.8h, v8.8h, v6.8h
3457+ add v14.8h, v5.8h, v3.8h
3458+ sub v9.8h, v5.8h, v3.8h
3459+ sub v13.8h, v10.8h, v2.8h
3460+ add v10.8h, v10.8h, v2.8h
3461+ sub v11.8h, v12.8h, v1.8h
3462+ add v12.8h, v12.8h, v1.8h
3463+ /* Descale to 8-bit and range limit */
3464+ movi v0.16b, #0x80 /* bias for signed->unsigned; the constants in v0 are no longer needed */
3465+ sqshrn v8.8b, v8.8h, #5 /* saturating narrow: clamps each sample to signed 8-bit */
3466+ sqshrn2 v8.16b, v9.8h, #5
3467+ sqshrn v9.8b, v10.8h, #5
3468+ sqshrn2 v9.16b, v11.8h, #5
3469+ sqshrn v10.8b, v12.8h, #5
3470+ sqshrn2 v10.16b, v13.8h, #5
3471+ sqshrn v11.8b, v14.8h, #5
3472+ sqshrn2 v11.16b, v15.8h, #5
3473+ add v8.16b, v8.16b, v0.16b /* + 0x80 (wrapping) maps -128..127 onto 0..255 */
3474+ add v9.16b, v9.16b, v0.16b
3475+ add v10.16b, v10.16b, v0.16b
3476+ add v11.16b, v11.16b, v0.16b
3477+ /* Transpose the final 8-bit samples */
3478+ /* Transpose q8-q9 */
3479+ mov v18.16b, v8.16b
3480+ trn1 v8.8h, v8.8h, v9.8h
3481+ trn2 v9.8h, v18.8h, v9.8h
3482+ /* Transpose q10-q11 */
3483+ mov v18.16b, v10.16b
3484+ trn1 v10.8h, v10.8h, v11.8h
3485+ trn2 v11.8h, v18.8h, v11.8h
3486+ /* Transpose q8-q10 */
3487+ mov v18.16b, v8.16b
3488+ trn1 v8.4s, v8.4s, v10.4s
3489+ trn2 v10.4s, v18.4s, v10.4s
3490+ /* Transpose q9-q11 */
3491+ mov v18.16b, v9.16b
3492+ trn1 v9.4s, v9.4s, v11.4s
3493+ trn2 v11.4s, v18.4s, v11.4s
3494+ /* make copy */
3495+ ins v17.2d[0], v8.2d[1]
3496+ /* Transpose d16-d17-msb */
3497+ mov v18.16b, v8.16b
3498+ trn1 v8.8b, v8.8b, v17.8b
3499+ trn2 v17.8b, v18.8b, v17.8b
3500+ /* make copy */
3501+ ins v19.2d[0], v9.2d[1]
3502+ mov v18.16b, v9.16b
3503+ trn1 v9.8b, v9.8b, v19.8b
3504+ trn2 v19.8b, v18.8b, v19.8b
3505+ /* Store results to the output buffer */
3506+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3507+ add TMP1, TMP1, OUTPUT_COL /* NOTE(review): uses all 64 bits of x3; if the C argument is 32-bit, confirm the upper half is zeroed */
3508+ add TMP2, TMP2, OUTPUT_COL
3509+ st1 {v8.8b}, [TMP1]
3510+ st1 {v17.8b}, [TMP2]
3511+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3512+ add TMP1, TMP1, OUTPUT_COL
3513+ add TMP2, TMP2, OUTPUT_COL
3514+ st1 {v9.8b}, [TMP1]
3515+ /* make copy */
3516+ ins v7.2d[0], v10.2d[1]
3517+ mov v18.16b, v10.16b
3518+ trn1 v10.8b, v10.8b, v7.8b
3519+ trn2 v7.8b, v18.8b, v7.8b
3520+ st1 {v19.8b}, [TMP2]
3521+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3522+ ldp TMP4, TMP5, [OUTPUT_BUF], 16
3523+ add TMP1, TMP1, OUTPUT_COL
3524+ add TMP2, TMP2, OUTPUT_COL
3525+ add TMP4, TMP4, OUTPUT_COL
3526+ add TMP5, TMP5, OUTPUT_COL
3527+ st1 {v10.8b}, [TMP1]
3528+ /* make copy */
3529+ ins v16.2d[0], v11.2d[1]
3530+ mov v18.16b, v11.16b
3531+ trn1 v11.8b, v11.8b, v16.8b
3532+ trn2 v16.8b, v18.8b, v16.8b
3533+ st1 {v7.8b}, [TMP2]
3534+ st1 {v11.8b}, [TMP4]
3535+ st1 {v16.8b}, [TMP5]
3536+ add x22, sp, #16 /* x22 = cursor into the NEON save area; TMP4 is dead by now */
3537+ ld1 {v0.8b - v3.8b}, [x22], 32
3538+ ld1 {v4.8b - v7.8b}, [x22], 32
3539+ ld1 {v8.8b - v11.8b}, [x22], 32
3540+ ld1 {v12.8b - v15.8b}, [x22], 32
3541+ ld1 {v16.8b - v19.8b}, [x22], 32
3542+ ldp x22, x23, [sp], 176 /* restore x22/x23 last (x22 was the cursor) and pop the frame */
3543+ ret /* returns via x30 without overwriting it (was 'blr x30') */
3544+
3545+ .unreq DCT_TABLE
3546+ .unreq COEF_BLOCK
3547+ .unreq OUTPUT_BUF
3548+ .unreq OUTPUT_COL
3549+ .unreq TMP1
3550+ .unreq TMP2
3551+ .unreq TMP3
3552+ .unreq TMP4
3553+ .unreq TMP5
3554+
3555+/*****************************************************************************/
3556+
3557+/*
3558+ * jsimd_idct_4x4_neon
3559+ *
3560+ * This function contains inverse-DCT code for getting reduced-size
3561+ * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
3562+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
3563+ * function from jpeg-6b (jidctred.c).
3564+ *
3565+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
3566+ * requires far fewer arithmetic operations and hence should be faster.
3567+ * The primary purpose of this particular NEON optimized function is
3568+ * bit exact compatibility with jpeg-6b.
3569+ *
3570+ * TODO: slightly better instruction scheduling can be achieved by expanding
3571+ * idct_helper/transpose_4x4 macros and reordering instructions,
3572+ * but readability will suffer somewhat.
3573+ */
3574+
3575+#define CONST_BITS 13
3576+/* Fixed-point multipliers: FIX(x) = round(x * 2^CONST_BITS) */
3577+#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
3578+#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
3579+#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
3580+#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
3581+#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
3582+#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
3583+#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
3584+#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
3585+#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
3586+#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
3587+#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
3588+#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
3589+#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
3590+#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
3591+/* 16 halfwords, laid out for one 'ld1 {v0.4h, v1.4h, v2.4h, v3.4h}' (32 bytes) */
3592+.balign 16
3593+jsimd_idct_4x4_neon_consts:
3594+ .short FIX_1_847759065 /* v0.4h[0] */
3595+ .short -FIX_0_765366865 /* v0.4h[1] */
3596+ .short -FIX_0_211164243 /* v0.4h[2] */
3597+ .short FIX_1_451774981 /* v0.4h[3] */
3598+ .short -FIX_2_172734803 /* v1.4h[0] */
3599+ .short FIX_1_061594337 /* v1.4h[1] */
3600+ .short -FIX_0_509795579 /* v1.4h[2] */
3601+ .short -FIX_0_601344887 /* v1.4h[3] */
3602+ .short FIX_0_899976223 /* v2.4h[0] */
3603+ .short FIX_2_562915447 /* v2.4h[1] */
3604+ .short 1 << (CONST_BITS+1) /* v2.4h[2] */
3605+ .short 0, 0, 0, 0, 0 /* v2.4h[3], then v3.4h[0..3]: padding so the 4-register load stays inside the table */
3606+/* idct_helper: one 1-D pass on four 16-bit lanes. Inputs x4..x16, outputs y26..y29 (descaled by 'shift'); needs constants in v0-v2; clobbers v20, v24, v26, v28, v30. */
3607+.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
3608+ smull v28.4s, \x4, v2.4h[2]
3609+ smlal v28.4s, \x8, v0.4h[0]
3610+ smlal v28.4s, \x14, v0.4h[1]
3611+ /* v28 = x4 * v2[2] + x8 * v0[0] + x14 * v0[1] (32-bit accumulators) */
3612+ smull v26.4s, \x16, v1.4h[2]
3613+ smlal v26.4s, \x12, v1.4h[3]
3614+ smlal v26.4s, \x10, v2.4h[0]
3615+ smlal v26.4s, \x6, v2.4h[1]
3616+ /* v26 = x16 * v1[2] + x12 * v1[3] + x10 * v2[0] + x6 * v2[1] */
3617+ smull v30.4s, \x4, v2.4h[2]
3618+ smlsl v30.4s, \x8, v0.4h[0]
3619+ smlsl v30.4s, \x14, v0.4h[1]
3620+ /* v30 = x4 * v2[2] - x8 * v0[0] - x14 * v0[1] */
3621+ smull v24.4s, \x16, v0.4h[2]
3622+ smlal v24.4s, \x12, v0.4h[3]
3623+ smlal v24.4s, \x10, v1.4h[0]
3624+ smlal v24.4s, \x6, v1.4h[1]
3625+ /* All inputs have been read by now, so the outputs below may alias input registers */
3626+ add v20.4s, v28.4s, v26.4s
3627+ sub v28.4s, v28.4s, v26.4s
3628+ /* rshrn can narrow to 16-bit lanes only for shifts 1..16; larger shifts round in 32 bits, then narrow */
3629+.if \shift > 16
3630+ srshr v20.4s, v20.4s, #\shift
3631+ srshr v28.4s, v28.4s, #\shift
3632+ xtn \y26, v20.4s
3633+ xtn \y29, v28.4s
3634+.else
3635+ rshrn \y26, v20.4s, #\shift
3636+ rshrn \y29, v28.4s, #\shift
3637+.endif
3638+ /* y26 = descale(v28 + v26), y29 = descale(v28 - v26); same pattern for y27/y28 from v30 and v24 */
3639+ add v20.4s, v30.4s, v24.4s
3640+ sub v30.4s, v30.4s, v24.4s
3641+
3642+.if \shift > 16
3643+ srshr v20.4s, v20.4s, #\shift
3644+ srshr v30.4s, v30.4s, #\shift
3645+ xtn \y27, v20.4s
3646+ xtn \y28, v30.4s
3647+.else
3648+ rshrn \y27, v20.4s, #\shift
3649+ rshrn \y28, v30.4s, #\shift
3650+.endif
3651+
3652+.endm
3653+
3654+asm_function jsimd_idct_4x4_neon
3655+ /* In: x0 = DCT_TABLE, x1 = COEF_BLOCK, x2 = OUTPUT_BUF (four row pointers are read from it), x3 = OUTPUT_COL (added to every row pointer). Dequantizes, runs two idct_helper passes and writes 4 bytes to each of the 4 rows. */
3656+ DCT_TABLE .req x0
3657+ COEF_BLOCK .req x1
3658+ OUTPUT_BUF .req x2
3659+ OUTPUT_COL .req x3
3660+ TMP1 .req x0
3661+ TMP2 .req x1
3662+ TMP3 .req x2
3663+ TMP4 .req x15
3664+ /* NOTE(review): sp is lowered by 272 and then walked back up to its entry value by the post-indexed stores below, so the save area sits below sp while the body runs -- confirm this is acceptable for the target ABI. */
3665+ /* Save x15 and the low 64 bits of v0-v31 (16 + 8 * 32 = 272 bytes) */
3666+ sub sp, sp, 272
3667+ str x15, [sp], 16
3668+ /* Load constants (v3.4h is just used for padding) */
3669+ adr TMP4, jsimd_idct_4x4_neon_consts
3670+ st1 {v0.8b - v3.8b}, [sp], 32
3671+ st1 {v4.8b - v7.8b}, [sp], 32
3672+ st1 {v8.8b - v11.8b}, [sp], 32
3673+ st1 {v12.8b - v15.8b}, [sp], 32
3674+ st1 {v16.8b - v19.8b}, [sp], 32
3675+ st1 {v20.8b - v23.8b}, [sp], 32
3676+ st1 {v24.8b - v27.8b}, [sp], 32
3677+ st1 {v28.8b - v31.8b}, [sp], 32
3678+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
3679+
3680+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
3681+ * 0 1 2 3 | 4 5 6 7
3682+ * ---------+--------
3683+ * 0 | v4.4h | v5.4h
3684+ * 1 | v6.4h | v7.4h
3685+ * 2 | v8.4h | v9.4h
3686+ * 3 | v10.4h | v11.4h
3687+ * 4 | - | -
3688+ * 5 | v12.4h | v13.4h
3689+ * 6 | v14.4h | v15.4h
3690+ * 7 | v16.4h | v17.4h
3691+ */
3692+ ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3693+ ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
3694+ add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 (16 bytes), it is not used */
3695+ ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
3696+ ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
3697+ /* Dequantize: multiply each coefficient row by the matching DCT_TABLE row */
3698+ ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3699+ mul v4.4h, v4.4h, v18.4h
3700+ mul v5.4h, v5.4h, v19.4h
3701+ ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
3702+ ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
3703+ mul v6.4h, v6.4h, v20.4h
3704+ mul v7.4h, v7.4h, v21.4h
3705+ ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
3706+ mul v8.4h, v8.4h, v22.4h
3707+ mul v9.4h, v9.4h, v23.4h
3708+ ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
3709+ add DCT_TABLE, DCT_TABLE, #16 /* skip the table row that pairs with row 4 */
3710+ ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
3711+ mul v10.4h, v10.4h, v24.4h
3712+ mul v11.4h, v11.4h, v25.4h
3713+ ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
3714+ mul v12.4h, v12.4h, v26.4h
3715+ mul v13.4h, v13.4h, v27.4h
3716+ ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
3717+ ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
3718+ mul v14.4h, v14.4h, v28.4h
3719+ mul v15.4h, v15.4h, v29.4h
3720+ ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
3721+ mul v16.4h, v16.4h, v30.4h
3722+ mul v17.4h, v17.4h, v31.4h
3723+ ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
3724+
3725+ /* Pass 1: left half (columns 0-3), then right half (columns 4-7), shift 12 */
3726+ idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
3727+ transpose_4x4 v4, v6, v8, v10, v3
3728+ ins v10.2d[1], v11.2d[0]
3729+ idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
3730+ transpose_4x4 v5, v7, v9, v11, v3
3731+ ins v10.2d[1], v11.2d[0]
3732+ /* Pass 2: shift 19, results land in v26-v29 */
3733+ idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
3734+ transpose_4x4 v26, v27, v28, v29, v3
3735+
3736+ /* Range limit: add 0x80, then saturate-narrow to unsigned bytes */
3737+ movi v30.8h, #0x80
3738+ ins v26.2d[1], v27.2d[0]
3739+ ins v28.2d[1], v29.2d[0]
3740+ add v26.8h, v26.8h, v30.8h
3741+ add v28.8h, v28.8h, v30.8h
3742+ sqxtun v26.8b, v26.8h
3743+ sqxtun v27.8b, v28.8h
3744+
3745+ /* Store results to the output buffer: fetch four row pointers and offset each by OUTPUT_COL */
3746+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
3747+ ldp TMP3, TMP4, [OUTPUT_BUF]
3748+ add TMP1, TMP1, OUTPUT_COL
3749+ add TMP2, TMP2, OUTPUT_COL
3750+ add TMP3, TMP3, OUTPUT_COL
3751+ add TMP4, TMP4, OUTPUT_COL
3752+
3753+#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
3754+ /* We can use far fewer instructions on little endian systems if the
3755+ * OS kernel is not configured to trap unaligned memory accesses
3756+ */
3757+ st1 {v26.s}[0], [TMP1], 4
3758+ st1 {v27.s}[0], [TMP3], 4
3759+ st1 {v26.s}[1], [TMP2], 4
3760+ st1 {v27.s}[1], [TMP4], 4
3761+#else
3762+ st1 {v26.b}[0], [TMP1], 1
3763+ st1 {v27.b}[0], [TMP3], 1
3764+ st1 {v26.b}[1], [TMP1], 1
3765+ st1 {v27.b}[1], [TMP3], 1
3766+ st1 {v26.b}[2], [TMP1], 1
3767+ st1 {v27.b}[2], [TMP3], 1
3768+ st1 {v26.b}[3], [TMP1], 1
3769+ st1 {v27.b}[3], [TMP3], 1
3770+
3771+ st1 {v26.b}[4], [TMP2], 1
3772+ st1 {v27.b}[4], [TMP4], 1
3773+ st1 {v26.b}[5], [TMP2], 1
3774+ st1 {v27.b}[5], [TMP4], 1
3775+ st1 {v26.b}[6], [TMP2], 1
3776+ st1 {v27.b}[6], [TMP4], 1
3777+ st1 {v26.b}[7], [TMP2], 1
3778+ st1 {v27.b}[7], [TMP4], 1
3779+#endif
3780+
3781+ /* Restore: rewind sp to the start of the save area and reload in the order the registers were saved */
3782+ sub sp, sp, #272
3783+ ldr x15, [sp], 16
3784+ ld1 {v0.8b - v3.8b}, [sp], 32
3785+ ld1 {v4.8b - v7.8b}, [sp], 32
3786+ ld1 {v8.8b - v11.8b}, [sp], 32
3787+ ld1 {v12.8b - v15.8b}, [sp], 32
3788+ ld1 {v16.8b - v19.8b}, [sp], 32
3789+ ld1 {v20.8b - v23.8b}, [sp], 32
3790+ ld1 {v24.8b - v27.8b}, [sp], 32
3791+ ld1 {v28.8b - v31.8b}, [sp], 32
3792+ br x30 /* plain branch to the return address; blr would overwrite x30 as a side effect */
3793+
3794+ .unreq DCT_TABLE
3795+ .unreq COEF_BLOCK
3796+ .unreq OUTPUT_BUF
3797+ .unreq OUTPUT_COL
3798+ .unreq TMP1
3799+ .unreq TMP2
3800+ .unreq TMP3
3801+ .unreq TMP4
3802+
3803+.purgem idct_helper
3804+
3805+
3806+/*****************************************************************************/
3807+
3808+/*
3809+ * jsimd_idct_2x2_neon
3810+ *
3811+ * This function contains inverse-DCT code for getting reduced-size
3812+ * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
3813+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
3814+ * function from jpeg-6b (jidctred.c).
3815+ *
3816+ * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
3817+ * requires far fewer arithmetic operations and hence should be faster.
3818+ * The primary purpose of this particular NEON optimized function is
3819+ * bit exact compatibility with jpeg-6b.
3820+ */
3821+
3822+.balign 8
3823+jsimd_idct_2x2_neon_consts:
3824+ .short -FIX_0_720959822 /* v14.4h[0]: multiplies the \x16 operand of idct_helper */
3825+ .short FIX_0_850430095 /* v14.4h[1]: multiplies the \x12 operand of idct_helper */
3826+ .short -FIX_1_272758580 /* v14.4h[2]: multiplies the \x10 operand of idct_helper */
3827+ .short FIX_3_624509785 /* v14.4h[3]: multiplies the \x6 operand of idct_helper */
3828+ /* idct_helper (2x2 variant): for four lanes at once, even = \x4 << 15 and odd = \x6*v14.4h[3] + \x10*v14.4h[2] + \x12*v14.4h[1] + \x16*v14.4h[0]; writes (even + odd) to \y26 and (even - odd) to \y27, each shifted right by \shift with rounding and narrowed to 16 bits. Expects the constants in v14.4h; clobbers v15, v20 and v26. */
3829+.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
3830+ sshll v15.4s, \x4, #15
3831+ smull v26.4s, \x6, v14.4h[3]
3832+ smlal v26.4s, \x10, v14.4h[2]
3833+ smlal v26.4s, \x12, v14.4h[1]
3834+ smlal v26.4s, \x16, v14.4h[0]
3835+ /* v20 = even + odd, v15 = even - odd */
3836+ add v20.4s, v15.4s, v26.4s
3837+ sub v15.4s, v15.4s, v26.4s
3838+ /* Shifts above 16 are done as a 32-bit rounding shift followed by a plain narrow; smaller shifts use the combined rounding shift-and-narrow */
3839+.if \shift > 16
3840+ srshr v20.4s, v20.4s, #\shift
3841+ srshr v15.4s, v15.4s, #\shift
3842+ xtn \y26, v20.4s
3843+ xtn \y27, v15.4s
3844+.else
3845+ rshrn \y26, v20.4s, #\shift
3846+ rshrn \y27, v15.4s, #\shift
3847+.endif
3848+
3849+.endm
3850+
3851+asm_function jsimd_idct_2x2_neon
3852+ /* In: x0 = DCT_TABLE, x1 = COEF_BLOCK, x2 = OUTPUT_BUF (two row pointers are read from it), x3 = OUTPUT_COL (added to both row pointers). Dequantizes rows 0, 1, 3, 5 and 7, runs two passes and writes 2 bytes to each of the 2 rows. */
3853+ DCT_TABLE .req x0
3854+ COEF_BLOCK .req x1
3855+ OUTPUT_BUF .req x2
3856+ OUTPUT_COL .req x3
3857+ TMP1 .req x0
3858+ TMP2 .req x15
3859+ /* NOTE(review): sp is lowered by 208 and then walked back up to its entry value by the post-indexed stores below, so the save area sits below sp while the body runs -- confirm this is acceptable for the target ABI. */
3860+ /* Save x15 and the low 64 bits of the vector registers listed below (16 + 192 = 208 bytes) */
3861+ sub sp, sp, 208
3862+ str x15, [sp], 16
3863+
3864+ /* Load constants */
3865+ adr TMP2, jsimd_idct_2x2_neon_consts
3866+ st1 {v4.8b - v7.8b}, [sp], 32
3867+ st1 {v8.8b - v11.8b}, [sp], 32
3868+ st1 {v12.8b - v15.8b}, [sp], 32
3869+ st1 {v16.8b - v19.8b}, [sp], 32
3870+ st1 {v21.8b - v22.8b}, [sp], 16
3871+ st1 {v24.8b - v27.8b}, [sp], 32
3872+ st1 {v30.8b - v31.8b}, [sp], 16
3873+ ld1 {v14.4h}, [TMP2]
3874+
3875+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
3876+ * 0 1 2 3 | 4 5 6 7
3877+ * ---------+--------
3878+ * 0 | v4.4h | v5.4h
3879+ * 1 | v6.4h | v7.4h
3880+ * 2 | - | -
3881+ * 3 | v10.4h | v11.4h
3882+ * 4 | - | -
3883+ * 5 | v12.4h | v13.4h
3884+ * 6 | - | -
3885+ * 7 | v16.4h | v17.4h
3886+ */
3887+ ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3888+ add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 */
3889+ ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
3890+ add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */
3891+ ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
3892+ add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */
3893+ ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
3894+ /* Dequantize: multiply each loaded row by the matching DCT_TABLE row, skipping the same rows */
3895+ ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3896+ mul v4.4h, v4.4h, v18.4h
3897+ mul v5.4h, v5.4h, v19.4h
3898+ ins v4.2d[1], v5.2d[0]
3899+ mul v6.4h, v6.4h, v20.4h
3900+ mul v7.4h, v7.4h, v21.4h
3901+ ins v6.2d[1], v7.2d[0]
3902+ add DCT_TABLE, DCT_TABLE, #16
3903+ ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
3904+ mul v10.4h, v10.4h, v24.4h
3905+ mul v11.4h, v11.4h, v25.4h
3906+ ins v10.2d[1], v11.2d[0]
3907+ add DCT_TABLE, DCT_TABLE, #16
3908+ ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
3909+ mul v12.4h, v12.4h, v26.4h
3910+ mul v13.4h, v13.4h, v27.4h
3911+ ins v12.2d[1], v13.2d[0]
3912+ add DCT_TABLE, DCT_TABLE, #16
3913+ ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
3914+ mul v16.4h, v16.4h, v30.4h
3915+ mul v17.4h, v17.4h, v31.4h
3916+ ins v16.2d[1], v17.2d[0]
3917+
3918+ /* Pass 1: the disabled branch is the macro form; the live branch is the same arithmetic with both halves interleaved by hand (shift 13) */
3919+#if 0
3920+ idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
3921+ transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
3922+ idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
3923+ transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
3924+#else
3925+ smull v26.4s, v6.4h, v14.4h[3]
3926+ smlal v26.4s, v10.4h, v14.4h[2]
3927+ smlal v26.4s, v12.4h, v14.4h[1]
3928+ smlal v26.4s, v16.4h, v14.4h[0]
3929+ smull v24.4s, v7.4h, v14.4h[3]
3930+ smlal v24.4s, v11.4h, v14.4h[2]
3931+ smlal v24.4s, v13.4h, v14.4h[1]
3932+ smlal v24.4s, v17.4h, v14.4h[0]
3933+ sshll v15.4s, v4.4h, #15
3934+ sshll v30.4s, v5.4h, #15
3935+ add v20.4s, v15.4s, v26.4s
3936+ sub v15.4s, v15.4s, v26.4s
3937+ rshrn v4.4h, v20.4s, #13
3938+ rshrn v6.4h, v15.4s, #13
3939+ add v20.4s, v30.4s, v24.4s
3940+ sub v15.4s, v30.4s, v24.4s
3941+ rshrn v5.4h, v20.4s, #13
3942+ rshrn v7.4h, v15.4s, #13
3943+ ins v4.2d[1], v5.2d[0]
3944+ ins v6.2d[1], v7.2d[0]
3945+ transpose v4, v6, v3, .16b, .8h
3946+ transpose v6, v10, v3, .16b, .4s
3947+ ins v11.2d[0], v10.2d[1]
3948+ ins v7.2d[0], v6.2d[1]
3949+#endif
3950+
3951+ /* Pass 2: shift 20, results land in v26.4h and v27.4h */
3952+ idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
3953+
3954+ /* Range limit: add 0x80, then saturate-narrow to unsigned bytes */
3955+ movi v30.8h, #0x80
3956+ ins v26.2d[1], v27.2d[0]
3957+ add v26.8h, v26.8h, v30.8h
3958+ sqxtun v30.8b, v26.8h
3959+ ins v26.2d[0], v30.2d[0]
3960+ sqxtun v27.8b, v26.8h
3961+
3962+ /* Store results to the output buffer: fetch two row pointers and offset each by OUTPUT_COL */
3963+ ldp TMP1, TMP2, [OUTPUT_BUF]
3964+ add TMP1, TMP1, OUTPUT_COL
3965+ add TMP2, TMP2, OUTPUT_COL
3966+
3967+ st1 {v26.b}[0], [TMP1], 1
3968+ st1 {v27.b}[4], [TMP1], 1
3969+ st1 {v26.b}[1], [TMP2], 1
3970+ st1 {v27.b}[5], [TMP2], 1
3971+ /* Restore: rewind sp to the start of the save area and reload in the order the registers were saved */
3972+ sub sp, sp, #208
3973+ ldr x15, [sp], 16
3974+ ld1 {v4.8b - v7.8b}, [sp], 32
3975+ ld1 {v8.8b - v11.8b}, [sp], 32
3976+ ld1 {v12.8b - v15.8b}, [sp], 32
3977+ ld1 {v16.8b - v19.8b}, [sp], 32
3978+ ld1 {v21.8b - v22.8b}, [sp], 16
3979+ ld1 {v24.8b - v27.8b}, [sp], 32
3980+ ld1 {v30.8b - v31.8b}, [sp], 16
3981+ br x30 /* plain branch to the return address; blr would overwrite x30 as a side effect */
3982+
3983+ .unreq DCT_TABLE
3984+ .unreq COEF_BLOCK
3985+ .unreq OUTPUT_BUF
3986+ .unreq OUTPUT_COL
3987+ .unreq TMP1
3988+ .unreq TMP2
3989+
3990+.purgem idct_helper
3991+
3992+
3993+/*****************************************************************************/
3994+
3995+/*
3996+ * jsimd_ycc_extrgb_convert_neon
3997+ * jsimd_ycc_extbgr_convert_neon
3998+ * jsimd_ycc_extrgbx_convert_neon
3999+ * jsimd_ycc_extbgrx_convert_neon
4000+ * jsimd_ycc_extxbgr_convert_neon
4001+ * jsimd_ycc_extxrgb_convert_neon
4002+ *
4003+ * Colorspace conversion YCbCr -> RGB
4004+ */
4005+
4006+ /* do_load size: read \size bytes from each of the U, V and Y pointers (post-incrementing them) into v4, v5 and v0. Size 8 fills lanes 0-7 and issues prefetches; sizes 4, 2 and 1 fill lanes 0-3, 4-5 and 6 respectively, so successive calls for a tail of up to 7 pixels do not overwrite each other. */
4007+.macro do_load size
4008+ .if \size == 8
4009+ ld1 {v4.8b}, [U], 8
4010+ ld1 {v5.8b}, [V], 8
4011+ ld1 {v0.8b}, [Y], 8
4012+ prfm PLDL1KEEP, [U, #64]
4013+ prfm PLDL1KEEP, [V, #64]
4014+ prfm PLDL1KEEP, [Y, #64]
4015+ .elseif \size == 4 /* lanes 0-3 */
4016+ ld1 {v4.b}[0], [U], 1
4017+ ld1 {v4.b}[1], [U], 1
4018+ ld1 {v4.b}[2], [U], 1
4019+ ld1 {v4.b}[3], [U], 1
4020+ ld1 {v5.b}[0], [V], 1
4021+ ld1 {v5.b}[1], [V], 1
4022+ ld1 {v5.b}[2], [V], 1
4023+ ld1 {v5.b}[3], [V], 1
4024+ ld1 {v0.b}[0], [Y], 1
4025+ ld1 {v0.b}[1], [Y], 1
4026+ ld1 {v0.b}[2], [Y], 1
4027+ ld1 {v0.b}[3], [Y], 1
4028+ .elseif \size == 2 /* lanes 4-5 */
4029+ ld1 {v4.b}[4], [U], 1
4030+ ld1 {v4.b}[5], [U], 1
4031+ ld1 {v5.b}[4], [V], 1
4032+ ld1 {v5.b}[5], [V], 1
4033+ ld1 {v0.b}[4], [Y], 1
4034+ ld1 {v0.b}[5], [Y], 1
4035+ .elseif \size == 1 /* lane 6 */
4036+ ld1 {v4.b}[6], [U], 1
4037+ ld1 {v5.b}[6], [V], 1
4038+ ld1 {v0.b}[6], [Y], 1
4039+ .else
4040+ .error unsupported macroblock size
4041+ .endif
4042+.endm
4043+ /* do_store bpp, size: write \size pixels to [RGB], post-incrementing it. bpp 24 interleaves bytes from v10-v12, bpp 32 interleaves bytes from v10-v13, bpp 16 writes halfwords from v25. Sizes 4, 2 and 1 take lanes 0-3, 4-5 and 6, matching the lanes do_load fills. */
4044+.macro do_store bpp, size
4045+ .if \bpp == 24
4046+ .if \size == 8
4047+ st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
4048+ .elseif \size == 4
4049+ st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
4050+ st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
4051+ st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
4052+ st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
4053+ .elseif \size == 2
4054+ st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
4055+ st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
4056+ .elseif \size == 1
4057+ st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
4058+ .else
4059+ .error unsupported macroblock size
4060+ .endif
4061+ .elseif \bpp == 32
4062+ .if \size == 8
4063+ st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
4064+ .elseif \size == 4
4065+ st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
4066+ st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
4067+ st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
4068+ st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
4069+ .elseif \size == 2
4070+ st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
4071+ st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
4072+ .elseif \size == 1
4073+ st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
4074+ .else
4075+ .error unsupported macroblock size
4076+ .endif
4077+ .elseif \bpp==16
4078+ .if \size == 8
4079+ st1 {v25.8h}, [RGB],16
4080+ .elseif \size == 4
4081+ st1 {v25.4h}, [RGB],8
4082+ .elseif \size == 2
4083+ st1 {v25.h}[4], [RGB],2
4084+ st1 {v25.h}[5], [RGB],2
4085+ .elseif \size == 1
4086+ st1 {v25.h}[6], [RGB],2
4087+ .else
4088+ .error unsupported macroblock size
4089+ .endif
4090+ .else
4091+ .error unsupported bpp
4092+ .endif
4093+.endm
4094+
4095+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
4096+ /* Emits jsimd_ycc_\colorid\()_convert_neon plus its private constant table. \bpp selects the store format (24, 32 or 16); for 24/32 bpp the R, G and B results are narrowed into v1<r_offs>, v1<g_offs> and v1<b_offs> with arrangement \defsize; for 16 bpp they are packed into v25. \rsize, \gsize and \bsize are accepted but not referenced in this body. */
4097+/*
4098+ * 2-stage pipelined YCbCr->RGB conversion
4099+ */
4100+
4101+.macro do_yuv_to_rgb_stage1
4102+ uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
4103+ uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
4104+ smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4105+ smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4106+ smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4107+ smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4108+ smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4109+ smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4110+ smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4111+ smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4112+.endm
4113+ /* Stage 2: round and narrow the stage-1 products, add Y (v0.8b), then either saturate to bytes (24/32 bpp) or pack into v25 (16 bpp) */
4114+.macro do_yuv_to_rgb_stage2
4115+ rshrn v20.4h, v20.4s, #15
4116+ rshrn2 v20.8h, v22.4s, #15
4117+ rshrn v24.4h, v24.4s, #14
4118+ rshrn2 v24.8h, v26.4s, #14
4119+ rshrn v28.4h, v28.4s, #14
4120+ rshrn2 v28.8h, v30.4s, #14
4121+ uaddw v20.8h, v20.8h, v0.8b
4122+ uaddw v24.8h, v24.8h, v0.8b
4123+ uaddw v28.8h, v28.8h, v0.8b
4124+.if \bpp != 16
4125+ sqxtun v1\g_offs\defsize, v20.8h
4126+ sqxtun v1\r_offs\defsize, v24.8h
4127+ sqxtun v1\b_offs\defsize, v28.8h
4128+.else
4129+ sqshlu v21.8h, v20.8h, #8
4130+ sqshlu v25.8h, v24.8h, #8
4131+ sqshlu v29.8h, v28.8h, #8
4132+ sri v25.8h, v21.8h, #5
4133+ sri v25.8h, v29.8h, #11
4134+.endif
4135+
4136+.endm
4137+ /* Steady-state loop body: stage 2 and the store for the current 8 pixels, interleaved with the loads and stage 1 for the next 8 */
4138+.macro do_yuv_to_rgb_stage2_store_load_stage1
4139+ rshrn v20.4h, v20.4s, #15
4140+ rshrn v24.4h, v24.4s, #14
4141+ rshrn v28.4h, v28.4s, #14
4142+ ld1 {v4.8b}, [U], 8
4143+ rshrn2 v20.8h, v22.4s, #15
4144+ rshrn2 v24.8h, v26.4s, #14
4145+ rshrn2 v28.8h, v30.4s, #14
4146+ ld1 {v5.8b}, [V], 8
4147+ uaddw v20.8h, v20.8h, v0.8b
4148+ uaddw v24.8h, v24.8h, v0.8b
4149+ uaddw v28.8h, v28.8h, v0.8b
4150+.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
4151+ sqxtun v1\g_offs\defsize, v20.8h
4152+ ld1 {v0.8b}, [Y], 8
4153+ sqxtun v1\r_offs\defsize, v24.8h
4154+ prfm PLDL1KEEP, [U, #64]
4155+ prfm PLDL1KEEP, [V, #64]
4156+ prfm PLDL1KEEP, [Y, #64]
4157+ sqxtun v1\b_offs\defsize, v28.8h
4158+ uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
4159+ uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
4160+ smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4161+ smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4162+ smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4163+ smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4164+ smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4165+ smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4166+.else /**************************** rgb565 ***********************************/
4167+ sqshlu v21.8h, v20.8h, #8
4168+ sqshlu v25.8h, v24.8h, #8
4169+ sqshlu v29.8h, v28.8h, #8
4170+ uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
4171+ uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
4172+ ld1 {v0.8b}, [Y], 8
4173+ smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4174+ smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4175+ smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4176+ smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4177+ sri v25.8h, v21.8h, #5
4178+ smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4179+ smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4180+ prfm PLDL1KEEP, [U, #64]
4181+ prfm PLDL1KEEP, [V, #64]
4182+ prfm PLDL1KEEP, [Y, #64]
4183+ sri v25.8h, v29.8h, #11
4184+.endif
4185+ do_store \bpp, 8
4186+ smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4187+ smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4188+.endm
4189+ /* Non-pipelined form, used for the tail of fewer than 8 pixels */
4190+.macro do_yuv_to_rgb
4191+ do_yuv_to_rgb_stage1
4192+ do_yuv_to_rgb_stage2
4193+.endm
4194+
4195+/* Apple gas crashes on adrl, work around that by using adr.
4196+ * But this requires a copy of these constants for each function.
4197+ */
4198+
4199+.balign 16
4200+jsimd_ycc_\colorid\()_neon_consts:
4201+ .short 0, 0, 0, 0
4202+ .short 22971, -11277, -23401, 29033
4203+ .short -128, -128, -128, -128
4204+ .short -128, -128, -128, -128
4205+
4206+asm_function jsimd_ycc_\colorid\()_convert_neon
4207+ OUTPUT_WIDTH .req x0
4208+ INPUT_BUF .req x1
4209+ INPUT_ROW .req x2
4210+ OUTPUT_BUF .req x3
4211+ NUM_ROWS .req x4
4212+
4213+ INPUT_BUF0 .req x5
4214+ INPUT_BUF1 .req x6
4215+ INPUT_BUF2 .req INPUT_BUF
4216+
4217+ RGB .req x7
4218+ Y .req x8
4219+ U .req x9
4220+ V .req x10
4221+ N .req x15
4222+ /* NOTE(review): sp is lowered by 336 and then walked back up to its entry value by the post-indexed stores below, so the save area sits below sp while the body runs -- confirm this is acceptable for the target ABI. */
4223+ sub sp, sp, 336
4224+ str x15, [sp], 16
4225+ /* Load constants: v1.4h = the four multipliers, v2.8h = -128 in every lane (v0.4h is just used for padding) */
4226+ adr x15, jsimd_ycc_\colorid\()_neon_consts
4227+ /* Save NEON registers */
4228+ st1 {v0.8b - v3.8b}, [sp], 32
4229+ st1 {v4.8b - v7.8b}, [sp], 32
4230+ st1 {v8.8b - v11.8b}, [sp], 32
4231+ st1 {v12.8b - v15.8b}, [sp], 32
4232+ st1 {v16.8b - v19.8b}, [sp], 32
4233+ st1 {v20.8b - v23.8b}, [sp], 32
4234+ st1 {v24.8b - v27.8b}, [sp], 32
4235+ st1 {v28.8b - v31.8b}, [sp], 32
4236+ ld1 {v0.4h, v1.4h}, [x15], 16
4237+ ld1 {v2.8h}, [x15]
4238+
4239+ /* Save ARM registers and handle input arguments */
4240+ /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
4241+ stp x4, x5, [sp], 16
4242+ stp x6, x7, [sp], 16
4243+ stp x8, x9, [sp], 16
4244+ stp x10, x30, [sp], 16
4245+ ldr INPUT_BUF0, [INPUT_BUF]
4246+ ldr INPUT_BUF1, [INPUT_BUF, 8]
4247+ ldr INPUT_BUF2, [INPUT_BUF, 16]
4248+ .unreq INPUT_BUF
4249+
4250+ /* Preset every byte of v10 and v13 to 0xFF (presumably the filler byte for the 32-bit formats -- confirm) */
4251+ movi v10.16b, #255
4252+ movi v13.16b, #255
4253+
4254+ /* Outer loop over scanlines */
4255+ cmp NUM_ROWS, #1
4256+ blt 9f
4257+0:
4258+ lsl x16, INPUT_ROW, #3
4259+ ldr Y, [INPUT_BUF0, x16]
4260+ ldr U, [INPUT_BUF1, x16]
4261+ mov N, OUTPUT_WIDTH
4262+ ldr V, [INPUT_BUF2, x16]
4263+ add INPUT_ROW, INPUT_ROW, #1
4264+ ldr RGB, [OUTPUT_BUF], #8
4265+
4266+ /* Inner loop over pixels */
4267+ subs N, N, #8
4268+ blt 3f
4269+ do_load 8
4270+ do_yuv_to_rgb_stage1
4271+ subs N, N, #8
4272+ blt 2f
4273+1:
4274+ do_yuv_to_rgb_stage2_store_load_stage1
4275+ subs N, N, #8
4276+ bge 1b
4277+2:
4278+ do_yuv_to_rgb_stage2
4279+ do_store \bpp, 8
4280+ tst N, #7
4281+ beq 8f
4282+3:
4283+ tst N, #4
4284+ beq 3f
4285+ do_load 4
4286+3:
4287+ tst N, #2
4288+ beq 4f
4289+ do_load 2
4290+4:
4291+ tst N, #1
4292+ beq 5f
4293+ do_load 1
4294+5:
4295+ do_yuv_to_rgb
4296+ tst N, #4
4297+ beq 6f
4298+ do_store \bpp, 4
4299+6:
4300+ tst N, #2
4301+ beq 7f
4302+ do_store \bpp, 2
4303+7:
4304+ tst N, #1
4305+ beq 8f
4306+ do_store \bpp, 1
4307+8:
4308+ subs NUM_ROWS, NUM_ROWS, #1
4309+ bgt 0b
4310+9:
4311+ /* Restore all registers and return */
4312+ sub sp, sp, #336
4313+ ldr x15, [sp], 16
4314+ ld1 {v0.8b - v3.8b}, [sp], 32
4315+ ld1 {v4.8b - v7.8b}, [sp], 32
4316+ ld1 {v8.8b - v11.8b}, [sp], 32
4317+ ld1 {v12.8b - v15.8b}, [sp], 32
4318+ ld1 {v16.8b - v19.8b}, [sp], 32
4319+ ld1 {v20.8b - v23.8b}, [sp], 32
4320+ ld1 {v24.8b - v27.8b}, [sp], 32
4321+ ld1 {v28.8b - v31.8b}, [sp], 32
4322+ /* pop {x4, x5, x6, x7, x8, x9, x10, x30} */
4323+ ldp x4, x5, [sp], 16
4324+ ldp x6, x7, [sp], 16
4325+ ldp x8, x9, [sp], 16
4326+ ldp x10, x30, [sp], 16
4327+ br x30
4328+ .unreq OUTPUT_WIDTH
4329+ .unreq INPUT_ROW
4330+ .unreq OUTPUT_BUF
4331+ .unreq NUM_ROWS
4332+ .unreq INPUT_BUF0
4333+ .unreq INPUT_BUF1
4334+ .unreq INPUT_BUF2
4335+ .unreq RGB
4336+ .unreq Y
4337+ .unreq U
4338+ .unreq V
4339+ .unreq N
4340+
4341+.purgem do_yuv_to_rgb
4342+.purgem do_yuv_to_rgb_stage1
4343+.purgem do_yuv_to_rgb_stage2
4344+.purgem do_yuv_to_rgb_stage2_store_load_stage1
4345+.endm
4346+ /* Instantiate one converter per output pixel format. The rgb565 row passes 0 for every offset because the 16-bpp code path does not reference them. The shared do_load/do_store macros are purged afterwards. */
4347+/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
4348+generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
4349+generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
4350+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b
4351+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
4352+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
4353+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
4354+generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b
4355+.purgem do_load
4356+.purgem do_store