blob: 4572a51fa5397f0124f15157bb5f4c7dcae7dd01 [file] [log] [blame]
/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
 * All rights reserved.
 * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
 *          Darko Laus (darko.laus@imgtec.com)
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
24
25#include "jsimd_mips_dspr2_asm.h"
26
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - output_buf
 * a3     - output_row
 * 16(sp) - num_rows
 * 20(sp) - cinfo->num_components
 *
 * Null conversion for compression: every input row (component-interleaved
 * samples) is split into one output row per component; sample values are
 * copied unchanged.
 *
 * Two row loops exist.  When image_width is not a multiple of 4 (label 0)
 * the (image_width & 3) leftover samples are copied one at a time first and
 * the remainder four at a time.  Otherwise (label 4) only the four-at-a-time
 * loop is needed.
 *
 * The stack arguments are read 8 bytes above the offsets listed here
 * because they are loaded after SAVE_REGS_ON_STACK 8.
 */

    SAVE_REGS_ON_STACK 8, s0, s1

    blez        a0, 7f              /* image_width <= 0: nothing to copy */
     nop
    lw          t9, 24(sp)          /* t9 = num_rows */
    lw          s0, 28(sp)          /* s0 = cinfo->num_components */
    andi        t0, a0, 3           /* t0 = cinfo->image_width & 3 */
    beqz        t0, 4f              /* no residual */
     nop

    /* ---- rows whose width has a residual ---- */
0:
    addiu       t9, t9, -1
    bltz        t9, 7f              /* all rows done */
     li         t1, 0               /* t1 = ci = 0 */
1:
    sll         t3, t1, 2
    lwx         t5, t3(a2)          /* t5 = output_buf[ci] */
    lw          t2, 0(a1)           /* t2 = inptr = *input_buf */
    sll         t4, a3, 2
    lwx         t5, t4(t5)          /* t5 = outptr = output_buf[ci][output_row] */
    addu        t2, t2, t1          /* inptr += ci */
    addu        s1, t5, a0          /* s1 = outptr + image_width (row end) */
    addu        t6, t5, t0          /* t6 = outptr + residual */
2:
    /* copy the residual samples one by one */
    lbu         t3, 0(t2)
    addiu       t5, t5, 1
    sb          t3, -1(t5)
    bne         t6, t5, 2b
     addu       t2, t2, s0          /* inptr += num_components */
    /*
     * If image_width < 4 the residual was the whole row.  The loop below
     * always runs at least once and only stops when outptr == s1, so it
     * must be skipped here or it would run past the end of the row.
     */
    beq         s1, t5, 33f
     nop
3:
    /* copy the rest of the row, four samples per iteration */
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0          /* inptr += 4 * num_components */
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 3b
     sb         t8, -1(t5)
33:
    addiu       t1, t1, 1           /* ++ci */
    bne         t1, s0, 1b
     nop
    addiu       a1, a1, 4           /* ++input_buf */
    bgez        t9, 0b
     addiu      a3, a3, 1           /* ++output_row */
    b           7f
     nop

    /* ---- rows whose width is a multiple of 4 ---- */
4:
    addiu       t9, t9, -1
    bltz        t9, 7f              /* all rows done */
     li         t1, 0               /* t1 = ci = 0 */
5:
    sll         t3, t1, 2
    lwx         t5, t3(a2)          /* t5 = output_buf[ci] */
    lw          t2, 0(a1)           /* t2 = inptr = *input_buf */
    sll         t4, a3, 2
    lwx         t5, t4(t5)          /* t5 = outptr = output_buf[ci][output_row] */
    addu        t2, t2, t1          /* inptr += ci */
    addu        s1, t5, a0          /* s1 = outptr + image_width (row end) */
6:
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0          /* inptr += 4 * num_components */
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 6b
     sb         t8, -1(t5)
    addiu       t1, t1, 1           /* ++ci */
    bne         t1, s0, 5b
     nop
    addiu       a1, a1, 4           /* ++input_buf */
    bgez        t9, 4b
     addiu      a3, a3, 1           /* ++output_row */
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop

END(jsimd_c_null_convert_mips_dspr2)

/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_mips_dspr2
 * jsimd_extbgr_ycc_convert_mips_dspr2
 * jsimd_extrgbx_ycc_convert_mips_dspr2
 * jsimd_extbgrx_ycc_convert_mips_dspr2
 * jsimd_extxbgr_ycc_convert_mips_dspr2
 * jsimd_extxrgb_ycc_convert_mips_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 */

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

/* Load the R, G and B bytes of one pixel, then step inptr to the next pixel. */
.macro DO_RGB_TO_YCC r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - output_buf
 * a3     - output_row
 * 16(sp) - num_rows
 *
 * Each pixel is converted with three multiply-accumulate chains:
 *   $ac0 (Y)  = s5 + s2*B + s0*R + s1*G
 *   $ac1 (Cb) = t8 + s5*B + s3*R + s4*G
 *   $ac2 (Cr) = t8 + s5*R + s6*G + s7*B
 * Bits 16 and up of each accumulator are extracted and the low byte of the
 * result is stored to the matching output plane.
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          t7, 48(sp)          /* t7 = num_rows */
    li          s0, 0x4c8b          /* FIX(0.29900) */
    li          s1, 0x9646          /* FIX(0.58700) */
    li          s2, 0x1d2f          /* FIX(0.11400) */
    li          s3, 0xffffd4cd      /* -FIX(0.16874) */
    li          s4, 0xffffab33      /* -FIX(0.33126) */
    li          s5, 0x8000          /* FIX(0.50000) */
    li          s6, 0xffff94d1      /* -FIX(0.41869) */
    li          s7, 0xffffeb2f      /* -FIX(0.08131) */
    li          t8, 0x807fff        /* CBCR_OFFSET + ONE_HALF-1 */

0:
    /* per-row setup */
    addiu       t7, -1              /* --num_rows */
    lw          t6, 0(a1)           /* t6 = input_buf[0] */
    lw          t0, 0(a2)
    lw          t1, 4(a2)
    lw          t2, 8(a2)
    sll         t3, a3, 2
    lwx         t0, t3(t0)          /* t0 = output_buf[0][output_row] */
    lwx         t1, t3(t1)          /* t1 = output_buf[1][output_row] */
    lwx         t2, t3(t2)          /* t2 = output_buf[2][output_row] */

    addu        t9, t2, a0          /* t9 = end address (third plane) */
    addiu       a3, 1               /* ++output_row */

1:
    /* one pixel per iteration: t3 = R, t4 = G, t5 = B */
    DO_RGB_TO_YCC t3, t4, t5, t6

    mtlo        s5, $ac0
    mtlo        t8, $ac1
    mtlo        t8, $ac2
    maddu       $ac0, s2, t5
    maddu       $ac1, s5, t5
    maddu       $ac2, s5, t3
    maddu       $ac0, s0, t3
    maddu       $ac1, s3, t3
    maddu       $ac2, s6, t4
    maddu       $ac0, s1, t4
    maddu       $ac1, s4, t4
    maddu       $ac2, s7, t5
    extr.w      t3, $ac0, 16
    extr.w      t4, $ac1, 16
    extr.w      t5, $ac2, 16
    sb          t3, 0(t0)
    sb          t4, 0(t1)
    sb          t5, 0(t2)
    addiu       t0, 1
    addiu       t2, 1
    bne         t2, t9, 1b
     addiu      t1, 1
    bgtz        t7, 0b              /* more rows? */
     addiu      a1, 4               /* ++input_buf */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_ycc_convert_mips_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*------------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3

/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_mips_dspr2
 * jsimd_ycc_extbgr_convert_mips_dspr2
 * jsimd_ycc_extrgbx_convert_mips_dspr2
 * jsimd_ycc_extbgrx_convert_mips_dspr2
 * jsimd_ycc_extxbgr_convert_mips_dspr2
 * jsimd_ycc_extxrgb_convert_mips_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 */

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs

/*
 * Store one pixel (scratch0 = R, scratch1 = G, scratch2 = B) and advance
 * outptr.  For 4-byte pixels the fourth byte is written as 0xFF, which
 * overwrites t0 after the colour bytes have been stored.
 */
.macro STORE_YCC_TO_RGB scratch0, scratch1, scratch2, outptr
    sb          \scratch0, \r_offs(\outptr)
    sb          \scratch1, \g_offs(\outptr)
    sb          \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li          t0, 0xFF
    sb          t0, \a_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - input_row
 * a3     - output_buf
 * 16(sp) - num_rows
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s1, 48(sp)          /* s1 = num_rows */
    li          t3, 0x8000          /* rounding term added before >> 16 */
    li          t4, 0x166e9         /* FIX(1.40200) */
    li          t5, 0x1c5a2         /* FIX(1.77200) */
    li          t6, 0xffff492e      /* -FIX(0.71414) */
    li          t7, 0xffffa7e6      /* -FIX(0.34414) */
    repl.ph     t8, 128             /* t8 = 128 in both halfwords */

0:
    /* per-row setup */
    lw          s0, 0(a3)           /* s0 = outptr = *output_buf */
    lw          t0, 0(a1)
    lw          t1, 4(a1)
    lw          t2, 8(a1)
    sll         s5, a2, 2
    addiu       s1, -1              /* --num_rows */
    lwx         s2, s5(t0)          /* s2 = input_buf[0][input_row] */
    lwx         s3, s5(t1)          /* s3 = input_buf[1][input_row] */
    lwx         s4, s5(t2)          /* s4 = input_buf[2][input_row] */
    addu        t9, s2, a0          /* t9 = end address (first plane) */
    addiu       a2, 1               /* ++input_row */

1:
    lbu         s7, 0(s4)           /* cr */
    lbu         s6, 0(s3)           /* cb */
    lbu         s5, 0(s2)           /* y */
    addiu       s2, 1
    addiu       s4, 1
    addiu       s7, -128            /* cr - 128 */
    addiu       s6, -128            /* cb - 128 */
    mul         t2, t7, s6
    mul         t0, t6, s7          /* Crgtab[cr] */
    sll         s7, 15
    mulq_rs.w   t1, t4, s7          /* Crrtab[cr] */
    sll         s6, 15
    addu        t2, t3              /* Cbgtab[cb] */
    addu        t2, t0

    mulq_rs.w   t0, t5, s6          /* Cbbtab[cb] */
    sra         t2, 16
    addu        t1, s5              /* t1 = y + red term */
    addu        t2, s5              /* t2 = y + green term */
    ins         t2, t1, 16, 16      /* t2 = red | green halfwords */
    subu.ph     t2, t2, t8
    addu        t0, s5              /* t0 = y + blue term */
    shll_s.ph   t2, t2, 8           /* saturating shift up, then back down, */
    subu        t0, 128
    shra.ph     t2, t2, 8           /* clamps each halfword to -128..127 */
    shll_s.w    t0, t0, 24          /* same clamp for blue, done on a word */
    addu.ph     t2, t2, t8          /* back to the 0..255 range */
    sra         t0, t0, 24
    sra         t1, t2, 16          /* t1 = clamped red */
    addiu       t0, 128             /* t0 = clamped blue */

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne         s2, t9, 1b
     addiu      s3, 1
    bgtz        s1, 0b              /* more rows? */
     addiu      a3, 4               /* ++output_buf */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_ycc_\colorid\()_convert_mips_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*------------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0

/*****************************************************************************/
/*
 * jsimd_extrgb_gray_convert_mips_dspr2
 * jsimd_extbgr_gray_convert_mips_dspr2
 * jsimd_extrgbx_gray_convert_mips_dspr2
 * jsimd_extbgrx_gray_convert_mips_dspr2
 * jsimd_extxbgr_gray_convert_mips_dspr2
 * jsimd_extxrgb_gray_convert_mips_dspr2
 *
 * Colorspace conversion RGB -> GRAY
 */

.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

/* Load the R, G and B bytes of one pixel, then step inptr to the next pixel. */
.macro DO_RGB_TO_GRAY r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - output_buf
 * a3     - output_row
 * 16(sp) - num_rows
 *
 * Gray = (s7 + s2*B + s1*G + s0*R) >> 16, low byte stored.  Loop 1 converts
 * four pixels per iteration using $ac0 and $ac1 alternately; loop 3 converts
 * the (image_width & 3) leftover pixels one at a time.
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li          s0, 0x4c8b          /* s0 = FIX(0.29900) */
    li          s1, 0x9646          /* s1 = FIX(0.58700) */
    li          s2, 0x1d2f          /* s2 = FIX(0.11400) */
    li          s7, 0x8000          /* s7 = FIX(0.50000) */
    lw          s6, 48(sp)          /* s6 = num_rows */
    andi        t7, a0, 3           /* t7 = image_width & 3 (residual) */

0:
    /* per-row setup */
    addiu       s6, -1              /* --num_rows */
    lw          t0, 0(a1)           /* t0 = inptr = *input_buf */
    lw          t1, 0(a2)
    sll         t3, a3, 2
    lwx         t1, t3(t1)          /* t1 = outptr = output_buf[0][output_row] */
    addiu       a3, 1               /* ++output_row */
    addu        t9, t1, a0          /* t9 = end of the output row */
    subu        t8, t9, t7          /* t8 = end of the 4-pixel part */
    beq         t1, t8, 2f          /* fewer than 4 pixels in the row */
     nop

1:
    DO_RGB_TO_GRAY t3, t4, t5, t0   /* pixel 0 */
    DO_RGB_TO_GRAY s3, s4, s5, t0   /* pixel 1 */

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t6, $ac0, 16        /* t6 = gray of pixel 0 */

    DO_RGB_TO_GRAY t3, t4, t5, t0   /* pixel 2 */
    DO_RGB_TO_GRAY s3, s4, s5, t0   /* pixel 3 */

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    extr.w      t2, $ac1, 16        /* t2 = gray of pixel 1 */
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t5, $ac0, 16        /* t5 = gray of pixel 2 */
    sb          t6, 0(t1)
    sb          t2, 1(t1)
    extr.w      t3, $ac1, 16        /* t3 = gray of pixel 3 */
    addiu       t1, 4
    sb          t5, -2(t1)
    sb          t3, -1(t1)
    bne         t1, t8, 1b
     nop

2:
    beqz        t7, 4f              /* no residual pixels */
     nop

3:
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    extr.w      t6, $ac0, 16
    sb          t6, 0(t1)
    addiu       t1, 1
    bne         t1, t9, 3b
     nop

4:
    bgtz        s6, 0b              /* more rows? */
     addiu      a1, 4               /* ++input_buf */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_gray_convert_mips_dspr2)

.purgem DO_RGB_TO_GRAY

.endm

/*------------------------------------------id --  pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_mips_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
 *
 * Merged h2v2 upsample routines
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, pixel_size, \
                                                r1_offs, g1_offs, b1_offs, a1_offs, \
                                                r2_offs, g2_offs, b2_offs, a2_offs

/*
 * Store two horizontally adjacent pixels (scratch0..2 = R, G, B of the first,
 * scratch3..5 = R, G, B of the second) and advance outptr past both.  With
 * 8-byte pixel pairs the two extra bytes are written as 0xFF; scratch0 is
 * overwritten for that purpose after it has been stored.
 */
.macro STORE_H2V2_2_PIXELS scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          \scratch0, 0xFF
    sb          \scratch0, \a1_offs(\outptr)
    sb          \scratch0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/*
 * Store a single pixel at outptr without advancing it.  With 8-byte pixel
 * pairs the extra byte is written as 0xFF through t0.
 */
.macro STORE_H2V2_1_PIXEL scratch0, scratch1, scratch2, outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0     - cinfo->output_width
 * a1     - input_buf
 * a2     - in_row_group_ctr
 * a3     - output_buf
 * 16(sp) - cinfo->sample_range_limit
 *
 * One chroma sample (cb, cr) is shared by a 2x2 group of luma samples.  For
 * every chroma sample the three chroma terms are computed once:
 *   s4 = cred, s5 = cgreen, s6 = cblue
 * and each of the four y values is combined with them through a byte lookup
 * in the range-limit table at t9.  AT and ra are used as plain scratch
 * registers inside the loop (ra is saved and restored with the s registers).
 */

    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw          t9, 56(sp)          /* t9 = cinfo->sample_range_limit */
    lw          v0, 0(a1)
    lw          v1, 4(a1)
    lw          t0, 8(a1)
    sll         t1, a2, 3
    addiu       t2, t1, 4
    sll         t3, a2, 2
    lw          t4, 0(a3)           /* t4 = output_buf[0] */
    lwx         t1, t1(v0)          /* t1 = input_buf[0][in_row_group_ctr*2] */
    lwx         t2, t2(v0)          /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
    lwx         t5, t3(v1)          /* t5 = input_buf[1][in_row_group_ctr] */
    lwx         t6, t3(t0)          /* t6 = input_buf[2][in_row_group_ctr] */
    lw          t7, 4(a3)           /* t7 = output_buf[1] */
    li          s1, 0xe6ea
    addiu       t8, s1, 0x7fff      /* t8 = 0x166e9 [FIX(1.40200)] */
    addiu       s0, t8, 0x5eb9      /* s0 = 0x1c5a2 [FIX(1.77200)] */
    addiu       s1, zero, 0xa7e6    /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
    xori        s2, s1, 0xeec8      /* s2 = 0xffff492e [-FIX(0.71414)] */
    srl         t3, a0, 1           /* t3 = number of full pixel pairs */
    blez        t3, 2f
     addu       t0, t5, t3          /* t0 = end address (cb row) */
 1:
    lbu         t3, 0(t5)
    lbu         s3, 0(t6)
    addiu       t5, t5, 1
    addiu       t3, t3, -128        /* (cb - 128) */
    addiu       s3, s3, -128        /* (cr - 128) */
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    mulq_rs.w   s4, t8, s3          /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
    extr_r.w    s5, $ac1, 16        /* s5 = cgreen (rounded >> 16) */
    mulq_rs.w   s6, s0, t3          /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */

    /* upper row, first pixel of the pair */
    lbu         v0, 0(t1)
    addiu       t6, t6, 1
    addiu       t1, t1, 2
    addu        t3, v0, s4          /* y+cred */
    addu        s3, v0, s5          /* y+cgreen */
    addu        v1, v0, s6          /* y+cblue */
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)

    /* upper row, second pixel of the pair */
    lbu         v0, -1(t1)
    addu        t3, v0, s4          /* y+cred */
    addu        s3, v0, s5          /* y+cgreen */
    addu        v1, v0, s6          /* y+cblue */
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)           /* preload y for the lower row */

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    /* lower row, first pixel of the pair */
    addu        t3, v0, s4          /* y+cred */
    addu        s3, v0, s5          /* y+cgreen */
    addu        v1, v0, s6          /* y+cblue */
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)

    /* lower row, second pixel of the pair */
    lbu         v0, 1(t2)
    addiu       t2, t2, 2
    addu        t3, v0, s4          /* y+cred */
    addu        s3, v0, s5          /* y+cgreen */
    addu        v1, v0, s6          /* y+cblue */
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne         t0, t5, 1b
     nop
2:
    /* odd output width: one more column, a single pixel in each row */
    andi        t0, a0, 1
    beqz        t0, 4f
     lbu        t3, 0(t5)
    lbu         s3, 0(t6)
    addiu       t3, t3, -128        /* (cb - 128) */
    addiu       s3, s3, -128        /* (cr - 128) */
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    lbu         v0, 0(t1)
    extr_r.w    s5, $ac1, 16
    mulq_rs.w   s4, t8, s3          /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
    mulq_rs.w   s6, s0, t3          /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
    addu        t3, v0, s4          /* y+cred */
    addu        s3, v0, s5          /* y+cgreen */
    addu        v1, v0, s6          /* y+cblue */
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)           /* y for the lower row */

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu        t3, v0, s4          /* y+cred */
    addu        s3, v0, s5          /* y+cgreen */
    addu        v1, v0, s6          /* y+cblue */
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_mips_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
 *
 * Merged h2v1 upsample routines
 */

.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, pixel_size, \
                                                r1_offs, g1_offs, b1_offs, a1_offs, \
                                                r2_offs, g2_offs, b2_offs, a2_offs

/*
 * Store two horizontally adjacent pixels (scratch0..2 = R, G, B of the first,
 * scratch3..5 = R, G, B of the second) and advance outptr past both.  With
 * 8-byte pixel pairs the two extra bytes are written as 0xFF through t0.
 */
.macro STORE_H2V1_2_PIXELS scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
    sb          t0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/*
 * Store a single pixel at outptr without advancing it.  With 8-byte pixel
 * pairs the extra byte is written as 0xFF through t0.
 */
.macro STORE_H2V1_1_PIXEL scratch0, scratch1, scratch2, outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0     - cinfo->output_width
 * a1     - input_buf
 * a2     - in_row_group_ctr
 * a3     - output_buf
 * 16(sp) - range_limit
 *
 * One chroma sample (cb, cr) is shared by two horizontally adjacent luma
 * samples.  The chroma terms are computed once per pair and each y value is
 * combined with them through a byte lookup in the range-limit table at t8.
 * ra is used as a plain scratch register inside the loop (it is saved and
 * restored with the s registers).
 */

    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li          t0, 0xe6ea
    lw          t1, 0(a1)           /* t1 = input_buf[0] */
    lw          t2, 4(a1)           /* t2 = input_buf[1] */
    lw          t3, 8(a1)           /* t3 = input_buf[2] */
    lw          t8, 56(sp)          /* t8 = range_limit */
    addiu       s1, t0, 0x7fff      /* s1 = 0x166e9 [FIX(1.40200)] */
    addiu       s2, s1, 0x5eb9      /* s2 = 0x1c5a2 [FIX(1.77200)] */
    addiu       s0, t0, 0x9916      /* s0 = 0x8000 */
    addiu       s4, zero, 0xa7e6    /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
    xori        s3, s4, 0xeec8      /* s3 = 0xffff492e [-FIX(0.71414)] */
    srl         t0, a0, 1           /* t0 = number of full pixel pairs */
    sll         t4, a2, 2
    lwx         s5, t4(t1)          /* s5 = inptr0 */
    lwx         s6, t4(t2)          /* s6 = inptr1 */
    lwx         s7, t4(t3)          /* s7 = inptr2 */
    lw          t7, 0(a3)           /* t7 = outptr */
    blez        t0, 2f
     addu       t9, s6, t0          /* t9 = end address (inptr1 row) */
1:
    lbu         t2, 0(s6)           /* t2 = cb */
    lbu         t0, 0(s7)           /* t0 = cr */
    lbu         t1, 0(s5)           /* t1 = y */
    addiu       t2, t2, -128        /* t2 = cb - 128 */
    addiu       t0, t0, -128        /* t0 = cr - 128 */
    mult        $ac1, s4, t2
    madd        $ac1, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0          /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
    extr_r.w    t5, $ac1, 16        /* t5 = cgreen (rounded >> 16) */
    mulq_rs.w   t6, s2, t2          /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
    addiu       s7, s7, 1
    addiu       s6, s6, 1

    /* first pixel of the pair */
    addu        t2, t1, t0          /* t2 = y + cred */
    addu        t3, t1, t5          /* t3 = y + cgreen */
    addu        t4, t1, t6          /* t4 = y + cblue */
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t1, 1(s5)           /* t1 = y of the second pixel */
    lbu         v0, 0(t2)
    lbu         v1, 0(t3)
    lbu         ra, 0(t4)

    /* second pixel of the pair */
    addu        t2, t1, t0
    addu        t3, t1, t5
    addu        t4, t1, t6
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne         t9, s6, 1b
     addiu      s5, s5, 2
2:
    /* odd output width: one more pixel */
    andi        t0, a0, 1
    beqz        t0, 4f
     nop
3:
    lbu         t2, 0(s6)
    lbu         t0, 0(s7)
    lbu         t1, 0(s5)
    addiu       t2, t2, -128        /* (cb - 128) */
    addiu       t0, t0, -128        /* (cr - 128) */
    mul         t3, s4, t2
    mul         t4, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0          /* (C1*cr + ONE_HALF)>> SCALEBITS */
    mulq_rs.w   t6, s2, t2          /* (C2*cb + ONE_HALF)>> SCALEBITS */
    addu        t3, t3, s0
    addu        t3, t4, t3
    sra         t5, t3, 16          /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
    addu        t2, t1, t0          /* y + cred */
    addu        t3, t1, t5          /* y + cgreen */
    addu        t4, t1, t6          /* y + cblue */
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_mips_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 */
LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * Every input row produces two output rows: the first blends it with the
 * row above (input_data[-1]), the second with the row below (input_data[1]).
 * For each column a vertical sum colsum = 3 * inptr0[col] + inptr1[col] is
 * formed, then two output samples are produced per column:
 *   (3 * colsum + previous colsum + 8) >> 4
 *   (3 * colsum + next colsum     + 7) >> 4
 * The first and last columns use 4 * colsum in place of the missing
 * neighbour.
 *
 * Columns are handled as: first column, then pairs of columns (loop 2),
 * then an optional single column (loop 3), then the last column (label 4).
 */

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li          s4, 0               /* s4 = byte offset of the output row */
    lw          s2, 0(a3)           /* s2 = *output_data_ptr */
0:
    li          t9, 2               /* two output rows per input row */
    lw          s1, -4(a2)          /* s1 = inptr1 (row above) */

1:
    lw          s0, 0(a2)           /* s0 = inptr0 */
    lwx         s3, s4(s2)          /* s3 = outptr */
    addiu       s5, a1, -2          /* s5 = downsampled_width - 2 */
    srl         t4, s5, 1
    sll         t4, t4, 1           /* t4 = (width - 2) rounded down to even */
    lbu         t0, 0(s0)
    lbu         t1, 1(s0)
    lbu         t2, 0(s1)
    lbu         t3, 1(s1)
    addiu       s0, 2
    addiu       s1, 2
    addu        t8, s0, t4          /* t8 = end address of the pair loop */
    andi        s5, s5, 1           /* s5 = residual */
    sll         t4, t0, 1
    sll         t6, t1, 1
    addu        t0, t0, t4          /* t0 = (*inptr0++) * 3 */
    addu        t1, t1, t6          /* t1 = (*inptr0++) * 3 */
    addu        t7, t0, t2          /* t7 = thiscolsum */
    addu        t6, t1, t3          /* t6 = nextcolsum */
    sll         t0, t7, 2           /* t0 = thiscolsum * 4 */
    subu        t1, t0, t7          /* t1 = thiscolsum * 3 */
    shra_r.w    t0, t0, 4           /* (thiscolsum * 4 + 8) >> 4 */
    addiu       t1, 7
    addu        t1, t1, t6
    srl         t1, t1, 4           /* (thiscolsum * 3 + nextcolsum + 7) >> 4 */
    sb          t0, 0(s3)
    sb          t1, 1(s3)
    /*
     * The pair loop below always runs at least once and only stops when
     * s0 == t8.  When downsampled_width is 2 or 3 there is no pair to
     * process (t8 == s0 already), so it must be skipped or it would never
     * hit its end address.
     */
    beq         t8, s0, 22f
     addiu      s3, 2
2:
    /* on entry: t7 = colsum of the previous column, t6 = this column */
    lh          t0, 0(s0)           /* t0 = A3|A2 */
    lh          t2, 0(s1)           /* t2 = B3|B2 */
    addiu       s0, 2
    addiu       s1, 2
    preceu.ph.qbr t0, t0            /* t0 = 0|A3|0|A2 */
    preceu.ph.qbr t2, t2            /* t2 = 0|B3|0|B2 */
    shll.ph     t1, t0, 1
    sll         t3, t6, 1
    addu.ph     t0, t1, t0          /* t0 = A3*3|A2*3 */
    addu        t3, t3, t6          /* t3 = this * 3 */
    addu.ph     t0, t0, t2          /* t0 = next2|next1 */
    addu        t1, t3, t7
    andi        t7, t0, 0xFFFF      /* t7 = next1 */
    sll         t2, t7, 1
    addu        t2, t7, t2          /* t2 = next1*3 */
    addu        t4, t2, t6
    srl         t6, t0, 16          /* t6 = next2 */
    shra_r.w    t1, t1, 4           /* t1 = (this*3 + last + 8) >> 4 */
    addu        t0, t3, t7
    addiu       t0, 7
    srl         t0, t0, 4           /* t0 = (this*3 + next1 + 7) >> 4 */
    shra_r.w    t4, t4, 4           /* t4 = (next1*3 + this + 8) >> 4 */
    addu        t2, t2, t6
    addiu       t2, 7
    srl         t2, t2, 4           /* t2 = (next1*3 + next2 + 7) >> 4 */
    sb          t1, 0(s3)
    sb          t0, 1(s3)
    sb          t4, 2(s3)
    sb          t2, 3(s3)
    bne         t8, s0, 2b
     addiu      s3, 4
22:
    beqz        s5, 4f              /* no single leftover column */
     addu       t8, s0, s5
3:
    lbu         t0, 0(s0)
    lbu         t2, 0(s1)
    addiu       s0, 1
    addiu       s1, 1
    sll         t3, t6, 1
    sll         t1, t0, 1
    addu        t1, t0, t1          /* t1 = inptr0 * 3 */
    addu        t3, t3, t6          /* t3 = thiscolsum * 3 */
    addu        t5, t1, t2          /* t5 = nextcolsum */
    addu        t1, t3, t7
    shra_r.w    t1, t1, 4
    addu        t0, t3, t5
    addiu       t0, 7
    srl         t0, t0, 4
    sb          t1, 0(s3)
    sb          t0, 1(s3)
    addiu       s3, 2
    move        t7, t6
    bne         t8, s0, 3b
     move       t6, t5
4:
    /* last column: t6 = its colsum, t7 = colsum of the column before it */
    sll         t0, t6, 2           /* t0 = thiscolsum * 4 */
    subu        t1, t0, t6          /* t1 = thiscolsum * 3 */
    addu        t1, t1, t7
    addiu       s4, 4               /* next output row pointer */
    shra_r.w    t1, t1, 4
    addiu       t0, 7
    srl         t0, t0, 4
    sb          t1, 0(s3)
    sb          t0, 1(s3)
    addiu       t9, -1
    addiu       s3, 2
    bnez        t9, 1b              /* second pass uses the row below */
     lw         s1, 4(a2)
    srl         t0, s4, 2           /* t0 = output rows produced so far */
    subu        t0, a0, t0
    bgtz        t0, 0b
     addiu      a2, 4               /* ++input_data */

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j           ra
     nop
END(jsimd_h2v2_fancy_upsample_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * Each input sample P[c] yields two output samples:
 *   left  = (3 * P[c] + P[c-1] + 1) >> 2
 *   right = (3 * P[c] + P[c+1] + 2) >> 2
 * except that the very first and very last output samples are plain copies
 * of the first and last input samples.
 *
 * Columns are handled as: first column, then groups of four columns with
 * packed halfword arithmetic (loop 1), then up to three single columns
 * (loop 2), then the last column (label 22).
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz        a0, 3f              /* no rows to process */
     sll        t0, a0, 2
    lw          s1, 0(a3)           /* s1 = *output_data_ptr */
    li          s3, 0x10001         /* s3 = 1 in both halfwords */
    addu        s0, s1, t0          /* s0 = end of the output row pointers */
0:
    addiu       t8, a1, -2          /* t8 = downsampled_width - 2 */
    srl         t9, t8, 2           /* t9 = number of 4-column groups */
    lw          t7, 0(a2)           /* t7 = inptr */
    lw          s2, 0(s1)           /* s2 = outptr */
    lbu         t0, 0(t7)
    lbu         t1, 1(t7)           /* t1 = inptr[1] */
    sll         t2, t0, 1
    addu        t2, t2, t0          /* t2 = invalue*3 */
    addu        t2, t2, t1
    shra_r.w    t2, t2, 2           /* (invalue*3 + inptr[1] + 2) >> 2 */
    sb          t0, 0(s2)
    sb          t2, 1(s2)
    beqz        t9, 11f
     addiu      s2, 2
1:
    /* columns 1..4 relative to t7, using P0..P5 */
    ulw         t0, 0(t7)           /* t0 = |P3|P2|P1|P0| */
    ulw         t1, 1(t7)           /* t1 = |P4|P3|P2|P1| */
    ulh         t2, 4(t7)           /* t2 = |0|0|P5|P4| */
    preceu.ph.qbl t3, t0            /* t3 = |0|P3|0|P2| */
    preceu.ph.qbr t0, t0            /* t0 = |0|P1|0|P0| */
    preceu.ph.qbr t2, t2            /* t2 = |0|P5|0|P4| */
    preceu.ph.qbl t4, t1            /* t4 = |0|P4|0|P3| */
    preceu.ph.qbr t1, t1            /* t1 = |0|P2|0|P1| */
    shll.ph     t5, t4, 1
    shll.ph     t6, t1, 1
    addu.ph     t5, t5, t4          /* t5 = |P4*3|P3*3| */
    addu.ph     t6, t6, t1          /* t6 = |P2*3|P1*3| */
    addu.ph     t4, t3, s3          /* previous sample + 1 */
    addu.ph     t0, t0, s3
    addu.ph     t4, t4, t5
    addu.ph     t0, t0, t6
    shrl.ph     t4, t4, 2           /* t4 = left outputs of columns 4|3 */
    shrl.ph     t0, t0, 2           /* t0 = left outputs of columns 2|1 */
    addu.ph     t2, t2, t5
    addu.ph     t3, t3, t6
    shra_r.ph   t2, t2, 2           /* t2 = right outputs of columns 4|3 */
    shra_r.ph   t3, t3, 2           /* t3 = right outputs of columns 2|1 */
    shll.ph     t2, t2, 8
    shll.ph     t3, t3, 8
    or          t2, t4, t2          /* interleave left/right, columns 3 and 4 */
    or          t3, t3, t0          /* interleave left/right, columns 1 and 2 */
    addiu       t9, -1
    usw         t3, 0(s2)
    usw         t2, 4(s2)
    addiu       s2, 8
    bgtz        t9, 1b
     addiu      t7, 4
11:
    andi        t8, 3               /* t8 = leftover single columns */
    beqz        t8, 22f
     addiu      t7, 1

2:
    lbu         t0, 0(t7)
    addiu       t7, 1
    sll         t1, t0, 1
    addu        t2, t0, t1          /* t2 = invalue * 3 */
    lbu         t3, -2(t7)          /* previous sample */
    lbu         t4, 0(t7)           /* next sample */
    addiu       t3, 1
    addiu       t4, 2
    addu        t3, t3, t2
    addu        t4, t4, t2
    srl         t3, 2
    srl         t4, 2
    sb          t3, 0(s2)
    sb          t4, 1(s2)
    addiu       t8, -1
    bgtz        t8, 2b
     addiu      s2, 2

22:
    /* last column */
    lbu         t0, 0(t7)
    lbu         t2, -1(t7)
    sll         t1, t0, 1
    addu        t1, t1, t0          /* t1 = invalue * 3 */
    addu        t1, t1, t2
    addiu       t1, 1
    srl         t1, t1, 2
    sb          t1, 0(s2)
    sb          t0, 1(s2)
    addiu       s1, 4               /* next output row pointer */
    bne         s1, s0, 0b
     addiu      a2, 4               /* ++input_data */
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_h2v1_fancy_upsample_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor
 * a2     - compptr->v_samp_factor
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * Each output sample is the average of two horizontally adjacent input
 * samples, with a bias that alternates 0, 1, 0, 1, ... along the row.  The
 * output row is then padded up to width_in_blocks * 8 samples from the
 * last input sample.
 *
 * Registers that must survive from one row to the next:
 *   s2 = number of 2-sample padding stores per row
 *   t9 = image_width & 2 (one extra padding byte is needed when set)
 * Everything else is recomputed or reloaded at label 0.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz        a2, 7f              /* no rows to process */
     lw         s1, 44(sp)          /* s1 = output_data */
    lw          s0, 40(sp)          /* s0 = input_data */
    srl         s2, a0, 2
    andi        t9, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2
    sll         t0, a3, 3           /* t0 = width_in_blocks*DCT */
    srl         t7, t0, 1
    subu        s2, t7, s2          /* s2 = padding stores per row */
0:
    andi        t6, a0, 1           /* t6 = temp_index */
    addiu       t6, -1              /* 0 if width is odd, -1 if even */
    lw          t4, 0(s1)           /* t4 = outptr */
    lw          t5, 0(s0)           /* t5 = inptr0 */
    li          s3, 0               /* s3 = bias */
    srl         t7, a0, 1           /* t7 = image_width1 */
    srl         s4, t7, 2           /* s4 = groups of four output samples */
    /*
     * Loop 1 always runs at least once, so skip it when the row has fewer
     * than four output samples; otherwise it would emit four samples that
     * do not exist and shift everything that follows.
     */
    blez        s4, 11f
     andi       t8, t7, 3           /* t8 = leftover output samples */
1:
    ulhu        t0, 0(t5)
    ulhu        t1, 2(t5)
    ulhu        t2, 4(t5)
    ulhu        t3, 6(t5)
    raddu.w.qb  t0, t0              /* sum of the two bytes of each pair */
    raddu.w.qb  t1, t1
    raddu.w.qb  t2, t2
    raddu.w.qb  t3, t3
    shra.ph     t0, t0, 1           /* bias 0 */
    shra_r.ph   t1, t1, 1           /* bias 1 (rounding shift) */
    shra.ph     t2, t2, 1
    shra_r.ph   t3, t3, 1
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t3, 3(t4)
    addiu       s4, -1
    addiu       t4, 4
    bgtz        s4, 1b
     addiu      t5, 8
11:
    beqz        t8, 3f
     addu       s4, t4, t8          /* s4 = end of the leftover samples */
2:
    ulhu        t0, 0(t5)
    raddu.w.qb  t0, t0
    addqh.w     t0, t0, s3          /* (sum + bias) >> 1 */
    xori        s3, s3, 1
    sb          t0, 0(t4)
    addiu       t4, 1
    bne         t4, s4, 2b
     addiu      t5, 2
3:
    /* padding value, taken from the last input sample of the row */
    lbux        t1, t6(t5)
    sll         t1, 1
    addqh.w     t2, t1, s3          /* t2 = pixval1 */
    xori        s3, s3, 1
    addqh.w     t3, t1, s3          /* t3 = pixval2 */
    blez        s2, 5f
     append     t3, t2, 8           /* t3 = pixval2 : pixval1 */
    /*
     * Count the padding stores on a per-row copy.  s2 itself has to keep its
     * value because it is only computed once, before the row loop.  t5 is
     * free here: the input pointer is reloaded at label 0.
     */
    move        t5, s2
4:
    ush         t3, 0(t4)
    addiu       t5, -1
    bgtz        t5, 4b
     addiu      t4, 2
5:
    beqz        t9, 6f
     nop
    sb          t2, 0(t4)           /* one more padding byte */
6:
    addiu       s1, 4               /* ++output_data */
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 4               /* ++input_data */
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j           ra
     nop
END(jsimd_h2v1_downsample_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)

/*
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor
 * a2     - compptr->v_samp_factor
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * Each output sample is the average of a 2x2 group of input samples taken
 * from two consecutive input rows, with a bias that alternates 1, 2, 1, 2,
 * ... along the row.  The output row is then padded up to
 * width_in_blocks * 8 samples from the last input column.
 *
 * Values computed once and kept for every row:
 *   t6 = 0 if image_width is odd, -1 if even (index of the last column)
 *   s4 = groups of four output samples per row
 *   t8 = leftover output samples per row
 *   s2 = number of 2-sample padding stores per row
 *   t9 = image_width & 2 (one extra padding byte is needed when set)
 * The loops therefore count on per-row copies (s3, s5, t3) so that rows
 * after the first see the same values as the first row.
 */
    .set at
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz        a2, 8f              /* no rows to process */
     lw         s1, 52(sp)          /* s1 = output_data */
    lw          s0, 48(sp)          /* s0 = input_data */

    andi        t6, a0, 1           /* t6 = temp_index */
    addiu       t6, -1
    srl         t7, a0, 1           /* t7 = image_width1 */
    srl         s4, t7, 2
    andi        t8, t7, 3
    andi        t9, a0, 2
    srl         s2, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2
    sll         t0, a3, 3           /* t0 = width_in_blocks*DCT */
    srl         t7, t0, 1
    subu        s2, t7, s2          /* s2 = padding stores per row */
0:
    lw          t4, 0(s1)           /* t4 = outptr */
    lw          t5, 0(s0)           /* t5 = inptr0 */
    lw          s7, 4(s0)           /* s7 = inptr1 */
    li          s6, 1               /* s6 = bias */
    /*
     * Loop 2 always runs at least once, so skip it when the row has fewer
     * than four output samples.
     */
    blez        s4, 22f
     move       s3, s4              /* s3 = per-row group counter */
2:
    ulw         t0, 0(t5)           /* t0 = |P3|P2|P1|P0| */
    ulw         t1, 0(s7)           /* t1 = |Q3|Q2|Q1|Q0| */
    ulw         t2, 4(t5)
    ulw         t3, 4(s7)
    precrq.ph.w t7, t0, t1          /* t7 = |P3|P2|Q3|Q2| */
    ins         t0, t1, 16, 16      /* t0 = |Q1|Q0|P1|P0| */
    raddu.w.qb  t1, t7
    raddu.w.qb  t0, t0
    shra_r.w    t1, t1, 2           /* (sum + 2) >> 2 */
    addiu       t0, 1
    srl         t0, 2               /* (sum + 1) >> 2 */
    precrq.ph.w t7, t2, t3
    ins         t2, t3, 16, 16
    raddu.w.qb  t7, t7
    raddu.w.qb  t2, t2
    shra_r.w    t7, t7, 2
    addiu       t2, 1
    srl         t2, 2
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t7, 3(t4)
    addiu       t4, 4
    addiu       t5, 8
    addiu       s3, s3, -1
    bgtz        s3, 2b
     addiu      s7, 8
22:
    beqz        t8, 4f
     addu       s5, t4, t8          /* s5 = end of the leftover samples */
3:
    ulhu        t0, 0(t5)
    ulhu        t1, 0(s7)
    ins         t0, t1, 16, 16
    raddu.w.qb  t0, t0
    addu        t0, t0, s6
    srl         t0, 2               /* (sum + bias) >> 2 */
    xori        s6, s6, 3           /* bias: 1 <-> 2 */
    sb          t0, 0(t4)
    addiu       t5, 2
    addiu       t4, 1
    bne         s5, t4, 3b
     addiu      s7, 2
4:
    /* padding value, taken from the last input column of both rows */
    lbux        t1, t6(t5)
    sll         t1, 1
    lbux        t0, t6(s7)
    sll         t0, 1
    addu        t1, t1, t0
    addu        t3, t1, s6
    srl         t0, t3, 2           /* t0 = pixval1 */
    xori        s6, s6, 3
    addu        t2, t1, s6
    srl         t1, t2, 2           /* t1 = pixval2 */
    blez        s2, 6f
     append     t1, t0, 8           /* t1 = pixval2 : pixval1 */
    move        t3, s2              /* t3 = per-row padding counter */
5:
    ush         t1, 0(t4)
    addiu       t3, -1
    bgtz        t3, 5b
     addiu      t4, 2
6:
    beqz        t9, 7f
     nop
    sb          t0, 0(t4)           /* one more padding byte */
7:
    addiu       s1, 4               /* ++output_data */
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 8               /* input_data += 2 rows */
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_h2v2_downsample_mips_dspr2)
/*****************************************************************************/
1316/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*
 * a0 - input_data
 * a1 - output_data
 * a2 - compptr->v_samp_factor
 * a3 - cinfo->max_v_samp_factor
 * 16(sp) - cinfo->smoothing_factor
 * 20(sp) - compptr->width_in_blocks
 * 24(sp) - cinfo->image_width
 *
 * 2h:2v downsampling with smoothing.  Each output sample is
 *     (membersum * t6 + neighsum * t7 + 32768) >> 16
 * where membersum is the sum of the 2x2 input block, neighsum is the
 * weighted sum of the surrounding samples (edge neighbours counted twice,
 * corner neighbours once), t6 = 16384 - smoothing_factor * 80 and
 * t7 = smoothing_factor * 16.
 *
 * Before the main loop, rows -1 .. max_v_samp_factor of input_data are
 * extended to 2 * output_cols samples by replicating their last sample.
 *
 * Main-loop registers:
 *   t4 = outrow, t5 = inrow (advances by 2 per output row)
 *   t8 = outptr, s2 = inptr0, t9 = inptr1, s0 = above_ptr, s1 = below_ptr
 */

    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s7, 52(sp)      // compptr->width_in_blocks
    lw          s0, 56(sp)      // cinfo->image_width
    lw          s6, 48(sp)      // cinfo->smoothing_factor
    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
    sll         v0, s7, 1
    subu        v0, v0, s0      // v0 = number of samples to replicate per row
    blez        v0, 2f
     move       v1, zero
    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
0:
    addiu       t1, a0, -4
    sll         t2, v1, 2
    lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
    move        t3, v0
    addu        t1, t1, s0
    lbu         t2, -1(t1)      // t2 = last real sample of the row
1:
    addiu       t3, t3, -1
    sb          t2, 0(t1)
    bgtz        t3, 1b
     addiu      t1, t1, 1
    addiu       v1, v1, 1
    bne         v1, t0, 0b
     nop
2:
    li          v0, 80
    mul         v0, s6, v0
    li          v1, 16384
    move        t4, zero        // t4 = outrow
    move        t5, zero        // t5 = inrow
    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
3:
/* Special case for first column: pretend column -1 is same as column 0 */
    sll         v0, t4, 2
    lwx         t8, v0(a1)      // outptr = output_data[outrow]
    sll         v1, t5, 2
    addiu       t9, v1, 4
    addiu       s0, v1, -4
    addiu       s1, v1, 8
    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum
    raddu.w.qb  s3, t0
    lbu         v0, 0(s2)
    lbu         v1, 2(s2)
    lbu         t0, 0(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, 0(s0)
    lbu         t0, 0(s1)
    sll         s3, s3, 1       // edge neighbours are counted twice
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7
    extr_r.w    v0, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    addiu       s1, s1, 2
    sb          v0, -1(t8)
    addiu       s4, s7, -2
    and         s4, s4, 3
    addu        s5, s4, t8      // end address of the one-column loop
4:
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    sb          t2, -1(t8)
    bne         s5, t8, 4b
     addiu      s1, s1, 2
    addiu       s5, s7, -2
    subu        s5, s5, s4
    addu        s5, s5, t8      // end address of the four-column loop
5:
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    lh          v1, 2(t9)
    addu        t0, t0, v0
    lh          v0, 2(s2)
    addu        s3, t0, s3
    lh          t0, 2(s0)
    lh          t1, 2(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 4(s2)
    lbu         t0, 1(t9)
    lbu         t1, 4(t9)
    sb          t2, 0(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 1(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 4(s0)
    addu        t0, t0, v0
    lbu         v0, 1(s0)
    addu        s3, t0, s3
    lbu         t0, 1(s1)
    lbu         t3, 4(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 4(t9)
    addu        t0, t0, v0
    lh          v0, 4(s2)
    addu        s3, t0, s3
    lh          t0, 4(s0)
    lh          t1, 4(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 6(s2)
    lbu         t0, 3(t9)
    lbu         t1, 6(t9)
    sb          t2, 1(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 3(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 6(s0)
    addu        t0, t0, v0
    lbu         v0, 3(s0)
    addu        s3, t0, s3
    lbu         t0, 3(s1)
    lbu         t3, 6(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 6(t9)
    addu        t0, t0, v0
    lh          v0, 6(s2)
    addu        s3, t0, s3
    lh          t0, 6(s0)
    lh          t1, 6(s1)
    madd        $ac1, s3, t7
    extr_r.w    t3, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 8(s2)
    lbu         t0, 5(t9)
    lbu         t1, 8(t9)
    sb          t3, 2(t8)
    raddu.w.qb  t2, v0
    lbu         v0, 5(s2)
    addu        t0, t0, t1
    mult        $ac1, t2, t6
    addu        v0, v0, v1
    lbu         t2, 8(s0)
    addu        t0, t0, v0
    lbu         v0, 5(s0)
    addu        s3, t0, s3
    lbu         t0, 5(s1)
    lbu         t3, 8(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    addiu       t8, t8, 4
    addu        t0, t0, v0
    addiu       s2, s2, 8
    addu        s3, t0, s3
    addiu       t9, t9, 8
    madd        $ac1, s3, t7
    extr_r.w    t1, $ac1, 16
    addiu       s0, s0, 8
    addiu       s1, s1, 8
    bne         s5, t8, 5b
     sb         t1, -1(t8)
/* Special case for last column */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 1(s2)
    lbu         t0, -1(t9)
    lbu         t1, 1(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 1(s0)
    addu        t0, t0, v0
    lbu         t3, 1(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t0, $ac1, 16
    sb          t0, 0(t8)
    addiu       t4, t4, 1       // outrow++
    bne         t4, a2, 3b
     addiu      t5, t5, 2       // inrow += 2 (exactly once per output row)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_h2v2_smooth_downsample_mips_dspr2)
DRC5ef46302014-05-18 20:04:47 +00001616
1617/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
/*
 * a0 - upsample->h_expand[compptr->component_index]
 * a1 - upsample->v_expand[compptr->component_index]
 * a2 - input_data
 * a3 - output_data_ptr
 * 16(sp) - cinfo->output_width
 * 20(sp) - cinfo->max_v_samp_factor
 *
 * Integer-factor upsampling.  For every input row, each sample is
 * replicated h_expand times into output_data[outrow]; that output row is
 * then copied into the following v_expand - 1 output rows.  outrow
 * advances by v_expand and inrow by one until outrow reaches
 * max_v_samp_factor.
 *
 * Registers:
 *   s0 = output_data (never modified)      s1 = output_width
 *   s2 = max_v_samp_factor                 s3 = outrow
 *   t6 = inrow * 4 (byte offset into the input row-pointer array)
 *   v0 = &output_data[outrow]              v1 = &output_data[copy target]
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw          s0, 0(a3)       // s0 = output_data
    lw          s1, 32(sp)      // s1 = cinfo->output_width
    lw          s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
    li          t6, 0           // t6 = inrow * sizeof(row pointer)
    blez        s2, 10f
     li         s3, 0           // s3 = outrow
0:
    sll         t7, s3, 2       // t7 = outrow * sizeof(row pointer)
    addu        t0, a2, t6      // t0 = &input_data[inrow]
    addu        v0, s0, t7      // v0 = &output_data[outrow]
    lw          t3, 0(t0)       // t3 = inptr
    lw          t8, 0(v0)       // t8 = outptr
    blez        s1, 9f          // empty row: nothing to expand or copy
     addu       t5, t8, s1      // t5 = outend
1:
    lbu         t2, 0(t3)       // t2 = invalue = *inptr++
    addiu       t3, 1
    beqz        a0, 3f
     move       t0, a0          // t0 = h_expand
2:
    sb          t2, 0(t8)
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b
     nop

    addiu       t9, a1, -1      // t9 = v_expand - 1 = rows still to fill
    blez        t9, 9f
     addiu      v1, v0, 4       // v1 = &output_data[outrow + 1]
5:
    lw          t3, 0(v0)       // t3 = source row (output_data[outrow])
    lw          t4, 0(v1)       // t4 = destination row
    subu        t0, s1, 0xF
    blez        t0, 7f          // fewer than 16 bytes: byte loop only
     addu       t5, t3, s1      // t5 = end address
    andi        t7, s1, 0xF     // t7 = residual
    subu        t8, t5, t7      // t8 = end of the 16-byte chunks
6:
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f
     nop
7:
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     addiu      v1, 4           // next destination row pointer
9:
    addu        s3, s3, a1      // outrow += v_expand
    slt         t0, s3, s2      // continue while outrow < max_v_samp_factor
    bnez        t0, 0b
     addiu      t6, 4           // inrow++ (scaled by pointer size)
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_int_upsample_mips_dspr2)
1703
DRC6a61c1e2014-05-14 15:00:10 +00001704/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * 2:1 horizontal upsampling: every input sample is written twice.
 * Each row is handled as a run of 16-output-byte chunks (8 input samples)
 * followed by a per-sample loop for the output_width & 0xf remainder.
 */
    sll         t0, a0, 2       // byte size of max_v_samp_factor row pointers
    lw          t7, 0(a3)       // t7 = output_data (current row-pointer slot)
    andi        t8, a1, 0xf     // t8 = remainder bytes per row
    blez        a0, 4f
     addu       t9, t7, t0      // t9 = one past the last row-pointer slot
0:
    /* Per-row setup. */
    lw          t6, 0(a2)       // t6 = inptr
    lw          t5, 0(t7)       // t5 = outptr
    addu        t3, t5, a1
    subu        t3, t3, t8      // t3 = outptr + (output_width - remainder)
    beq         t5, t3, 2f      // row has no full 16-byte chunk
     move       t4, t8          // t4 = remainder countdown
1:
    /* Eight input samples -> sixteen output samples. */
    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    srl         t1, t0, 16      // t1 = |0|0|P3|P2|
    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t0, t2, 16      // t0 = |0|0|P7|P6|
    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins         t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
    ins         t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t5, t5, 16
    bne         t5, t3, 1b
     addiu      t6, t6, 8
    beqz        t8, 3f          // row length was a multiple of 16
     move       t4, t8
2:
    /* Remainder: one input sample -> two output samples. */
    lbu         t1, 0(t6)
    addiu       t6, t6, 1
    addiu       t4, t4, -2
    sb          t1, 0(t5)
    sb          t1, 1(t5)
    bgtz        t4, 2b
     addiu      t5, t5, 2
3:
    addiu       t7, t7, 4       // next output row pointer
    bne         t9, t7, 0b
     addiu      a2, a2, 4       // next input row pointer
4:
    j           ra
     nop
END(jsimd_h2v1_upsample_mips_dspr2)
1762
1763/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * 2:1 horizontal and 2:1 vertical upsampling.  For every input row:
 *   1. each sample is written twice into output row N (labels 1/2);
 *   2. output row N is copied to output row N + 1 (labels 4/5).
 * Both phases run 16-byte chunks first and then a byte loop for the
 * output_width & 0xf remainder; both skip the chunk loop when the row is
 * shorter than 16 bytes.
 */
    lw          t7, 0(a3)       // t7 = output_data (current row-pointer slot)
    blez        a0, 7f
     andi       t9, a1, 0xf     // t9 = residual
0:
    lw          t6, 0(a2)       // t6 = inptr
    lw          t5, 0(t7)       // t5 = outptr
    addu        t8, t5, a1      // t8 = outptr end address
    subu        t8, t9          // t8 = end address - residual
    beq         t5, t8, 2f      // no full 16-byte chunk
     move       t4, t9
1:
    ulw         t0, 0(t6)
    srl         t1, t0, 16
    ins         t0, t0, 16, 16
    ins         t0, t0, 8, 16
    ins         t1, t1, 16, 16
    ins         t1, t1, 8, 16
    ulw         t2, 4(t6)
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t3, t2, 16
    ins         t2, t2, 16, 16
    ins         t2, t2, 8, 16
    ins         t3, t3, 16, 16
    ins         t3, t3, 8, 16
    usw         t2, 8(t5)
    usw         t3, 12(t5)
    addiu       t5, 16
    bne         t5, t8, 1b
     addiu      t6, 8
    beqz        t9, 3f
     move       t4, t9
2:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    sb          t0, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    ulw         t6, 0(t7)       // t6 = outptr (row just written)
    ulw         t5, 4(t7)       // t5 = outptr[1] (row to duplicate into)
    addu        t4, t6, a1      // t4 = new end address
    subu        t8, t4, t9      // t8 = end of the 16-byte chunks
    beq         t6, t8, 5f      // row shorter than 16 bytes: byte loop only
     nop
4:
    ulw         t0, 0(t6)
    ulw         t1, 4(t6)
    ulw         t2, 8(t6)
    usw         t0, 0(t5)
    ulw         t0, 12(t6)
    usw         t1, 4(t5)
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t6, 16
    bne         t6, t8, 4b
     addiu      t5, 16
    beqz        t9, 6f
     nop
5:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    addiu       t6, 1
    bne         t6, t4, 5b
     addiu      t5, 1
6:
    addiu       t7, 8           // two output rows consumed
    addiu       a0, -2
    bgtz        a0, 0b
     addiu      a2, 4           // one input row consumed
7:
    j           ra
     nop
END(jsimd_h2v2_upsample_mips_dspr2)
DRCd3131c12013-10-08 02:18:59 +00001847
1848/*****************************************************************************/
DRC34347862014-05-06 09:53:21 +00001849LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
1850/*
1851 * a0 - coef_block
1852 * a1 - compptr->dcttable
1853 * a2 - output
1854 * a3 - range_limit
1855 */
/*
 * Two-pass 8x8 inverse DCT.
 *
 * Pass 1 walks the eight columns of coef_block (a0 and a1 advance by one
 * 16-bit entry per iteration), multiplies each coefficient by its table
 * entry and writes eight 32-bit results per column into a 256-byte
 * workspace carved out of the stack (v0 advances by 4; rows are 32 bytes
 * apart).
 *
 * Pass 2 walks the eight workspace rows (v0 advances by 32), looks every
 * result up in the table at a3 after masking it to 10 bits, and stores
 * eight bytes through the row pointer loaded from 0(a2); a2 advances by 4
 * per row.
 *
 * Both passes take a shortcut when every term except the first one of the
 * column/row is zero.
 */
1856
1857 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1858
1859 addiu sp, sp, -256 // reserve the 8x8 word workspace
1860 move v0, sp // v0 = workspace write pointer
1861 addiu v1, zero, 8 // v1 = DCTSIZE = 8
// ---- Pass 1: columns from input, results to the workspace ----
18621:
1863 lh s4, 32(a0) // s4 = inptr[16]
1864 lh s5, 64(a0) // s5 = inptr[32]
1865 lh s6, 96(a0) // s6 = inptr[48]
1866 lh t1, 112(a0) // t1 = inptr[56]
1867 lh t7, 16(a0) // t7 = inptr[8]
1868 lh t5, 80(a0) // t5 = inptr[40]
1869 lh t3, 48(a0) // t3 = inptr[24]
// OR all seven AC terms together: zero means a DC-only column
1870 or s4, s4, t1
1871 or s4, s4, t3
1872 or s4, s4, t5
1873 or s4, s4, t7
1874 or s4, s4, s5
1875 or s4, s4, s6
1876 bnez s4, 2f
1877 addiu v1, v1, -1 // delay slot: one column consumed
// DC-only column: the same value goes to all eight workspace rows
1878 lh s5, 0(a1) // quantptr[DCTSIZE*0]
1879 lh s6, 0(a0) // inptr[DCTSIZE*0]
1880 mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
1881 sll s5, s5, 2
1882 sw s5, 0(v0)
1883 sw s5, 32(v0)
1884 sw s5, 64(v0)
1885 sw s5, 96(v0)
1886 sw s5, 128(v0)
1887 sw s5, 160(v0)
1888 sw s5, 192(v0)
1889 b 3f
1890 sw s5, 224(v0)
// General column: odd part first, then even part
18912:
1892 lh t0, 112(a1)
1893 lh t2, 48(a1)
1894 lh t4, 80(a1)
1895 lh t6, 16(a1)
1896 mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
1897 mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
1898 mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
1899 mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
1900 lh t4, 32(a1)
1901 lh t5, 32(a0)
1902 lh t6, 96(a1)
1903 lh t7, 96(a0)
1904 addu s0, t0, t1 // z3 = tmp0 + tmp2
1905 addu s1, t1, t2 // z2 = tmp1 + tmp2
1906 addu s2, t2, t3 // z4 = tmp1 + tmp3
1907 addu s3, s0, s2 // z3 + z4
1908 addiu t9, zero, 9633 // FIX_1_175875602
1909 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1910 addu t8, t0, t3 // z1 = tmp0 + tmp3
1911 addiu t9, zero, 2446 // FIX_0_298631336
1912 mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1913 addiu t9, zero, 16819 // FIX_2_053119869
1914 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1915 addiu t9, zero, 25172 // FIX_3_072711026
1916 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1917 addiu t9, zero, 12299 // FIX_1_501321110
1918 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1919 addiu t9, zero, 16069 // FIX_1_961570560
1920 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1921 addiu t9, zero, 3196 // FIX_0_390180644
1922 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1923 addiu t9, zero, 7373 // FIX_0_899976223
1924 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1925 addiu t9, zero, 20995 // FIX_2_562915447
1926 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
1927 subu s0, s3, s0 // z3 += z5
1928 addu t0, t0, s0 // tmp0 += z3
1929 addu t1, t1, s0 // tmp2 += z3
1930 subu s2, s3, s2 // z4 += z5
1931 addu t2, t2, s2 // tmp1 += z4
1932 addu t3, t3, s2 // tmp3 += z4
1933 subu t0, t0, t8 // tmp0 += z1
1934 subu t1, t1, s1 // tmp2 += z2
1935 subu t2, t2, s1 // tmp1 += z2
1936 subu t3, t3, t8 // tmp3 += z1
1937 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
1938 addiu t9, zero, 6270 // FIX_0_765366865
1939 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
1940 lh t4, 0(a1)
1941 lh t5, 0(a0)
1942 lh t6, 64(a1)
1943 lh t7, 64(a0)
1944 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
1945 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
1946 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
1947 addiu t9, zero, 4433 // FIX_0_541196100
1948 addu s3, s0, s1 // z2 + z3
1949 mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1950 addiu t9, zero, 15137 // FIX_1_847759065
1951 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
1952 addu t4, t5, t6
1953 subu t5, t5, t6
1954 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
1955 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
1956 addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1957 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
1958 addu s0, t4, t7
1959 subu s1, t4, t7
1960 addu s2, t5, t6
1961 subu s3, t5, t6
1962 addu t4, s0, t3
1963 subu s0, s0, t3
1964 addu t3, s2, t1
1965 subu s2, s2, t1
1966 addu t1, s3, t2
1967 subu s3, s3, t2
1968 addu t2, s1, t0
1969 subu s1, s1, t0
// Rounding right shift by 11 on all eight results, then store the column
1970 shra_r.w t4, t4, 11
1971 shra_r.w t3, t3, 11
1972 shra_r.w t1, t1, 11
1973 shra_r.w t2, t2, 11
1974 shra_r.w s1, s1, 11
1975 shra_r.w s3, s3, 11
1976 shra_r.w s2, s2, 11
1977 shra_r.w s0, s0, 11
1978 sw t4, 0(v0)
1979 sw t3, 32(v0)
1980 sw t1, 64(v0)
1981 sw t2, 96(v0)
1982 sw s1, 128(v0)
1983 sw s3, 160(v0)
1984 sw s2, 192(v0)
1985 sw s0, 224(v0)
19863:
1987 addiu a1, a1, 2 // next table column
1988 addiu a0, a0, 2 // next input column
1989 bgtz v1, 1b
1990 addiu v0, v0, 4 // delay slot: next workspace column
// ---- Pass 2: rows from the workspace, bytes to the output rows ----
1991 move v0, sp // rewind to the start of the workspace
1992 addiu v1, zero, 8
19934:
1994 lw t0, 8(v0) // z2 = (INT32) wsptr[2]
1995 lw t1, 24(v0) // z3 = (INT32) wsptr[6]
1996 lw t2, 0(v0) // (INT32) wsptr[0]
1997 lw t3, 16(v0) // (INT32) wsptr[4]
1998 lw s4, 4(v0) // (INT32) wsptr[1]
1999 lw s5, 12(v0) // (INT32) wsptr[3]
2000 lw s6, 20(v0) // (INT32) wsptr[5]
2001 lw s7, 28(v0) // (INT32) wsptr[7]
// OR wsptr[1..7] together: zero means a DC-only row
2002 or s4, s4, t0
2003 or s4, s4, t1
2004 or s4, s4, t3
2005 or s4, s4, s7
2006 or s4, s4, s5
2007 or s4, s4, s6
2008 bnez s4, 5f
2009 addiu v1, v1, -1 // delay slot: one row consumed
// DC-only row: one table lookup, replicated into all eight output bytes
2010 shra_r.w s5, t2, 5
2011 andi s5, s5, 0x3ff
2012 lbux s5, s5(a3)
2013 lw s1, 0(a2) // s1 = output row pointer
2014 replv.qb s5, s5
2015 usw s5, 0(s1)
2016 usw s5, 4(s1)
2017 b 6f
2018 nop
// General row: even part first, then odd part
20195:
2020 addu t4, t0, t1 // z2 + z3
2021 addiu t8, zero, 4433 // FIX_0_541196100
2022 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2023 addiu t8, zero, 15137 // FIX_1_847759065
2024 mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
2025 addiu t8, zero, 6270 // FIX_0_765366865
2026 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
2027 addu t4, t2, t3 // (INT32) wsptr[0] + (INT32) wsptr[4]
2028 subu t2, t2, t3 // (INT32) wsptr[0] - (INT32) wsptr[4]
2029 sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
2030 sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
2031 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
2032 subu t3, t2, t1 // tmp12 = tmp1 - tmp2
2033 addu t2, t2, t1 // tmp11 = tmp1 + tmp2
2034 addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2035 subu t1, t4, t5 // tmp13 = tmp0 - tmp3
2036 addu t0, t4, t5 // tmp10 = tmp0 + tmp3
2037 lw t4, 28(v0) // tmp0 = (INT32) wsptr[7]
2038 lw t6, 12(v0) // tmp2 = (INT32) wsptr[3]
2039 lw t5, 20(v0) // tmp1 = (INT32) wsptr[5]
2040 lw t7, 4(v0) // tmp3 = (INT32) wsptr[1]
2041 addu s0, t4, t6 // z3 = tmp0 + tmp2
2042 addiu t8, zero, 9633 // FIX_1_175875602
2043 addu s1, t5, t7 // z4 = tmp1 + tmp3
2044 addu s2, s0, s1 // z3 + z4
2045 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2046 addu s3, t4, t7 // z1 = tmp0 + tmp3
2047 addu t9, t5, t6 // z2 = tmp1 + tmp2
2048 addiu t8, zero, 16069 // FIX_1_961570560
2049 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
2050 addiu t8, zero, 3196 // FIX_0_390180644
2051 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
2052 addiu t8, zero, 2446 // FIX_0_298631336
2053 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2054 addiu t8, zero, 7373 // FIX_0_899976223
2055 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
2056 addiu t8, zero, 16819 // FIX_2_053119869
2057 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2058 addiu t8, zero, 20995 // FIX_2_562915447
2059 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
2060 addiu t8, zero, 25172 // FIX_3_072711026
2061 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2062 addiu t8, zero, 12299 // FIX_1_501321110
2063 mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2064 subu s0, s2, s0 // z3 += z5
2065 subu s1, s2, s1 // z4 += z5
2066 addu t4, t4, s0
2067 subu t4, t4, s3 // tmp0
2068 addu t5, t5, s1
2069 subu t5, t5, t9 // tmp1
2070 addu t6, t6, s0
2071 subu t6, t6, t9 // tmp2
2072 addu t7, t7, s1
2073 subu t7, t7, s3 // tmp3
2074 addu s0, t0, t7
2075 subu t0, t0, t7
2076 addu t7, t2, t6
2077 subu t2, t2, t6
2078 addu t6, t3, t5
2079 subu t3, t3, t5
2080 addu t5, t1, t4
2081 subu t1, t1, t4
// Rounding right shift by 18, mask to a 10-bit table index, look up, store
2082 shra_r.w s0, s0, 18
2083 shra_r.w t7, t7, 18
2084 shra_r.w t6, t6, 18
2085 shra_r.w t5, t5, 18
2086 shra_r.w t1, t1, 18
2087 shra_r.w t3, t3, 18
2088 shra_r.w t2, t2, 18
2089 shra_r.w t0, t0, 18
2090 andi s0, s0, 0x3ff
2091 andi t7, t7, 0x3ff
2092 andi t6, t6, 0x3ff
2093 andi t5, t5, 0x3ff
2094 andi t1, t1, 0x3ff
2095 andi t3, t3, 0x3ff
2096 andi t2, t2, 0x3ff
2097 andi t0, t0, 0x3ff
2098 lw s1, 0(a2) // s1 = output row pointer
2099 lbux s0, s0(a3)
2100 lbux t7, t7(a3)
2101 lbux t6, t6(a3)
2102 lbux t5, t5(a3)
2103 lbux t1, t1(a3)
2104 lbux t3, t3(a3)
2105 lbux t2, t2(a3)
2106 lbux t0, t0(a3)
2107 sb s0, 0(s1)
2108 sb t7, 1(s1)
2109 sb t6, 2(s1)
2110 sb t5, 3(s1)
2111 sb t1, 4(s1)
2112 sb t3, 5(s1)
2113 sb t2, 6(s1)
2114 sb t0, 7(s1)
21156:
2116 addiu v0, v0, 32 // next workspace row
2117 bgtz v1, 4b
2118 addiu a2, a2, 4 // delay slot: next output row pointer
2119 addiu sp, sp, 256 // release the workspace
2120
2121 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2122
2123 j ra
2124 nop
2125
2126END(jsimd_idct_islow_mips_dspr2)
2127
2128/*****************************************************************************/
DRCd3131c12013-10-08 02:18:59 +00002129LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
2130/*
2131 * a0 - inptr
2132 * a1 - quantptr
2133 * a2 - wsptr
2134 * a3 - mips_idct_ifast_coefs
2135 */
/*
 * Column pass of the fast IDCT.  Every iteration loads one 32-bit word
 * (two packed 16-bit values) from each of the eight rows at a0 and a1 and
 * processes both halves at once with packed-halfword DSP instructions.
 * a0, a1 and a2 advance by 4 bytes per iteration and the loop ends when
 * a0 reaches its starting value + 16, i.e. after four iterations.
 *
 * AT holds a copy of a3 and is used as the base of the constant table
 * (offsets 0, 4, 8 and 12).
 *
 * When rows 1..7 of the current word are all zero, the dequantized row-0
 * word is stored to all eight workspace rows and the butterfly is skipped.
 */
2136
2137 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2138
2139 addiu t9, a0, 16 // end address
2140 or AT, a3, zero // AT = constant table base
2141
21420:
2143 lw s0, 0(a1) // quantptr[DCTSIZE*0]
2144 lw t0, 0(a0) // inptr[DCTSIZE*0]
2145 lw t1, 16(a0) // inptr[DCTSIZE*1]
2146 muleq_s.w.phl v0, t0, s0 // tmp0 ...
2147 lw t2, 32(a0) // inptr[DCTSIZE*2]
2148 lw t3, 48(a0) // inptr[DCTSIZE*3]
2149 lw t4, 64(a0) // inptr[DCTSIZE*4]
2150 lw t5, 80(a0) // inptr[DCTSIZE*5]
2151 muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
2152 lw t6, 96(a0) // inptr[DCTSIZE*6]
2153 lw t7, 112(a0) // inptr[DCTSIZE*7]
// Three-stage test for "rows 1..7 all zero"; each delay slot does useful work
2154 or s4, t1, t2
2155 or s5, t3, t4
2156 bnez s4, 1f
2157 ins t0, v0, 16, 16 // ... tmp0
2158 bnez s5, 1f
2159 or s6, t5, t6
2160 or s6, s6, t7
2161 bnez s6, 1f
2162 sw t0, 0(a2) // wsptr[DCTSIZE*0]
// Shortcut: replicate the row-0 word into the remaining seven rows
2163 sw t0, 16(a2) // wsptr[DCTSIZE*1]
2164 sw t0, 32(a2) // wsptr[DCTSIZE*2]
2165 sw t0, 48(a2) // wsptr[DCTSIZE*3]
2166 sw t0, 64(a2) // wsptr[DCTSIZE*4]
2167 sw t0, 80(a2) // wsptr[DCTSIZE*5]
2168 sw t0, 96(a2) // wsptr[DCTSIZE*6]
2169 sw t0, 112(a2) // wsptr[DCTSIZE*7]
2170 addiu a0, a0, 4
2171 b 2f
2172 addiu a1, a1, 4
2173
// General case: dequantize the remaining rows and run the butterfly
21741:
2175 lw s1, 32(a1) // quantptr[DCTSIZE*2]
2176 lw s2, 64(a1) // quantptr[DCTSIZE*4]
2177 muleq_s.w.phl v0, t2, s1 // tmp1 ...
2178 muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
2179 lw s0, 16(a1) // quantptr[DCTSIZE*1]
2180 lw s1, 48(a1) // quantptr[DCTSIZE*3]
2181 lw s3, 96(a1) // quantptr[DCTSIZE*6]
2182 muleq_s.w.phl v1, t4, s2 // tmp2 ...
2183 muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
2184 lw s2, 80(a1) // quantptr[DCTSIZE*5]
2185 lw t8, 4(AT) // FIX(1.414213562)
2186 ins t2, v0, 16, 16 // ... tmp1
2187 muleq_s.w.phl v0, t6, s3 // tmp3 ...
2188 muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
2189 ins t4, v1, 16, 16 // ... tmp2
2190 addq.ph s4, t0, t4 // tmp10
2191 subq.ph s5, t0, t4 // tmp11
2192 ins t6, v0, 16, 16 // ... tmp3
2193 subq.ph s6, t2, t6 // tmp12 ...
2194 addq.ph s7, t2, t6 // tmp13
2195 mulq_s.ph s6, s6, t8 // ... tmp12 ...
2196 addq.ph t0, s4, s7 // tmp0
2197 subq.ph t6, s4, s7 // tmp3
2198 muleq_s.w.phl v0, t1, s0 // tmp4 ...
2199 muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
2200 shll_s.ph s6, s6, 1 // x2
2201 lw s3, 112(a1) // quantptr[DCTSIZE*7]
2202 subq.ph s6, s6, s7 // ... tmp12
2203 muleq_s.w.phl v1, t7, s3 // tmp7 ...
2204 muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
2205 ins t1, v0, 16, 16 // ... tmp4
2206 addq.ph t2, s5, s6 // tmp1
2207 subq.ph t4, s5, s6 // tmp2
2208 muleq_s.w.phl v0, t5, s2 // tmp6 ...
2209 muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
2210 ins t7, v1, 16, 16 // ... tmp7
2211 addq.ph s5, t1, t7 // z11
2212 subq.ph s6, t1, t7 // z12
2213 muleq_s.w.phl v1, t3, s1 // tmp5 ...
2214 muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
2215 ins t5, v0, 16, 16 // ... tmp6
2216 ins t3, v1, 16, 16 // ... tmp5
2217 addq.ph s7, t5, t3 // z13
2218 subq.ph v0, t5, t3 // z10
2219 addq.ph t7, s5, s7 // tmp7
2220 subq.ph s5, s5, s7 // tmp11 ...
2221 addq.ph v1, v0, s6 // z5 ...
2222 mulq_s.ph s5, s5, t8 // ... tmp11
2223 lw t8, 8(AT) // FIX(1.847759065)
2224 lw s4, 0(AT) // FIX(1.082392200)
2225 addq.ph s0, t0, t7 // tmp0 + tmp7 -> row 0
2226 subq.ph s1, t0, t7 // tmp0 - tmp7 -> row 7
2227 mulq_s.ph v1, v1, t8 // ... z5
2228 shll_s.ph s5, s5, 1 // x2
2229 lw t8, 12(AT) // FIX(-2.613125930)
2230 sw s0, 0(a2) // wsptr[DCTSIZE*0]
2231 shll_s.ph v0, v0, 1 // x4
2232 mulq_s.ph v0, v0, t8 // tmp12 ...
2233 mulq_s.ph s4, s6, s4 // tmp10 ...
2234 shll_s.ph v1, v1, 1 // x2
2235 addiu a0, a0, 4
2236 addiu a1, a1, 4
2237 sw s1, 112(a2) // wsptr[DCTSIZE*7]
2238 shll_s.ph s6, v0, 1 // x4
2239 shll_s.ph s4, s4, 1 // x2
2240 addq.ph s6, s6, v1 // ... tmp12
2241 subq.ph t5, s6, t7 // tmp6
2242 subq.ph s4, s4, v1 // ... tmp10
2243 subq.ph t3, s5, t5 // tmp5
2244 addq.ph s2, t2, t5 // tmp1 + tmp6 -> row 1
2245 addq.ph t1, s4, t3 // tmp4
2246 subq.ph s3, t2, t5 // tmp1 - tmp6 -> row 6
2247 sw s2, 16(a2) // wsptr[DCTSIZE*1]
2248 sw s3, 96(a2) // wsptr[DCTSIZE*6]
2249 addq.ph v0, t4, t3 // tmp2 + tmp5 -> row 2
2250 subq.ph v1, t4, t3 // tmp2 - tmp5 -> row 5
2251 sw v0, 32(a2) // wsptr[DCTSIZE*2]
2252 sw v1, 80(a2) // wsptr[DCTSIZE*5]
2253 addq.ph v0, t6, t1 // tmp3 + tmp4 -> row 4
2254 subq.ph v1, t6, t1 // tmp3 - tmp4 -> row 3
2255 sw v0, 64(a2) // wsptr[DCTSIZE*4]
2256 sw v1, 48(a2) // wsptr[DCTSIZE*3]
2257
22582:
2259 bne a0, t9, 0b
2260 addiu a2, a2, 4 // delay slot: next workspace word
2261
2262 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2263
2264 j ra
2265 nop
2266
2267END(jsimd_idct_ifast_cols_mips_dspr2)
2268
2269/*****************************************************************************/
2270LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
2271/*
2272 * a0 - wsptr
2273 * a1 - output_buf
2274 * a2 - output_col
2275 * a3 - mips_idct_ifast_coefs
2276 */
/*
 * Row pass of the fast IDCT.  Every iteration consumes two workspace rows
 * (32 bytes of 16-bit values; lower-case letters in the comments are the
 * first row, upper-case the second), transposes them into packed pairs and
 * produces eight output bytes for each of the two rows.  Output row
 * pointers are read from a1 (two per iteration) and offset by a2.  The loop
 * ends when a0 reaches its starting value + 128, i.e. after four
 * iterations.
 *
 * a3 is reused as a scratch output pointer, so it is pushed by
 * SAVE_REGS_ON_STACK and the constant-table pointer is reloaded into AT
 * from 36(sp) at the top of every iteration.
 * s8 = 0x80808080 is added to every packed output byte (addu.qb)
 * - presumably the +128 sample level shift; confirm against the C code.
 */
2277
2278 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2279
2280 addiu t9, a0, 128 // end address
2281 lui s8, 0x8080
2282 ori s8, s8, 0x8080 // s8 = 0x80808080, per-byte addend
2283
22840:
2285 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
2286 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
2287 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
2288 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
2289 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
2290 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
2291 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
2292 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
2293 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
2294 precrq.ph.w t1, s0, t0 // B b
2295 ins t0, s0, 16, 16 // A a
// Fall through to the shortcut only if every term except a/A is zero
2296 bnez t1, 1f
2297 or s0, t2, s2
2298 bnez s0, 1f
2299 or s0, t4, s4
2300 bnez s0, 1f
2301 or s0, t6, s6
2302 bnez s0, 1f
2303 shll_s.ph s0, t0, 2 // A a
// Shortcut: replicate each row's first term across its eight output bytes
2304 lw a3, 0(a1) // a3 = first output row pointer
2305 lw AT, 4(a1) // AT = second output row pointer
2306 precrq.ph.w t0, s0, s0 // A A
2307 ins s0, s0, 16, 16 // a a
2308 addu a3, a3, a2
2309 addu AT, AT, a2
2310 precrq.qb.ph t0, t0, t0 // A A A A
2311 precrq.qb.ph s0, s0, s0 // a a a a
2312 addu.qb s0, s0, s8
2313 addu.qb t0, t0, s8
2314 sw s0, 0(a3)
2315 sw s0, 4(a3)
2316 sw t0, 0(AT)
2317 sw t0, 4(AT)
2318 addiu a0, a0, 32
2319 bne a0, t9, 0b
2320 addiu a1, a1, 8
2321 b 2f
2322 nop
2323
// General case: finish the transpose, then run the butterfly on both rows
23241:
2325 precrq.ph.w t3, s2, t2
2326 ins t2, s2, 16, 16
2327 precrq.ph.w t5, s4, t4
2328 ins t4, s4, 16, 16
2329 precrq.ph.w t7, s6, t6
2330 ins t6, s6, 16, 16
2331 lw t8, 4(AT) // FIX(1.414213562)
2332 addq.ph s4, t0, t4 // tmp10
2333 subq.ph s5, t0, t4 // tmp11
2334 subq.ph s6, t2, t6 // tmp12 ...
2335 addq.ph s7, t2, t6 // tmp13
2336 mulq_s.ph s6, s6, t8 // ... tmp12 ...
2337 addq.ph t0, s4, s7 // tmp0
2338 subq.ph t6, s4, s7 // tmp3
2339 shll_s.ph s6, s6, 1 // x2
2340 subq.ph s6, s6, s7 // ... tmp12
2341 addq.ph t2, s5, s6 // tmp1
2342 subq.ph t4, s5, s6 // tmp2
2343 addq.ph s5, t1, t7 // z11
2344 subq.ph s6, t1, t7 // z12
2345 addq.ph s7, t5, t3 // z13
2346 subq.ph v0, t5, t3 // z10
2347 addq.ph t7, s5, s7 // tmp7
2348 subq.ph s5, s5, s7 // tmp11 ...
2349 addq.ph v1, v0, s6 // z5 ...
2350 mulq_s.ph s5, s5, t8 // ... tmp11
2351 lw t8, 8(AT) // FIX(1.847759065)
2352 lw s4, 0(AT) // FIX(1.082392200)
2353 addq.ph s0, t0, t7 // tmp0 + tmp7
2354 subq.ph s7, t0, t7 // tmp0 - tmp7
2355 mulq_s.ph v1, v1, t8 // ... z5
2356 lw a3, 0(a1) // a3 = first output row pointer
2357 lw t8, 12(AT) // FIX(-2.613125930)
2358 shll_s.ph s5, s5, 1 // x2
2359 addu a3, a3, a2
2360 shll_s.ph v0, v0, 1 // x4
2361 mulq_s.ph v0, v0, t8 // tmp12 ...
2362 mulq_s.ph s4, s6, s4 // tmp10 ...
2363 shll_s.ph v1, v1, 1 // x2
2364 addiu a0, a0, 32
2365 addiu a1, a1, 8
2366 shll_s.ph s6, v0, 1 // x4
2367 shll_s.ph s4, s4, 1 // x2
2368 addq.ph s6, s6, v1 // ... tmp12
2369 shll_s.ph s0, s0, 2
2370 subq.ph t5, s6, t7 // tmp6
2371 subq.ph s4, s4, v1 // ... tmp10
2372 subq.ph t3, s5, t5 // tmp5
2373 shll_s.ph s7, s7, 2
2374 addq.ph t1, s4, t3 // tmp4
2375 addq.ph s1, t2, t5 // tmp1 + tmp6
2376 subq.ph s6, t2, t5 // tmp1 - tmp6
2377 addq.ph s2, t4, t3 // tmp2 + tmp5
2378 subq.ph s5, t4, t3 // tmp2 - tmp5
2379 addq.ph s4, t6, t1 // tmp3 + tmp4
2380 subq.ph s3, t6, t1 // tmp3 - tmp4
2381 shll_s.ph s1, s1, 2
2382 shll_s.ph s2, s2, 2
2383 shll_s.ph s3, s3, 2
2384 shll_s.ph s4, s4, 2
2385 shll_s.ph s5, s5, 2
2386 shll_s.ph s6, s6, 2
// Transpose back and keep the high byte of every halfword
2387 precrq.ph.w t0, s1, s0 // B A
2388 ins s0, s1, 16, 16 // b a
2389 precrq.ph.w t2, s3, s2 // D C
2390 ins s2, s3, 16, 16 // d c
2391 precrq.ph.w t4, s5, s4 // F E
2392 ins s4, s5, 16, 16 // f e
2393 precrq.ph.w t6, s7, s6 // H G
2394 ins s6, s7, 16, 16 // h g
2395 precrq.qb.ph t0, t2, t0 // D C B A
2396 precrq.qb.ph s0, s2, s0 // d c b a
2397 precrq.qb.ph t4, t6, t4 // H G F E
2398 precrq.qb.ph s4, s6, s4 // h g f e
2399 addu.qb s0, s0, s8
2400 addu.qb s4, s4, s8
2401 sw s0, 0(a3) // outptr[0/1/2/3] d c b a
2402 sw s4, 4(a3) // outptr[4/5/6/7] h g f e
2403 lw a3, -4(a1) // second output row pointer (a1 was already advanced by 8)
2404 addu.qb t0, t0, s8
2405 addu a3, a3, a2
2406 addu.qb t4, t4, s8
2407 sw t0, 0(a3) // outptr[0/1/2/3] D C B A
2408 bne a0, t9, 0b
2409 sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2410
24112:
2412
2413 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2414
2415 j ra
2416 nop
2417
2418END(jsimd_idct_ifast_rows_mips_dspr2)
2419
DRCa6b7fbd2013-09-30 18:13:27 +00002420/*****************************************************************************/
2421LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
2422/*
2423 * a0 - data
 *
 * Forward DCT computed in place on the block of halfwords at a0, in two
 * passes of 8 iterations each.
 *
 * Pass 1 (label 1): a1 walks the block in 16-byte steps, so each iteration
 * handles eight consecutive halfwords (one row).  Odd outputs and outputs
 * 2/6 are packed-halfword dot products rounded with a shift of 11; outputs
 * 0/4 are the plain sum/difference shifted left by 2.
 *
 * Pass 2 (label 2): a0 walks the block in 2-byte steps, so each iteration
 * handles eight halfwords spaced 16 bytes apart (one column).  Products are
 * accumulated with scalar mult/madd/msub and rounded with a shift of 15;
 * outputs 0/4 are rounded with a shift of 2.
 *
 * The "1|0" notation (element 1 in the upper halfword of a word load)
 * holds on a little-endian target - presumably the only one this code is
 * built for; TODO confirm.
2424 */
2425
2426 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2427
2428 lui t0, 6437
2429 ori t0, 2260 // t0 = 6437 | 2260 (upper | lower halfword)
2430 lui t1, 9633
2431 ori t1, 11363 // t1 = 9633 | 11363
2432 lui t2, 0xd39e
2433 ori t2, 0xe6dc // t2 = -11362 | -6436
2434 lui t3, 0xf72d
2435 ori t3, 9633 // t3 = -2259 | 9633
2436 lui t4, 2261
2437 ori t4, 9633 // t4 = 2261 | 9633
2438 lui t5, 0xd39e
2439 ori t5, 6437 // t5 = -11362 | 6437
2440 lui t6, 9633
2441 ori t6, 0xd39d // t6 = 9633 | -11363
2442 lui t7, 0xe6dc
2443 ori t7, 2260 // t7 = -6436 | 2260
2444 lui t8, 4433
2445 ori t8, 10703 // t8 = 4433 | 10703
2446 lui t9, 0xd630
2447 ori t9, 4433 // t9 = -10704 | 4433
2448 li s8, 8 // pass 1 iteration counter
2449 move a1, a0 // a1 = row pointer; a0 is kept for pass 2
24501:
2451 lw s0, 0(a1) // tmp0 = 1|0
2452 lw s1, 4(a1) // tmp1 = 3|2
2453 lw s2, 8(a1) // tmp2 = 5|4
2454 lw s3, 12(a1) // tmp3 = 7|6
2455 packrl.ph s1, s1, s1 // tmp1 = 2|3
2456 packrl.ph s3, s3, s3 // tmp3 = 6|7
2457 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
2458 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
2459 mult $0, $0 // ac0 = 0
2460 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
2461 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
2462 mult $ac1, $0, $0 // ac1 = 0
2463 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
2464 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
2465 mult $ac2, $0, $0 // ac2 = 0
2466 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
2467 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
2468 mult $ac3, $0, $0 // ac3 = 0
2469 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
2470 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
2471 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
2472 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
2473 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2474 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2475 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
2476 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
2477 addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
2478 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
2479 sh s0, 2(a1) // row element 1
2480 sh s1, 6(a1) // row element 3
2481 sh s2, 10(a1) // row element 5
2482 sh s3, 14(a1) // row element 7
2483 mult $0, $0 // ac0 = 0
2484 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
2485 mult $ac1, $0, $0 // ac1 = 0
2486 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
2487 sra s4, s5, 16 // tmp4 = t11
2488 addiu a1, a1, 16 // advance to the next row; stores below use negative offsets
2489 addiu s8, s8, -1 // one row done
2490 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2491 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2492 addu s2, s5, s4 // tmp2 = t10 + t11 (in the low halfword; only that half is stored)
2493 subu s3, s5, s4 // tmp3 = t10 - t11 (in the low halfword; only that half is stored)
2494 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
2495 sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
2496 sh s2, -16(a1) // row element 0
2497 sh s3, -8(a1) // row element 4
2498 sh s0, -12(a1) // row element 2
2499 bgtz s8, 1b
2500 sh s1, -4(a1) // (delay slot) row element 6
2501 li t0, 2260 // c0
2502 li t1, 11363 // c1
2503 li t2, 9633 // c2
2504 li t3, 6436 // c3
2505 li t4, 6437 // c4
2506 li t5, 2261 // c5
2507 li t6, 11362 // c6
2508 li t7, 2259 // c7
2509 li t8, 4433 // c8
2510 li t9, 10703 // c9
2511 li a1, 10704 // c10
2512 li s8, 8 // pass 2 iteration counter
2513
25142:
2515 lh a2, 0(a0) // 0
2516 lh a3, 16(a0) // 8
2517 lh v0, 32(a0) // 16
2518 lh v1, 48(a0) // 24
2519 lh s4, 64(a0) // 32
2520 lh s5, 80(a0) // 40
2521 lh s6, 96(a0) // 48
2522 lh s7, 112(a0) // 56
2523 addu s2, v0, s5 // tmp2 = 16 + 40
2524 subu s5, v0, s5 // tmp5 = 16 - 40
2525 addu s3, v1, s4 // tmp3 = 24 + 32
2526 subu s4, v1, s4 // tmp4 = 24 - 32
2527 addu s0, a2, s7 // tmp0 = 0 + 56
2528 subu s7, a2, s7 // tmp7 = 0 - 56
2529 addu s1, a3, s6 // tmp1 = 8 + 48
2530 subu s6, a3, s6 // tmp6 = 8 - 48
2531 addu a2, s0, s3 // tmp10 = tmp0 + tmp3
2532 subu v1, s0, s3 // tmp13 = tmp0 - tmp3
2533 addu a3, s1, s2 // tmp11 = tmp1 + tmp2
2534 subu v0, s1, s2 // tmp12 = tmp1 - tmp2
2535 mult s7, t1 // ac0 = tmp7 * c1
2536 madd s4, t0 // ac0 += tmp4 * c0
2537 madd s5, t4 // ac0 += tmp5 * c4
2538 madd s6, t2 // ac0 += tmp6 * c2
2539 mult $ac1, s7, t2 // ac1 = tmp7 * c2
2540 msub $ac1, s4, t3 // ac1 -= tmp4 * c3
2541 msub $ac1, s5, t6 // ac1 -= tmp5 * c6
2542 msub $ac1, s6, t7 // ac1 -= tmp6 * c7
2543 mult $ac2, s7, t4 // ac2 = tmp7 * c4
2544 madd $ac2, s4, t2 // ac2 += tmp4 * c2
2545 madd $ac2, s5, t5 // ac2 += tmp5 * c5
2546 msub $ac2, s6, t6 // ac2 -= tmp6 * c6
2547 mult $ac3, s7, t0 // ac3 = tmp7 * c0
2548 msub $ac3, s4, t1 // ac3 -= tmp4 * c1
2549 madd $ac3, s5, t2 // ac3 += tmp5 * c2
2550 msub $ac3, s6, t3 // ac3 -= tmp6 * c3
2551 extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
2552 extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
2553 extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
2554 extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
2555 addiu s8, s8, -1 // one column done
2556 addu s4, a2, a3 // tmp4 = tmp10 + tmp11
2557 subu s5, a2, a3 // tmp5 = tmp10 - tmp11
2558 sh s0, 16(a0) // column element 1
2559 sh s1, 48(a0) // column element 3
2560 sh s2, 80(a0) // column element 5
2561 sh s3, 112(a0) // column element 7
2562 mult v0, t8 // ac0 = tmp12 * c8
2563 madd v1, t9 // ac0 += tmp13 * c9
2564 mult $ac1, v1, t8 // ac1 = tmp13 * c8
2565 msub $ac1, v0, a1 // ac1 -= tmp12 * c10
2566 addiu a0, a0, 2 // advance to the next column; stores below are offset by -2
2567 extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
2568 extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
2569 shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
2570 shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
2571 sh s4, -2(a0) // column element 0
2572 sh s5, 62(a0) // column element 4
2573 sh s6, 30(a0) // column element 2
2574 bgtz s8, 2b
2575 sh s7, 94(a0) // (delay slot) column element 6
2576
2577 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2578
2579 jr ra
2580 nop
2581
2582END(jsimd_fdct_islow_mips_dspr2)
2583
2584/*****************************************************************************/
DRC71e06a72013-10-08 02:11:21 +00002585LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
2586/*
2587 * a0 - data
 *
 * Forward DCT computed in place on the block of halfwords at a0, in two
 * passes of 8 iterations each.
 *
 * Loop 0: v0 runs from a0 to a0+128 in 16-byte steps; each iteration
 * handles eight consecutive halfwords (one row) with packed-halfword
 * arithmetic.
 * Loop 1: v0 runs from a0 to a0+16 in 2-byte steps; each iteration handles
 * eight halfwords spaced 16 bytes apart (one column).
 *
 * Every constant multiply is followed by a right shift of 8 with no
 * rounding (extr.w ..., 8 or sra ..., 8).  The four constants are kept
 * duplicated in both halfwords so they can feed the .ph dot products.
2588 */
2589 .set at
2590 SAVE_REGS_ON_STACK 8, s0, s1
2591 li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2592 li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2593 li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2594 li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2595
2596 move v0, a0 // v0 = row pointer
2597 addiu v1, v0, 128 // end address
2598
25990:
2600 lw t0, 0(v0) // tmp0 = 1|0
2601 lw t1, 4(v0) // tmp1 = 3|2
2602 lw t2, 8(v0) // tmp2 = 5|4
2603 lw t3, 12(v0) // tmp3 = 7|6
2604 packrl.ph t1, t1, t1 // tmp1 = 2|3
2605 packrl.ph t3, t3, t3 // tmp3 = 6|7
2606 subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
2607 subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
2608 addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
2609 addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
2610 addq.ph t8, t4, t6 // t8 = t1+t2|t0+t3 = t11|t10
2611 subq.ph t9, t4, t6 // t9 = t1-t2|t0-t3 = t12|t13
2612 sra t4, t8, 16 // tmp4 = t11
2613 mult $0, $0 // ac0 = 0
2614 dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
2615 mult $ac1, $0, $0 // ac1 = 0
2616 dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
2617 dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98 (dpsx subtracts the cross products)
2618 mult $ac2, $0, $0 // ac2 = 0
2619 dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
2620 mult $ac3, $0, $0 // ac3 = 0
2621 dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
2622 precrq.ph.w t0, t5, t7 // t0 = t6|t5 (upper halfwords of t5 and t7)
2623 addq.ph t2, t8, t4 // tmp2 = t10 + t11
2624 subq.ph t3, t8, t4 // tmp3 = t10 - t11
2625 extr.w t4, $ac0, 8 // t4 = z1 = MULTIPLY(t12 + t13, 181)
2626 mult $0, $0 // ac0 = 0
2627 dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
2628 extr.w t0, $ac1, 8 // t0 = z5
2629 extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
2630 extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
2631 extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
2632 add t6, t1, t0 // t6 = z2
2633 add t7, t7, t0 // t7 = z4
2634 subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
2635 addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
2636 addq.ph t1, t0, t6 // t1 = z13 + z2
2637 subq.ph t6, t0, t6 // t6 = z13 - z2
2638 addq.ph t0, t8, t7 // t0 = z11 + z4
2639 subq.ph t7, t8, t7 // t7 = z11 - z4
2640 addq.ph t5, t4, t9 // t5 = t13 + z1 (low halfword)
2641 subq.ph t4, t9, t4 // t4 = t13 - z1 (low halfword)
2642 sh t2, 0(v0) // row element 0
2643 sh t5, 4(v0) // row element 2
2644 sh t3, 8(v0) // row element 4
2645 sh t4, 12(v0) // row element 6
2646 sh t1, 10(v0) // row element 5
2647 sh t6, 6(v0) // row element 3
2648 sh t0, 2(v0) // row element 1
2649 sh t7, 14(v0) // row element 7
2650 addiu v0, 16 // next row
2651 bne v1, v0, 0b
2652 nop
2653 move v0, a0 // v0 = column pointer
2654 addiu v1, v0, 16 // end address: one past the eighth column
2655
26561:
2657 lh t0, 0(v0) // 0
2658 lh t1, 16(v0) // 8
2659 lh t2, 32(v0) // 16
2660 lh t3, 48(v0) // 24
2661 lh t4, 64(v0) // 32
2662 lh t5, 80(v0) // 40
2663 lh t6, 96(v0) // 48
2664 lh t7, 112(v0) // 56
2665 add t8, t0, t7 // t8 = tmp0
2666 sub t7, t0, t7 // t7 = tmp7
2667 add t0, t1, t6 // t0 = tmp1
2668 sub t1, t1, t6 // t1 = tmp6
2669 add t6, t2, t5 // t6 = tmp2
2670 sub t5, t2, t5 // t5 = tmp5
2671 add t2, t3, t4 // t2 = tmp3
2672 sub t3, t3, t4 // t3 = tmp4
2673 add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
2674 sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
2675 sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
2676 ins t8, s0, 16, 16 // t8 = tmp12|tmp13
2677 add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
2678 mult $0, $0 // ac0 = 0
2679 dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
2680 add s0, t4, t2 // s0 = tmp10+tmp11
2681 sub t4, t4, t2 // t4 = tmp10-tmp11
2682 sh s0, 0(v0) // column element 0
2683 sh t4, 64(v0) // column element 4
2684 extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
2685 addq.ph t4, t8, t2 // t4 = tmp13 + z1 (low halfword)
2686 subq.ph t8, t8, t2 // t8 = tmp13 - z1 (low halfword)
2687 sh t4, 32(v0) // column element 2
2688 sh t8, 96(v0) // column element 6
2689 add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
2690 add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
2691 add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
2692 andi t4, a1, 0xffff // t4 = 334 (one halfword of the packed constant)
2693 mul s0, t1, t4
2694 sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2695 ins t1, t3, 16, 16 // t1 = tmp10|tmp12
2696 mult $0, $0 // ac0 = 0
2697 mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
2698 extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
2699 add t2, t7, t8 // t2 = tmp7 + z5
2700 sub t7, t7, t8 // t7 = tmp7 - z5
2701 andi t4, a2, 0xffff // t4 = 139
2702 mul t8, t3, t4
2703 sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2704 andi t4, s1, 0xffff // t4 = 181
2705 mul t6, t0, t4
2706 sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2707 add t0, t6, t8 // t0 = z3 + z2
2708 sub t1, t6, t8 // t1 = z3 - z2
2709 add t3, t6, s0 // t3 = z3 + z4
2710 sub t4, t6, s0 // t4 = z3 - z4
2711 sub t5, t2, t1 // t5 = dataptr[5]
2712 sub t6, t7, t0 // t6 = dataptr[3]
2713 add t3, t2, t3 // t3 = dataptr[1]
2714 add t4, t7, t4 // t4 = dataptr[7]
2715 sh t5, 80(v0) // column element 5
2716 sh t6, 48(v0) // column element 3
2717 sh t3, 16(v0) // column element 1
2718 sh t4, 112(v0) // column element 7
2719 addiu v0, 2 // next column
2720 bne v0, v1, 1b
2721 nop
2722
2723 RESTORE_REGS_FROM_STACK 8, s0, s1
2724
2725 j ra
2726 nop
2727END(jsimd_fdct_ifast_mips_dspr2)
2728
2729/*****************************************************************************/
DRCa6b7fbd2013-09-30 18:13:27 +00002730LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
2731/*
2732 * a0 - coef_block
2733 * a1 - divisors
2734 * a2 - workspace
 *
 * Quantizes the halfwords at a2 into halfwords at a0, two per iteration.
 * For each input x the code computes
 *   sign(x) * ((((|x| + add) & 0xffff) * (mul & 0xffff)) >> (shift + 16))
 * where mul, add and shift are the halfwords at byte offsets 0, 128 and
 * 384 from the current a1 position.  These are presumably the reciprocal,
 * correction and shift tables of the divisor array - TODO confirm against
 * the C caller.
 *
 * The loop is software-pipelined: values for the next pair are loaded
 * while the current pair is finished.  It runs until a2 reaches
 * workspace + 124 (31 iterations); the code after the loop finishes the
 * last pair, for 64 outputs in total.
2735 */
2736
2737 .set at
2738
2739 SAVE_REGS_ON_STACK 16, s0, s1, s2
2740
2741 addiu v0, a2, 124 // v0 = workspace_end
2742 lh t0, 0(a2) // t0 = first input value
2743 lh t1, 0(a1) // t1 = its multiplier
2744 lh t2, 128(a1) // t2 = its additive term
2745 sra t3, t0, 15 // t3 = 0 if t0 >= 0, -1 if t0 < 0
2746 sll t3, t3, 1 // t3 = 0 or -2
2747 addiu t3, t3, 1 // t3 = +1 or -1 (sign of t0)
2748 mul t0, t0, t3 // t0 = |t0|
2749 lh t4, 384(a1) // t4 = shift for the first value
2750 lh t5, 130(a1) // t5 = additive term for the second value
2751 lh t6, 2(a2) // t6 = second input value
2752 lh t7, 2(a1) // t7 = multiplier for the second value
2753 lh t8, 386(a1) // t8 = shift for the second value
2754
27551:
2756 andi t1, 0xffff // treat the multiplier as unsigned 16-bit
2757 add t9, t0, t2 // |x| + additive term
2758 andi t9, 0xffff
2759 mul v1, t9, t1
2760 sra s0, t6, 15 // s0 = sign of the second value, built as for t3
2761 sll s0, s0, 1
2762 addiu s0, s0, 1
2763 addiu t9, t4, 16 // shift count = shift + 16
2764 srav v1, v1, t9
2765 mul v1, v1, t3 // reapply the sign of the first value
2766 mul t6, t6, s0 // t6 = |second value|
2767 andi t7, 0xffff
2768 addiu a2, a2, 4 // advance the input by two halfwords
2769 addiu a1, a1, 4 // advance the divisor tables by two halfwords
2770 add s1, t6, t5
2771 andi s1, 0xffff
2772 sh v1, 0(a0) // store the first result of the pair
2773
2774 mul s2, s1, t7
2775 addiu s1, t8, 16
2776 srav s2, s2, s1
2777 mul s2,s2, s0 // reapply the sign of the second value
2778 lh t0, 0(a2) // start loading the next pair
2779 lh t1, 0(a1)
2780 sra t3, t0, 15
2781 sll t3, t3, 1
2782 addiu t3, t3, 1
2783 mul t0, t0, t3 // t0 = |next first value|
2784 lh t2, 128(a1)
2785 lh t4, 384(a1)
2786 lh t5, 130(a1)
2787 lh t8, 386(a1)
2788 lh t6, 2(a2)
2789 lh t7, 2(a1)
2790 sh s2, 2(a0) // store the second result of the pair
2791 lh t0, 0(a2) // NOTE(review): this load and the next four instructions repeat
2792 sra t3, t0, 15 // the t0/t3 computation done a few lines above; a2 has not
2793 sll t3, t3, 1 // changed in between, so this looks redundant - confirm
2794 addiu t3, t3, 1 // before removing
2795 mul t0, t0,t3
2796 bne a2, v0, 1b
2797 addiu a0, a0, 4 // (delay slot) advance the output by two halfwords
2798
 // Tail: finish the last pair that was loaded by the final loop iteration.
2799 andi t1, 0xffff
2800 add t9, t0, t2
2801 andi t9, 0xffff
2802 mul v1, t9, t1
2803 sra s0, t6, 15
2804 sll s0, s0, 1
2805 addiu s0, s0, 1
2806 addiu t9, t4, 16
2807 srav v1, v1, t9
2808 mul v1, v1, t3
2809 mul t6, t6, s0
2810 andi t7, 0xffff
2811 sh v1, 0(a0)
2812 add s1, t6, t5
2813 andi s1, 0xffff
2814 mul s2, s1, t7
2815 addiu s1, t8, 16
2816 addiu a2, a2, 4
2817 addiu a1, a1, 4
2818 srav s2, s2, s1
2819 mul s2, s2, s0
2820 sh s2, 2(a0)
2821
2822 RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2823
2824 j ra
2825 nop
2826
2827END(jsimd_quantize_mips_dspr2)
DRC16962c12013-07-27 21:50:02 +00002828
2829/*****************************************************************************/
DRC3d727282013-10-09 18:39:44 +00002830LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
2831/*
2832 * a0 - coef_block
2833 * a1 - divisors
2834 * a2 - workspace
 *
 * For each of 64 elements: loads a single-precision value from a2 and one
 * from a1, computes value * divisor + 16384.5, truncates to an integer,
 * subtracts 16384 and stores the low 16 bits at a0.  The 16384.5 bias
 * keeps the sum positive for the truncation (for inputs of moderate
 * magnitude), so the result is the product rounded to nearest.
 *
 * Eight elements are handled per iteration in two interleaved groups of
 * four; t0 counts 63, 55, ... and the loop exits once it goes negative
 * (8 iterations).
2835 */
2836
2837 .set at
2838
2839 li t1, 0x46800100 // IEEE-754 single-precision bit pattern of 16384.5
2840 mtc1 t1, f0 // f0 = 16384.5
2841 li t0, 63 // loop counter, decremented by 8 per iteration
28420:
2843 lwc1 f1, 0(a2)
2844 lwc1 f5, 0(a1)
2845 lwc1 f2, 4(a2)
2846 lwc1 f6, 4(a1)
2847 lwc1 f3, 8(a2)
2848 lwc1 f7, 8(a1)
2849 lwc1 f4, 12(a2)
2850 lwc1 f8, 12(a1)
2851 madd.s f1, f0, f1, f5 // f1 = f1 * f5 + 16384.5
2852 madd.s f2, f0, f2, f6
2853 madd.s f3, f0, f3, f7
2854 madd.s f4, f0, f4, f8
2855 lwc1 f5, 16(a1) // start loading divisors for the second group of four
2856 lwc1 f6, 20(a1)
2857 trunc.w.s f1, f1 // convert to integer, rounding toward zero
2858 trunc.w.s f2, f2
2859 trunc.w.s f3, f3
2860 trunc.w.s f4, f4
2861 lwc1 f7, 24(a1)
2862 lwc1 f8, 28(a1)
2863 mfc1 t1, f1
2864 mfc1 t2, f2
2865 mfc1 t3, f3
2866 mfc1 t4, f4
2867 lwc1 f1, 16(a2) // inputs for the second group of four
2868 lwc1 f2, 20(a2)
2869 lwc1 f3, 24(a2)
2870 lwc1 f4, 28(a2)
2871 madd.s f1, f0, f1, f5
2872 madd.s f2, f0, f2, f6
2873 madd.s f3, f0, f3, f7
2874 madd.s f4, f0, f4, f8
2875 addiu t1, t1, -16384 // remove the bias from the first group
2876 addiu t2, t2, -16384
2877 addiu t3, t3, -16384
2878 addiu t4, t4, -16384
2879 trunc.w.s f1, f1
2880 trunc.w.s f2, f2
2881 trunc.w.s f3, f3
2882 trunc.w.s f4, f4
2883 sh t1, 0(a0) // outputs 0..3 of this iteration
2884 sh t2, 2(a0)
2885 sh t3, 4(a0)
2886 sh t4, 6(a0)
2887 mfc1 t1, f1
2888 mfc1 t2, f2
2889 mfc1 t3, f3
2890 mfc1 t4, f4
2891 addiu t0, t0, -8 // eight elements done
2892 addiu a2, a2, 32 // advance the input by eight 4-byte values
2893 addiu a1, a1, 32 // advance the divisors by eight 4-byte values
2894 addiu t1, t1, -16384 // remove the bias from the second group
2895 addiu t2, t2, -16384
2896 addiu t3, t3, -16384
2897 addiu t4, t4, -16384
2898 sh t1, 8(a0) // outputs 4..7 of this iteration
2899 sh t2, 10(a0)
2900 sh t3, 12(a0)
2901 sh t4, 14(a0)
2902 bgez t0, 0b
2903 addiu a0, a0, 16 // (delay slot) advance the output by eight halfwords
2904
2905 j ra
2906 nop
2907
2908END(jsimd_quantize_float_mips_dspr2)
2909/*****************************************************************************/
DRC2ccf4d12013-09-27 17:43:23 +00002910LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
2911/*
2912 * a0 - compptr->dct_table
2913 * a1 - coef_block
2914 * a2 - output_buf
2915 * a3 - output_col
 *
 * Reduced-size inverse DCT producing a 2x2 block of output bytes.
 *
 * Pass 1 is fully unrolled over columns 0, 1, 3, 5 and 7 (byte offsets
 * 0, 2, 6, 10, 14).  For each column, rows 0, 1, 3, 5 and 7 are
 * dequantized (coefficient * table entry); row 0 is shifted left by 15 and
 * the odd rows are combined by a dot product with
 * 29692, -10426, 6967, -5906.  The rounded (>> 13) sum and difference go
 * to a 40-byte scratch area on the stack: sums in words 0..4, differences
 * in words 5..9, in column order 0, 1, 3, 5, 7.
 *
 * Pass 2 applies the same combination across each group of five scratch
 * words, rounds with a shift of 20, saturates to [-128, 127], adds 128 and
 * writes two bytes to output_buf[0] + output_col and two bytes to
 * output_buf[1] + output_col.
2916 */
2917 .set at
2918
2919 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2920
2921 addiu sp, sp, -40 // reserve the 40-byte scratch area
2922 move v0, sp // v0 = scratch base
2923 addiu s2, zero, 29692 // scalar constants for the pass 2 madd chain
2924 addiu s3, zero, -10426
2925 addiu s4, zero, 6967
2926 addiu s5, zero, -5906
2927 lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2928 lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2929 lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2930 lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2931 mul t4, t5, t0 // t4 = dequantized row 0
2932 lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2933 lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2934 mul t6, t6, t1 // t6 = dequantized row 3
2935 mul t5, t5, t0 // t5 = dequantized row 1
2936 lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2937 lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2938 lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2939 lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2940 mul t7, t7, t2 // t7 = dequantized row 5
2941 mult zero, zero // ac0 = 0
2942 mul t8, t8, t3 // t8 = dequantized row 7. NOTE(review): MUL architecturally leaves HI/LO (ac0) UNPREDICTABLE and this one follows the clear - confirm behaviour on the target cores
2943 li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2944 li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2945 ins t6, t5, 16, 16 // t6 = t5|t6 (row 1 | row 3)
2946 sll t4, t4, 15 // row 0 term scaled by 2^15
2947 dpa.w.ph $ac0, t6, s0 // ac0 += row1*29692 + row3*-10426
2948 lh t1, 2(a1) // start loading column 1
2949 lh t6, 2(a0)
2950 ins t8, t7, 16, 16 // t8 = t7|t8 (row 5 | row 7)
2951 dpa.w.ph $ac0, t8, s1 // ac0 += row5*6967 + row7*-5906
2952 mflo t0, $ac0 // t0 = odd-row sum for column 0
2953 mul t5, t6, t1 // t5 = dequantized row 0 of column 1
2954 lh t1, 18(a1)
2955 lh t6, 18(a0)
2956 lh t2, 50(a1)
2957 lh t7, 50(a0)
2958 mul t6, t6, t1 // row 1 of column 1
2959 subu t8, t4, t0 // column 0: row 0 term - odd sum
2960 mul t7, t7, t2 // row 3 of column 1
2961 addu t0, t4, t0 // column 0: row 0 term + odd sum
2962 shra_r.w t0, t0, 13 // round and descale by 13 bits
2963 lh t1, 82(a1)
2964 lh t2, 82(a0)
2965 lh t3, 114(a1)
2966 lh t4, 114(a0)
2967 shra_r.w t8, t8, 13
2968 mul t1, t1, t2 // row 5 of column 1
2969 mul t3, t3, t4 // row 7 of column 1
2970 sw t0, 0(v0) // scratch[0] = column 0 sum
2971 sw t8, 20(v0) // scratch[5] = column 0 difference
2972 sll t4, t5, 15
2973 ins t7, t6, 16, 16 // t7 = row 1 | row 3
2974 mult zero, zero // ac0 = 0
2975 dpa.w.ph $ac0, t7, s0
2976 ins t3, t1, 16, 16 // t3 = row 5 | row 7
2977 lh t1, 6(a1) // start loading column 3
2978 lh t6, 6(a0)
2979 dpa.w.ph $ac0, t3, s1
2980 mflo t0, $ac0 // t0 = odd-row sum for column 1
2981 mul t5, t6, t1 // t5 = dequantized row 0 of column 3
2982 lh t1, 22(a1)
2983 lh t6, 22(a0)
2984 lh t2, 54(a1)
2985 lh t7, 54(a0)
2986 mul t6, t6, t1
2987 subu t8, t4, t0
2988 mul t7, t7, t2
2989 addu t0, t4, t0
2990 shra_r.w t0, t0, 13
2991 lh t1, 86(a1)
2992 lh t2, 86(a0)
2993 lh t3, 118(a1)
2994 lh t4, 118(a0)
2995 shra_r.w t8, t8, 13
2996 mul t1, t1, t2
2997 mul t3, t3, t4
2998 sw t0, 4(v0) // scratch[1] = column 1 sum
2999 sw t8, 24(v0) // scratch[6] = column 1 difference
3000 sll t4, t5, 15
3001 ins t7, t6, 16, 16
3002 mult zero, zero // ac0 = 0
3003 dpa.w.ph $ac0, t7, s0
3004 ins t3, t1, 16, 16
3005 lh t1, 10(a1) // start loading column 5
3006 lh t6, 10(a0)
3007 dpa.w.ph $ac0, t3, s1
3008 mflo t0, $ac0 // t0 = odd-row sum for column 3
3009 mul t5, t6, t1 // t5 = dequantized row 0 of column 5
3010 lh t1, 26(a1)
3011 lh t6, 26(a0)
3012 lh t2, 58(a1)
3013 lh t7, 58(a0)
3014 mul t6, t6, t1
3015 subu t8, t4, t0
3016 mul t7, t7, t2
3017 addu t0, t4, t0
3018 shra_r.w t0, t0, 13
3019 lh t1, 90(a1)
3020 lh t2, 90(a0)
3021 lh t3, 122(a1)
3022 lh t4, 122(a0)
3023 shra_r.w t8, t8, 13
3024 mul t1, t1, t2
3025 mul t3, t3, t4
3026 sw t0, 8(v0) // scratch[2] = column 3 sum
3027 sw t8, 28(v0) // scratch[7] = column 3 difference
3028 sll t4, t5, 15
3029 ins t7, t6, 16, 16
3030 mult zero, zero // ac0 = 0
3031 dpa.w.ph $ac0, t7, s0
3032 ins t3, t1, 16, 16
3033 lh t1, 14(a1) // start loading column 7
3034 lh t6, 14(a0)
3035 dpa.w.ph $ac0, t3, s1
3036 mflo t0, $ac0 // t0 = odd-row sum for column 5
3037 mul t5, t6, t1 // t5 = dequantized row 0 of column 7
3038 lh t1, 30(a1)
3039 lh t6, 30(a0)
3040 lh t2, 62(a1)
3041 lh t7, 62(a0)
3042 mul t6, t6, t1
3043 subu t8, t4, t0
3044 mul t7, t7, t2
3045 addu t0, t4, t0
3046 shra_r.w t0, t0, 13
3047 lh t1, 94(a1)
3048 lh t2, 94(a0)
3049 lh t3, 126(a1)
3050 lh t4, 126(a0)
3051 shra_r.w t8, t8, 13
3052 mul t1, t1, t2
3053 mul t3, t3, t4
3054 sw t0, 12(v0) // scratch[3] = column 5 sum
3055 sw t8, 32(v0) // scratch[8] = column 5 difference
3056 sll t4, t5, 15
3057 ins t7, t6, 16, 16
3058 mult zero, zero // ac0 = 0
3059 dpa.w.ph $ac0, t7, s0
3060 ins t3, t1, 16, 16
3061 dpa.w.ph $ac0, t3, s1
3062 mflo t0, $ac0 // t0 = odd-row sum for column 7
3063 lw t9, 0(a2) // t9 = output_buf[0]
3064 lw t3, 0(v0) // t3 = scratch[0]
3065 lw t7, 4(v0) // t7 = scratch[1]
3066 lw t1, 8(v0) // t1 = scratch[2]
3067 addu t9, t9, a3 // t9 = output_buf[0] + output_col
3068 sll t3, t3, 15 // scratch[0] term scaled by 2^15
3069 subu t8, t4, t0
3070 addu t0, t4, t0
3071 shra_r.w t0, t0, 13
3072 shra_r.w t8, t8, 13
3073 sw t0, 16(v0) // scratch[4] = column 7 sum
3074 sw t8, 36(v0) // scratch[9] = column 7 difference
3075 lw t5, 12(v0) // t5 = scratch[3]
3076 lw t6, 16(v0) // t6 = scratch[4]
3077 mult t7, s2 // ac0 = scratch[1] * 29692
3078 madd t1, s3 // ac0 += scratch[2] * -10426
3079 madd t5, s4 // ac0 += scratch[3] * 6967
3080 madd t6, s5 // ac0 += scratch[4] * -5906
3081 lw t5, 24(v0) // t5 = scratch[6]
3082 lw t7, 28(v0) // t7 = scratch[7]
3083 mflo t0, $ac0 // t0 = odd sum for output row 0
3084 lw t8, 32(v0) // t8 = scratch[8]
3085 lw t2, 36(v0) // t2 = scratch[9]
3086 mult $ac1, t5, s2 // ac1 = scratch[6] * 29692
3087 madd $ac1, t7, s3 // ac1 += scratch[7] * -10426
3088 madd $ac1, t8, s4 // ac1 += scratch[8] * 6967
3089 madd $ac1, t2, s5 // ac1 += scratch[9] * -5906
3090 addu t1, t3, t0 // row 0, first output: sum
3091 subu t6, t3, t0 // row 0, second output: difference
3092 shra_r.w t1, t1, 20 // round and descale by 20 bits
3093 shra_r.w t6, t6, 20
3094 mflo t4, $ac1 // t4 = odd sum for output row 1
3095 shll_s.w t1, t1, 24 // saturating shift left then sra 24:
3096 shll_s.w t6, t6, 24 // clamps the value to [-128, 127]
3097 sra t1, t1, 24
3098 sra t6, t6, 24
3099 addiu t1, t1, 128 // recentre to [0, 255]
3100 addiu t6, t6, 128
3101 lw t0, 20(v0) // t0 = scratch[5]
3102 sb t1, 0(t9) // output row 0, byte 0
3103 sb t6, 1(t9) // output row 0, byte 1
3104 sll t0, t0, 15 // scratch[5] term scaled by 2^15
3105 lw t9, 4(a2) // t9 = output_buf[1]
3106 addu t1, t0, t4 // row 1, first output: sum
3107 subu t6, t0, t4 // row 1, second output: difference
3108 addu t9, t9, a3 // t9 = output_buf[1] + output_col
3109 shra_r.w t1, t1, 20
3110 shra_r.w t6, t6, 20
3111 shll_s.w t1, t1, 24
3112 shll_s.w t6, t6, 24
3113 sra t1, t1, 24
3114 sra t6, t6, 24
3115 addiu t1, t1, 128
3116 addiu t6, t6, 128
3117 sb t1, 0(t9) // output row 1, byte 0
3118 sb t6, 1(t9) // output row 1, byte 1
3119 addiu sp, sp, 40 // release the scratch area
3120
3121 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3122
3123 j ra
3124 nop
3125
3126END(jsimd_idct_2x2_mips_dspr2)
3127
3128/*****************************************************************************/
3129LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
3130/*
3131 * a0 - compptr->dct_table
3132 * a1 - coef_block
3133 * a2 - output_buf
3134 * a3 - output_col
3135 * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes
 *
 * Reduced-size inverse DCT producing a 4x4 block of output bytes.
 *
 * Pass 1 processes columns 0..3 (loop 0, four iterations) and columns
 * 5..7 (loop 1, three iterations); column 4 is never read.  For each
 * column, rows 0, 1, 2, 3, 5, 6 and 7 are dequantized and combined, and
 * four 32-bit results are stored to the workspace, whose rows are 32 bytes
 * (eight words) apart.
 *
 * Pass 2 is unrolled four times, once per workspace row (byte offsets 0,
 * 32, 64, 96).  Each copy reads words 0, 1, 2, 3, 5, 6 and 7 of the row,
 * rounds with a shift of 19, saturates to [-128, 127], adds 128 and writes
 * four bytes to output_buf[row] + output_col.
 *
 * Packed odd-part constants (upper | lower halfword):
 *   s0 = 11893 | -1730     s1 = 8697 | -17799
 *   s2 = -4926 | -4176     s3 = 20995 | 7373
 *
 * The workspace pointer is read from 48(sp) because SAVE_REGS_ON_STACK 32
 * presumably lowers sp by 32 first - TODO confirm against the macro.
3136 */
3137
3138 .set at
3139 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3140
3141 lw v1, 48(sp) // v1 = workspace pointer (fifth argument)
3142 move t0, a1 // t0 = inptr
3143 move t1, v1 // t1 = wsptr
3144 li t9, 4 // loop 0 handles four columns
3145 li s0, 0x2e75f93e // 11893 | -1730
3146 li s1, 0x21f9ba79 // 8697 | -17799
3147 li s2, 0xecc2efb0 // -4926 | -4176
3148 li s3, 0x52031ccd // 20995 | 7373
3149
31500:
3151 lh s6, 32(t0) // inptr[DCTSIZE*2]
3152 lh t6, 32(a0) // quantptr[DCTSIZE*2]
3153 lh s7, 96(t0) // inptr[DCTSIZE*6]
3154 lh t7, 96(a0) // quantptr[DCTSIZE*6]
3155 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3156 lh s4, 0(t0) // inptr[DCTSIZE*0]
3157 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3158 lh s5, 0(a0) // quantptr[0]
3159 li s6, 15137 // FIX_1_847759065
3160 li s7, 6270 // FIX_0_765366865
3161 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3162 mul t6, s6, t6 // t6 = MULTIPLY(z2, FIX_1_847759065)
3163 lh t5, 112(t0) // inptr[DCTSIZE*7]
3164 mul t7, s7, t7 // t7 = MULTIPLY(z3, FIX_0_765366865)
3165 lh s4, 112(a0) // quantptr[DCTSIZE*7]
3166 lh v0, 80(t0) // inptr[DCTSIZE*5]
3167 lh s5, 80(a0) // quantptr[DCTSIZE*5]
3168 lh s6, 48(a0) // quantptr[DCTSIZE*3]
3169 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3170 lh s7, 16(a0) // quantptr[DCTSIZE*1]
3171 lh t8, 16(t0) // inptr[DCTSIZE*1]
3172 subu t6, t6, t7 // tmp2 = z2 * 15137 - z3 * 6270
3173 lh t7, 48(t0) // inptr[DCTSIZE*3]
3174 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3175 mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3176 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3177 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3178 addu t3, t2, t6 // tmp10 = tmp0 + tmp2
3179 subu t4, t2, t6 // tmp12 = tmp0 - tmp2
3180 mult $ac0, zero, zero // ac0 = 0
3181 mult $ac1, zero, zero // ac1 = 0
3182 ins t5, v0, 16, 16 // t5 = z2|z1
3183 ins t7, t8, 16, 16 // t7 = z4|z3
3184 addiu t9, t9, -1 // one column done
3185 dpa.w.ph $ac0, t5, s0 // ac0 += z2*11893 + z1*-1730
3186 dpa.w.ph $ac0, t7, s1 // ac0 += z4*8697 + z3*-17799
3187 dpa.w.ph $ac1, t5, s2 // ac1 += z2*-4926 + z1*-4176
3188 dpa.w.ph $ac1, t7, s3 // ac1 += z4*20995 + z3*7373
3189 mflo s4, $ac0 // s4 = temp1
3190 mflo s5, $ac1 // s5 = temp2
3191 addiu a0, a0, 2 // next quantization-table column
3192 addiu t1, t1, 4 // next workspace column; stores below account for this
3193 addiu t0, t0, 2 // next coefficient column
3194 addu t6, t4, s4
3195 subu t5, t4, s4
3196 addu s6, t3, s5
3197 subu s7, t3, s5
3198 shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
3199 shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
3200 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3201 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3202 sw t6, 28(t1) // workspace row 1
3203 sw t5, 60(t1) // workspace row 2
3204 sw s6, -4(t1) // workspace row 0
3205 bgtz t9, 0b
3206 sw s7, 92(t1) // (delay slot) workspace row 3
3207 // second loop: three iterations for columns 5, 6 and 7 (offsets are +2 / +4 to skip column 4)
3208 li t9, 3
32091:
3210 lh s6, 34(t0) // inptr[DCTSIZE*2]
3211 lh t6, 34(a0) // quantptr[DCTSIZE*2]
3212 lh s7, 98(t0) // inptr[DCTSIZE*6]
3213 lh t7, 98(a0) // quantptr[DCTSIZE*6]
3214 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3215 lh s4, 2(t0) // inptr[DCTSIZE*0]
3216 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3217 lh s5, 2(a0) // quantptr[DCTSIZE*0]
3218 li s6, 15137 // FIX_1_847759065
3219 li s7, 6270 // FIX_0_765366865
3220 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3221 mul v0, s6, t6 // v0 = MULTIPLY(z2, FIX_1_847759065)
3222 lh t5, 114(t0) // inptr[DCTSIZE*7]
3223 mul t7, s7, t7 // t7 = MULTIPLY(z3, FIX_0_765366865)
3224 lh s4, 114(a0) // quantptr[DCTSIZE*7]
3225 lh s5, 82(a0) // quantptr[DCTSIZE*5]
3226 lh t6, 82(t0) // inptr[DCTSIZE*5]
3227 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3228 lh s6, 50(a0) // quantptr[DCTSIZE*3]
3229 lh t8, 18(t0) // inptr[DCTSIZE*1]
3230 subu v0, v0, t7 // tmp2 = z2 * 15137 - z3 * 6270
3231 lh t7, 50(t0) // inptr[DCTSIZE*3]
3232 lh s7, 18(a0) // quantptr[DCTSIZE*1]
3233 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3234 mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3235 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3236 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3237 addu t3, t2, v0 // tmp10 = tmp0 + tmp2
3238 subu t4, t2, v0 // tmp12 = tmp0 - tmp2
3239 mult $ac0, zero, zero // ac0 = 0
3240 mult $ac1, zero, zero // ac1 = 0
3241 ins t5, t6, 16, 16 // t5 = z2|z1
3242 ins t7, t8, 16, 16 // t7 = z4|z3
3243 dpa.w.ph $ac0, t5, s0
3244 dpa.w.ph $ac0, t7, s1
3245 dpa.w.ph $ac1, t5, s2
3246 dpa.w.ph $ac1, t7, s3
3247 mflo t5, $ac0 // t5 = temp1
3248 mflo t6, $ac1 // t6 = temp2
3249 addiu t9, t9, -1 // one column done
3250 addiu t0, t0, 2
3251 addiu a0, a0, 2
3252 addiu t1, t1, 4
3253 addu s5, t4, t5
3254 subu s4, t4, t5
3255 addu s6, t3, t6
3256 subu s7, t3, t6
3257 shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3258 shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3259 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3260 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3261 sw s5, 32(t1) // workspace row 1
3262 sw s4, 64(t1) // workspace row 2
3263 sw s6, 0(t1) // workspace row 0
3264 bgtz t9, 1b
3265 sw s7, 96(t1) // (delay slot) workspace row 3
 // Pass 2, output row 0 (workspace bytes 0..31)
3266 move t1, v1 // t1 = start of the workspace
3267 li s4, 15137
3268 lw s6, 8(t1) // wsptr[2]
3269 li s5, 6270
3270 lw s7, 24(t1) // wsptr[6]
3271 mul s4, s4, s6 // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3272 lw t2, 0(t1) // wsptr[0]
3273 mul s5, s5, s7 // MULTIPLY((INT32) wsptr[6], FIX_0_765366865); subtracted below
3274 lh t5, 28(t1) // wsptr[7] (halfword at the word's base address: its low half on little-endian - TODO confirm target)
3275 lh t6, 20(t1) // wsptr[5]
3276 lh t7, 12(t1) // wsptr[3]
3277 lh t8, 4(t1) // wsptr[1]
3278 ins t5, t6, 16, 16 // t5 = wsptr[5]|wsptr[7]
3279 ins t7, t8, 16, 16 // t7 = wsptr[1]|wsptr[3]
3280 mult $ac0, zero, zero // ac0 = 0
3281 dpa.w.ph $ac0, t5, s0
3282 dpa.w.ph $ac0, t7, s1
3283 mult $ac1, zero, zero // ac1 = 0
3284 dpa.w.ph $ac1, t5, s2
3285 dpa.w.ph $ac1, t7, s3
3286 sll t2, t2, 14 // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3287 mflo s6, $ac0 // s6 = temp1
3288 // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3289 subu s4, s4, s5
3290 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3291 mflo s7, $ac1 // s7 = temp2
3292 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3293 addu t7, t4, s6
3294 subu t8, t4, s6
3295 addu t5, t3, s7
3296 subu t6, t3, s7
3297 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3298 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3299 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3300 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3301 sll s4, t9, 2 // NOTE(review): result is never read (s4 is reloaded by the next li) - looks like dead code
3302 lw v0, 0(a2) // output_buf[ctr]
3303 shll_s.w t5, t5, 24 // saturating shift left then sra 24:
3304 shll_s.w t6, t6, 24 // clamps each value to [-128, 127]
3305 shll_s.w t7, t7, 24
3306 shll_s.w t8, t8, 24
3307 sra t5, t5, 24
3308 sra t6, t6, 24
3309 sra t7, t7, 24
3310 sra t8, t8, 24
3311 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3312 addiu t5, t5, 128 // recentre to [0, 255]
3313 addiu t6, t6, 128
3314 addiu t7, t7, 128
3315 addiu t8, t8, 128
3316 sb t5, 0(v0) // outptr[0] = tmp10 + temp2
3317 sb t7, 1(v0) // outptr[1] = tmp12 + temp1
3318 sb t8, 2(v0) // outptr[2] = tmp12 - temp1
3319 sb t6, 3(v0) // outptr[3] = tmp10 - temp2
3320 // Pass 2, output row 1 (workspace bytes 32..63)
3321 li s4, 15137
3322 lw s6, 40(t1) // wsptr[2]
3323 li s5, 6270
3324 lw s7, 56(t1) // wsptr[6]
3325 mul s4, s4, s6 // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3326 lw t2, 32(t1) // wsptr[0]
3327 mul s5, s5, s7 // MULTIPLY((INT32) wsptr[6], FIX_0_765366865); subtracted below
3328 lh t5, 60(t1) // wsptr[7]
3329 lh t6, 52(t1) // wsptr[5]
3330 lh t7, 44(t1) // wsptr[3]
3331 lh t8, 36(t1) // wsptr[1]
3332 ins t5, t6, 16, 16
3333 ins t7, t8, 16, 16
3334 mult $ac0, zero, zero
3335 dpa.w.ph $ac0, t5, s0
3336 dpa.w.ph $ac0, t7, s1
3337 mult $ac1, zero, zero
3338 dpa.w.ph $ac1, t5, s2
3339 dpa.w.ph $ac1, t7, s3
3340 sll t2, t2, 14 // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3341 mflo s6, $ac0
3342 // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3343 subu s4, s4, s5
3344 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3345 mflo s7, $ac1
3346 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3347 addu t7, t4, s6
3348 subu t8, t4, s6
3349 addu t5, t3, s7
3350 subu t6, t3, s7
3351 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3352 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3353 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3354 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3355 sll s4, t9, 2 // NOTE(review): result is never read - looks like dead code
3356 lw v0, 4(a2) // output_buf[ctr]
3357 shll_s.w t5, t5, 24
3358 shll_s.w t6, t6, 24
3359 shll_s.w t7, t7, 24
3360 shll_s.w t8, t8, 24
3361 sra t5, t5, 24
3362 sra t6, t6, 24
3363 sra t7, t7, 24
3364 sra t8, t8, 24
3365 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3366 addiu t5, t5, 128
3367 addiu t6, t6, 128
3368 addiu t7, t7, 128
3369 addiu t8, t8, 128
3370 sb t5, 0(v0)
3371 sb t7, 1(v0)
3372 sb t8, 2(v0)
3373 sb t6, 3(v0)
3374 // Pass 2, output row 2 (workspace bytes 64..95)
3375 li s4, 15137
3376 lw s6, 72(t1) // wsptr[2]
3377 li s5, 6270
3378 lw s7, 88(t1) // wsptr[6]
3379 mul s4, s4, s6 // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3380 lw t2, 64(t1) // wsptr[0]
3381 mul s5, s5, s7 // MULTIPLY((INT32) wsptr[6], FIX_0_765366865); subtracted below
3382 lh t5, 92(t1) // wsptr[7]
3383 lh t6, 84(t1) // wsptr[5]
3384 lh t7, 76(t1) // wsptr[3]
3385 lh t8, 68(t1) // wsptr[1]
3386 ins t5, t6, 16, 16
3387 ins t7, t8, 16, 16
3388 mult $ac0, zero, zero
3389 dpa.w.ph $ac0, t5, s0
3390 dpa.w.ph $ac0, t7, s1
3391 mult $ac1, zero, zero
3392 dpa.w.ph $ac1, t5, s2
3393 dpa.w.ph $ac1, t7, s3
3394 sll t2, t2, 14 // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3395 mflo s6, $ac0
3396 // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3397 subu s4, s4, s5
3398 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3399 mflo s7, $ac1
3400 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3401 addu t7, t4, s6
3402 subu t8, t4, s6
3403 addu t5, t3, s7
3404 subu t6, t3, s7
3405 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3406 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3407 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3408 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3409 sll s4, t9, 2 // NOTE(review): result is never read - looks like dead code
3410 lw v0, 8(a2) // output_buf[ctr]
3411 shll_s.w t5, t5, 24
3412 shll_s.w t6, t6, 24
3413 shll_s.w t7, t7, 24
3414 shll_s.w t8, t8, 24
3415 sra t5, t5, 24
3416 sra t6, t6, 24
3417 sra t7, t7, 24
3418 sra t8, t8, 24
3419 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3420 addiu t5, t5, 128
3421 addiu t6, t6, 128
3422 addiu t7, t7, 128
3423 addiu t8, t8, 128
3424 sb t5, 0(v0)
3425 sb t7, 1(v0)
3426 sb t8, 2(v0)
3427 sb t6, 3(v0)
 // Pass 2, output row 3 (workspace bytes 96..127)
3428 li s4, 15137
3429 lw s6, 104(t1) // wsptr[2]
3430 li s5, 6270
3431 lw s7, 120(t1) // wsptr[6]
3432 mul s4, s4, s6 // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3433 lw t2, 96(t1) // wsptr[0]
3434 mul s5, s5, s7 // MULTIPLY((INT32) wsptr[6], FIX_0_765366865); subtracted below
3435 lh t5, 124(t1) // wsptr[7]
3436 lh t6, 116(t1) // wsptr[5]
3437 lh t7, 108(t1) // wsptr[3]
3438 lh t8, 100(t1) // wsptr[1]
3439 ins t5, t6, 16, 16
3440 ins t7, t8, 16, 16
3441 mult $ac0, zero, zero
3442 dpa.w.ph $ac0, t5, s0
3443 dpa.w.ph $ac0, t7, s1
3444 mult $ac1, zero, zero
3445 dpa.w.ph $ac1, t5, s2
3446 dpa.w.ph $ac1, t7, s3
3447 sll t2, t2, 14 // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3448 mflo s6, $ac0
3449 // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3450 subu s4, s4, s5
3451 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3452 mflo s7, $ac1
3453 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3454 addu t7, t4, s6
3455 subu t8, t4, s6
3456 addu t5, t3, s7
3457 subu t6, t3, s7
3458 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3459 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3460 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3461 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3462 sll s4, t9, 2 // NOTE(review): result is never read before s4 is restored below - looks like dead code
3463 lw v0, 12(a2) // output_buf[ctr]
3464 shll_s.w t5, t5, 24
3465 shll_s.w t6, t6, 24
3466 shll_s.w t7, t7, 24
3467 shll_s.w t8, t8, 24
3468 sra t5, t5, 24
3469 sra t6, t6, 24
3470 sra t7, t7, 24
3471 sra t8, t8, 24
3472 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3473 addiu t5, t5, 128
3474 addiu t6, t6, 128
3475 addiu t7, t7, 128
3476 addiu t8, t8, 128
3477 sb t5, 0(v0)
3478 sb t7, 1(v0)
3479 sb t8, 2(v0)
3480 sb t6, 3(v0)
3481
3482 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3483
3484 j ra
3485 nop
3486END(jsimd_idct_4x4_mips_dspr2)
3487
3488/*****************************************************************************/
DRCe5005912013-09-27 17:51:08 +00003489LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
3490/*
3491 * a0 - compptr->dct_table
3492 * a1 - coef_block
3493 * a2 - output_buf
3494 * a3 - output_col
 *
 * Reduced-size 6x6 inverse DCT, done in two passes.
 *
 * Pass 1 walks six columns: each 16-bit coefficient read from coef_block is
 * multiplied by the matching 16-bit dct_table entry, transformed, and the
 * six results are stored as 32-bit words in a 6x6 work array (144 bytes)
 * carved out of the stack.
 *
 * Pass 2 walks the six work-array rows and stores six bytes per row at
 * output_buf[row] + output_col.
 *
 * Registers that stay live across both loops:
 *   t9 = 5793, s0 = 10033, s1 = 2998 (multipliers scaled by 2^13)
 *   v0 = work-array cursor, v1 = loop end address
3495 */
3496 .set at
3497
3498 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3499
3500 addiu sp, sp, -144 // reserve the work array: 6 * 6 * 4 bytes
3501 move v0, sp // v0 = wsptr
3502 addiu v1, v0, 24 // pass 1 stops after 6 columns (6 * 4 bytes)
3503 addiu t9, zero, 5793 // ~0.707106781 * 8192
3504 addiu s0, zero, 10033 // ~1.224744871 * 8192
3505 addiu s1, zero, 2998 // ~0.366025404 * 8192
3506
 /* Pass 1: process 6 columns from input, store into work array. */
35071:
3508 lh s2, 0(a0) // q0 = quantptr[ 0]
3509 lh s3, 32(a0) // q1 = quantptr[16]
3510 lh s4, 64(a0) // q2 = quantptr[32]
3511 lh t2, 64(a1) // tmp2 = inptr[32]
3512 lh t1, 32(a1) // tmp1 = inptr[16]
3513 lh t0, 0(a1) // tmp0 = inptr[ 0]
3514 mul t2, t2, s4 // tmp2 = tmp2 * q2
3515 mul t1, t1, s3 // tmp1 = tmp1 * q1
3516 mul t0, t0, s2 // tmp0 = tmp0 * q0
3517 lh t6, 16(a1) // z1 = inptr[ 8]
3518 lh t8, 80(a1) // z3 = inptr[40]
3519 lh t7, 48(a1) // z2 = inptr[24]
3520 lh s2, 16(a0) // q0 = quantptr[ 8]
3521 lh s4, 80(a0) // q2 = quantptr[40]
3522 lh s3, 48(a0) // q1 = quantptr[24]
3523 mul t2, t2, t9 // tmp2 = tmp2 * 5793
3524 mul t1, t1, s0 // tmp1 = tmp1 * 10033
3525 sll t0, t0, 13 // tmp0 = tmp0 << 13
3526 mul t6, t6, s2 // z1 = z1 * q0
3527 mul t8, t8, s4 // z3 = z3 * q2
3528 mul t7, t7, s3 // z2 = z2 * q1
3529 addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3530 sll t2, t2, 1 // tmp2 = tmp2 << 1
3531 subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3532 subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3533 addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3534 addu t1, t6, t8 // tmp1 = z1 + z3
3535 mul t1, t1, s1 // tmp1 = tmp1 * 2998
3536 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3537 subu t2, t6, t8 // tmp2 = z1 - z3
3538 subu t2, t2, t7 // tmp2 = tmp2 - z2
3539 sll t2, t2, 2 // tmp2 = tmp2 << 2
3540 addu t0, t6, t7 // tmp0 = z1 + z2
3541 sll t0, t0, 13 // tmp0 = tmp0 << 13
3542 subu s2, t8, t7 // q0 = z3 - z2
3543 sll s2, s2, 13 // q0 = q0 << 13
3544 addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3545 addu t1, s2, t1 // tmp1 = q0 + tmp1
3546 addu s2, t4, t2 // q0 = tmp11 + tmp2
3547 subu s3, t4, t2 // q1 = tmp11 - tmp2
3548 addu t6, t3, t0 // z1 = tmp10 + tmp0
3549 subu t7, t3, t0 // z2 = tmp10 - tmp0
3550 addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3551 subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3552 shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3553 shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3554 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3555 shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3556 sw s2, 24(v0) // wsptr[6*1] = q0
3557 sw s3, 96(v0) // wsptr[6*4] = q1
3558 sw t6, 0(v0) // wsptr[6*0] = z1
3559 sw t7, 120(v0) // wsptr[6*5] = z2
3560 sw t4, 48(v0) // wsptr[6*2] = tmp11
3561 sw t5, 72(v0) // wsptr[6*3] = tmp12
3562 addiu v0, v0, 4 // wsptr++ (next work-array column)
3563 addiu a1, a1, 2 // inptr++ (next coefficient column)
3564 bne v0, v1, 1b
3565 addiu a0, a0, 2 // quantptr++ (sits after the branch; presumably its delay slot - confirm noreorder is set by the LEAF macro)
3566
3567 /* Pass 2: process 6 rows from work array, store into output array. */
3568 move v0, sp // v0 = wsptr = first work-array row
3569 addiu v1, v0, 144 // v1 = end of work array (6 rows * 24 bytes)
3570
35712:
3572 lw t0, 0(v0) // t0 = wsptr[0]
3573 lw t2, 16(v0) // t2 = wsptr[4]
3574 lw s5, 0(a2) // s5 = output_buf[ctr]
3575 addiu t0, t0, 16 // 16 << 13 = 1 << 17: rounding term for the net >> 18 below
3576 sll t0, t0, 13 // tmp0 = (wsptr[0] + 16) << 13
3577 mul t3, t2, t9 // tmp10 = wsptr[4] * 5793
3578 lw t6, 4(v0) // z1 = wsptr[1]
3579 lw t8, 20(v0) // z3 = wsptr[5]
3580 lw t7, 12(v0) // z2 = wsptr[3]
3581 addu s5, s5, a3 // outptr = output_buf[ctr] + output_col
3582 addu s6, t6, t8 // z1 + z3
3583 mul s6, s6, s1 // odd common term = (z1 + z3) * 2998
3584 addu t1, t0, t3 // tmp1 = tmp0 + tmp10
3585 subu t4, t0, t3
3586 subu t4, t4, t3 // tmp11 = tmp0 - tmp10 - tmp10
3587 lw t3, 8(v0) // t3 = wsptr[2]
3588 mul t0, t3, s0 // tmp0 = wsptr[2] * 10033
3589 addu s7, t6, t7 // z1 + z2
3590 sll s7, s7, 13
3591 addu s7, s6, s7 // odd0 = common + ((z1 + z2) << 13)
3592 subu t2, t8, t7 // z3 - z2
3593 sll t2, t2, 13
3594 addu t2, s6, t2 // odd2 = common + ((z3 - z2) << 13)
3595 subu s6, t6, t7
3596 subu s6, s6, t8 // z1 - z2 - z3
3597 sll s6, s6, 13 // odd1 = (z1 - z2 - z3) << 13
3598 addu t3, t1, t0 // tmp10 = tmp1 + tmp0
3599 subu t5, t1, t0 // tmp12 = tmp1 - tmp0
3600 addu t6, t3, s7 // out[0] = tmp10 + odd0
3601 subu t3, t3, s7 // out[5] = tmp10 - odd0
3602 addu t7, t4, s6 // out[1] = tmp11 + odd1
3603 subu t4, t4, s6 // out[4] = tmp11 - odd1
3604 addu t8, t5, t2 // out[2] = tmp12 + odd2
3605 subu t5, t5, t2 // out[3] = tmp12 - odd2
3606 shll_s.w t6, t6, 6 // saturating << 6; with the sra 24 below this is a clamped >> 18
3607 shll_s.w t3, t3, 6
3608 shll_s.w t7, t7, 6
3609 shll_s.w t4, t4, 6
3610 shll_s.w t8, t8, 6
3611 shll_s.w t5, t5, 6
3612 sra t6, t6, 24 // keep the top byte as a signed value
3613 addiu t6, t6, 128 // re-center the signed byte by +128
3614 sra t3, t3, 24
3615 addiu t3, t3, 128
3616 sb t6, 0(s5) // outptr[0]
3617 sra t7, t7, 24
3618 addiu t7, t7, 128
3619 sb t3, 5(s5) // outptr[5]
3620 sra t4, t4, 24
3621 addiu t4, t4, 128
3622 sb t7, 1(s5) // outptr[1]
3623 sra t8, t8, 24
3624 addiu t8, t8, 128
3625 sb t4, 4(s5) // outptr[4]
3626 addiu v0, v0, 24 // wsptr += 6 (next work-array row)
3627 sra t5, t5, 24
3628 addiu t5, t5, 128
3629 sb t8, 2(s5) // outptr[2]
3630 addiu a2, a2, 4 // advance to the next output_buf row pointer
3631 bne v0, v1, 2b
3632 sb t5, 3(s5) // outptr[3] (sits after the branch; presumably its delay slot)
3633
3634 addiu sp, sp, 144 // release the work array
3635
3636 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3637
3638 j ra
3639 nop
3640
3641END(jsimd_idct_6x6_mips_dspr2)
3642
3643/*****************************************************************************/
3644LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
3645/*
3646 * a0 - compptr->dct_table
3647 * a1 - coef_block
3648 * a2 - workspace
 *
 * Pass 1 of the 12x12 inverse DCT. For each of 8 input columns this
 * multiplies eight 16-bit coefficients (rows 0-7, 16 bytes apart) by the
 * matching 16-bit dct_table entries, runs the column transform, and stores
 * 12 results as 32-bit words into workspace, one per output row (rows are
 * 32 bytes apart).
 *
 * a3 is used as the column counter and v0 as scratch; s0-s3 are saved and
 * restored around the loop.
3649 */
3650
3651 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3652
3653 li a3, 8 // 8 columns to process
3654
36551:
3656 // odd part
3657 lh t0, 48(a1) // inptr[DCTSIZE*3]
3658 lh t1, 48(a0) // quantptr[DCTSIZE*3]
3659 lh t2, 16(a1) // inptr[DCTSIZE*1]
3660 lh t3, 16(a0) // quantptr[DCTSIZE*1]
3661 lh t4, 80(a1) // inptr[DCTSIZE*5]
3662 lh t5, 80(a0) // quantptr[DCTSIZE*5]
3663 lh t6, 112(a1) // inptr[DCTSIZE*7]
3664 lh t7, 112(a0) // quantptr[DCTSIZE*7]
3665 mul t0, t0, t1 // z2
3666 mul t1, t2, t3 // z1
3667 mul t2, t4, t5 // z3
3668 mul t3, t6, t7 // z4
3669 li t4, 10703 // FIX(1.306562965)
3670 li t5, 4433 // FIX_0_541196100
3671 li t6, 7053 // FIX(0.860918669)
3672 mul t4, t0,t4 // tmp11
3673 mul t5, t0,t5 // -tmp14
3674 addu t7, t1,t2 // tmp10
3675 addu t8, t7,t3 // tmp10 + z4
3676 mul t6, t6, t8 // tmp15
3677 li t8, 2139 // FIX(0.261052384)
3678 mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3679 li t7, 2295 // FIX(0.280143716)
3680 mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3681 addu t9, t2, t3 // z3 + z4
3682 li s0, 8565 // FIX(1.045510580)
3683 mul t9, t9, s0 // -tmp13
3684 li s0, 12112 // FIX(1.478575242)
3685 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3686 li s1, 12998 // FIX(1.586706681)
3687 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3688 li s2, 5540 // FIX(0.676326758)
3689 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3690 li s3, 16244 // FIX(1.982889723)
3691 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3692 subu t1, t1, t3 // z1-=z4
3693 subu t0, t0, t2 // z2-=z3
3694 addu t2, t0, t1 // z1+z2
3695 li t3, 4433 // FIX_0_541196100
3696 mul t2, t2, t3 // z3
3697 li t3, 6270 // FIX_0_765366865
3698 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3699 li t3, 15137 // FIX_1_847759065
3700 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3701 addu t8, t6, t8 // tmp12
3702 addu t3, t8, t4 // tmp12 + tmp11
3703 addu t3, t3, t7 // tmp10
3704 subu t8, t8, t9 // tmp12 + tmp13
3705 addu s0, t5, s0 // -tmp14 + MULTIPLY(z3, FIX(1.478575242))
3706 subu t8, t8, s0 // tmp12
3707 subu t9, t6, t9 // tmp15 + tmp13 (t9 held -tmp13)
3708 subu s1, s1, t4 // MULTIPLY(z4, FIX(1.586706681)) - tmp11
3709 addu t9, t9, s1 // tmp13
3710 subu t6, t6, t5 // tmp15 + tmp14 (t5 held -tmp14)
3711 subu t6, t6, s2 // ... - MULTIPLY(z1, FIX(0.676326758))
3712 subu t6, t6, s3 // tmp15
3713 // even part start
3714 lh t4, 64(a1) // inptr[DCTSIZE*4]
3715 lh t5, 64(a0) // quantptr[DCTSIZE*4]
3716 lh t7, 32(a1) // inptr[DCTSIZE*2]
3717 lh s0, 32(a0) // quantptr[DCTSIZE*2]
3718 lh s1, 0(a1) // inptr[DCTSIZE*0]
3719 lh s2, 0(a0) // quantptr[DCTSIZE*0]
3720 lh s3, 96(a1) // inptr[DCTSIZE*6]
3721 lh v0, 96(a0) // quantptr[DCTSIZE*6]
3722 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
3723 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
3724 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
3725 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
3726 // odd part end
3727 addu t1, t2, t1 // tmp11
3728 subu t0, t2, t0 // tmp14
3729 // update counter and pointers
 // (all loads from a0/a1 for this column have already been issued above)
3730 addiu a3, a3, -1
3731 addiu a0, a0, 2
3732 addiu a1, a1, 2
3733 // even part rest
3734 li s1, 10033 // FIX(1.224744871)
3735 li s2, 11190 // FIX(1.366025404)
3736 mul t4, t4, s1 // z4 = MULTIPLY(z4, FIX(1.224744871))
3737 mul s1, t5, s2 // z4 = MULTIPLY(z1, FIX(1.366025404))
3738 sll t5, t5, 13 // z1
3739 sll t7, t7, 13
3740 addiu t7, t7, 1024 // z3 (1024 = 1 << 10: rounding term for the final >> 11)
3741 sll s0, s0, 13 // z2
3742 addu s2, t7, t4 // tmp10
3743 subu t4, t7, t4 // tmp11
3744 subu s3, t5, s0 // tmp12
3745 addu t2, t7, s3 // tmp21
3746 subu s3, t7, s3 // tmp24
3747 addu t7, s1, s0 // tmp12
3748 addu v0, s2, t7 // tmp20
3749 subu s2, s2, t7 // tmp25
3750 subu s1, s1, t5 // z4 - z1
3751 subu s1, s1, s0 // tmp12
3752 addu s0, t4, s1 // tmp22
3753 subu t4, t4, s1 // tmp23
3754 // final output stage
3755 addu t5, v0, t3 // tmp20 + tmp10
3756 subu v0, v0, t3 // tmp20 - tmp10
3757 addu t3, t2, t1 // tmp21 + tmp11
3758 subu t2, t2, t1 // tmp21 - tmp11
3759 addu t1, s0, t8 // tmp22 + tmp12
3760 subu s0, s0, t8 // tmp22 - tmp12
3761 addu t8, t4, t9 // tmp23 + tmp13
3762 subu t4, t4, t9 // tmp23 - tmp13
3763 addu t9, s3, t0 // tmp24 + tmp14
3764 subu s3, s3, t0 // tmp24 - tmp14
3765 addu t0, s2, t6 // tmp25 + tmp15
3766 subu s2, s2, t6 // tmp25 - tmp15
3767 sra t5, t5, 11 // plain shift: the rounding term was folded into z3 above
3768 sra t3, t3, 11
3769 sra t1, t1, 11
3770 sra t8, t8, 11
3771 sra t9, t9, 11
3772 sra t0, t0, 11
3773 sra s2, s2, 11
3774 sra s3, s3, 11
3775 sra t4, t4, 11
3776 sra s0, s0, 11
3777 sra t2, t2, 11
3778 sra v0, v0, 11
3779 sw t5, 0(a2) // wsptr[8*0]
3780 sw t3, 32(a2) // wsptr[8*1]
3781 sw t1, 64(a2) // wsptr[8*2]
3782 sw t8, 96(a2) // wsptr[8*3]
3783 sw t9, 128(a2) // wsptr[8*4]
3784 sw t0, 160(a2) // wsptr[8*5]
3785 sw s2, 192(a2) // wsptr[8*6]
3786 sw s3, 224(a2) // wsptr[8*7]
3787 sw t4, 256(a2) // wsptr[8*8]
3788 sw s0, 288(a2) // wsptr[8*9]
3789 sw t2, 320(a2) // wsptr[8*10]
3790 sw v0, 352(a2) // wsptr[8*11]
3791 bgtz a3, 1b // loop while columns remain
3792 addiu a2, a2, 4 // wsptr++ (sits after the branch; presumably its delay slot)
3793
3794 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3795
3796 j ra
3797 nop
3798
3799END(jsimd_idct_12x12_pass1_mips_dspr2)
3800
3801/*****************************************************************************/
3802LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
3803/*
3804 * a0 - workspace
3805 * a1 - output
 *
 * Pass 2 of the 12x12 inverse DCT. For each of 12 workspace rows (eight
 * 32-bit words, 32 bytes apart) this runs the row transform and stores 12
 * bytes through the row pointer loaded from the output array, which is
 * advanced by 4 bytes per row. No column offset is added here; the loaded
 * pointers are used as-is.
 *
 * a3 is used as the row counter and v0 as scratch; s0-s3 are saved and
 * restored around the loop.
3806 */
3807
3808 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3809
3810 li a3, 12 // 12 rows to process
3811
38121:
3813 // Odd part
3814 lw t0, 12(a0) // z2 = wsptr[3]
3815 lw t1, 4(a0) // z1 = wsptr[1]
3816 lw t2, 20(a0) // z3 = wsptr[5]
3817 lw t3, 28(a0) // z4 = wsptr[7]
3818 li t4, 10703 // FIX(1.306562965)
3819 li t5, 4433 // FIX_0_541196100
3820 mul t4, t0, t4 // tmp11
3821 mul t5, t0, t5 // -tmp14
3822 addu t6, t1, t2 // tmp10
3823 li t7, 2139 // FIX(0.261052384)
3824 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
3825 addu t6, t6, t3 // tmp10 + z4
3826 li t8, 7053 // FIX(0.860918669)
3827 mul t6, t6, t8 // tmp15
3828 li t8, 2295 // FIX(0.280143716)
3829 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
3830 addu t9, t2, t3 // z3 + z4
3831 li s0, 8565 // FIX(1.045510580)
3832 mul t9, t9, s0 // -tmp13
3833 li s0, 12112 // FIX(1.478575242)
3834 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3835 li s1, 12998 // FIX(1.586706681)
3836 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3837 li s2, 5540 // FIX(0.676326758)
3838 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3839 li s3, 16244 // FIX(1.982889723)
3840 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3841 subu t1, t1, t3 // z1 -= z4
3842 subu t0, t0, t2 // z2 -= z3
3843 addu t2, t1, t0 // z1 + z2
3844 li t3, 4433 // FIX_0_541196100
3845 mul t2, t2, t3 // z3
3846 li t3, 6270 // FIX_0_765366865
3847 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3848 li t3, 15137 // FIX_1_847759065
3849 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3850 addu t3, t6, t7 // tmp12
3851 addu t7, t3, t4 // tmp12 + tmp11
3852 addu t7, t7, t8 // tmp10
3853 subu t3, t3, t9 // tmp12 + tmp13 (t9 holds -tmp13)
3854 subu t3, t3, t5 // ... + tmp14 (t5 holds -tmp14)
3855 subu t3, t3, s0 // tmp12
3856 subu t9, t6, t9 // tmp15 + tmp13
3857 subu t9, t9, t4 // ... - tmp11
3858 addu t9, t9, s1 // tmp13
3859 subu t6, t6, t5 // tmp15 + tmp14
3860 subu t6, t6, s2 // ... - MULTIPLY(z1, FIX(0.676326758))
3861 subu t6, t6, s3 // tmp15
3862 addu t1, t2, t1 // tmp11
3863 subu t0, t2, t0 // tmp14
3864 // even part
3865 lw t2, 16(a0) // z4
3866 lw t4, 8(a0) // z1
3867 lw t5, 0(a0) // z3
3868 lw t8, 24(a0) // z2
3869 li s0, 10033 // FIX(1.224744871)
3870 li s1, 11190 // FIX(1.366025404)
3871 mul t2, t2, s0 // z4
3872 mul s0, t4, s1 // z4
3873 addiu t5, t5, 0x10 // 0x10 << 13 = 1 << 17: rounding term for the net >> 18 below
3874 sll t5, t5, 13 // z3
3875 sll t4, t4, 13 // z1
3876 sll t8, t8, 13 // z2
3877 subu s1, t4, t8 // tmp12
3878 addu s2, t5, t2 // tmp10
3879 subu t2, t5, t2 // tmp11
3880 addu s3, t5, s1 // tmp21
3881 subu s1, t5, s1 // tmp24
3882 addu t5, s0, t8 // tmp12
3883 addu v0, s2, t5 // tmp20
3884 subu t5, s2, t5 // tmp25
3885 subu t4, s0, t4 // z4 - z1
3886 subu t4, t4, t8 // tmp12
3887 addu t8, t2, t4 // tmp22
3888 subu t2, t2, t4 // tmp23
3889 // increment counter and pointers
3890 addiu a3, a3, -1
3891 addiu a0, a0, 32
3892 // Final stage
3893 addu t4, v0, t7 // out[0] = tmp20 + tmp10
3894 subu v0, v0, t7 // out[11] = tmp20 - tmp10
3895 addu t7, s3, t1 // out[1] = tmp21 + tmp11
3896 subu s3, s3, t1 // out[10] = tmp21 - tmp11
3897 addu t1, t8, t3 // out[2] = tmp22 + tmp12
3898 subu t8, t8, t3 // out[9] = tmp22 - tmp12
3899 addu t3, t2, t9 // out[3] = tmp23 + tmp13
3900 subu t2, t2, t9 // out[8] = tmp23 - tmp13
3901 addu t9, s1, t0 // out[4] = tmp24 + tmp14
3902 subu s1, s1, t0 // out[7] = tmp24 - tmp14
3903 addu t0, t5, t6 // out[5] = tmp25 + tmp15
3904 subu t5, t5, t6 // out[6] = tmp25 - tmp15
 // Net left shift is 6 bits: a plain shift by 4, then a saturating shift by 2.
 // NOTE(review): only the last 2 bits saturate here, whereas the 6x6 routine
 // in this file uses a single shll_s.w by 6 - confirm this is intended.
3905 sll t4, t4, 4
3906 sll t7, t7, 4
3907 sll t1, t1, 4
3908 sll t3, t3, 4
3909 sll t9, t9, 4
3910 sll t0, t0, 4
3911 sll t5, t5, 4
3912 sll s1, s1, 4
3913 sll t2, t2, 4
3914 sll t8, t8, 4
3915 sll s3, s3, 4
3916 sll v0, v0, 4
3917 shll_s.w t4, t4, 2
3918 shll_s.w t7, t7, 2
3919 shll_s.w t1, t1, 2
3920 shll_s.w t3, t3, 2
3921 shll_s.w t9, t9, 2
3922 shll_s.w t0, t0, 2
3923 shll_s.w t5, t5, 2
3924 shll_s.w s1, s1, 2
3925 shll_s.w t2, t2, 2
3926 shll_s.w t8, t8, 2
3927 shll_s.w s3, s3, 2
3928 shll_s.w v0, v0, 2
 // Keep the top byte. A logical shift is enough because sb below stores
 // only the low 8 bits of (byte + 0x80).
3929 srl t4, t4, 24
3930 srl t7, t7, 24
3931 srl t1, t1, 24
3932 srl t3, t3, 24
3933 srl t9, t9, 24
3934 srl t0, t0, 24
3935 srl t5, t5, 24
3936 srl s1, s1, 24
3937 srl t2, t2, 24
3938 srl t8, t8, 24
3939 srl s3, s3, 24
3940 srl v0, v0, 24
3941 lw t6, 0(a1) // t6 = outptr = current output row pointer
3942 addiu t4, t4, 0x80 // re-center by +128
3943 addiu t7, t7, 0x80
3944 addiu t1, t1, 0x80
3945 addiu t3, t3, 0x80
3946 addiu t9, t9, 0x80
3947 addiu t0, t0, 0x80
3948 addiu t5, t5, 0x80
3949 addiu s1, s1, 0x80
3950 addiu t2, t2, 0x80
3951 addiu t8, t8, 0x80
3952 addiu s3, s3, 0x80
3953 addiu v0, v0, 0x80
3954 sb t4, 0(t6) // outptr[0]
3955 sb t7, 1(t6) // outptr[1]
3956 sb t1, 2(t6) // outptr[2]
3957 sb t3, 3(t6) // outptr[3]
3958 sb t9, 4(t6) // outptr[4]
3959 sb t0, 5(t6) // outptr[5]
3960 sb t5, 6(t6) // outptr[6]
3961 sb s1, 7(t6) // outptr[7]
3962 sb t2, 8(t6) // outptr[8]
3963 sb t8, 9(t6) // outptr[9]
3964 sb s3, 10(t6) // outptr[10]
3965 sb v0, 11(t6) // outptr[11]
3966 bgtz a3, 1b // loop while rows remain
3967 addiu a1, a1, 4 // next output row pointer (sits after the branch; presumably its delay slot)
3968
3969 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3970
3971 jr ra
3972 nop
3973
3974END(jsimd_idct_12x12_pass2_mips_dspr2)
3975
DRC71e06a72013-10-08 02:11:21 +00003976/*****************************************************************************/
DRCfff6c232013-10-12 21:39:20 +00003977LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
3978/*
3979 * a0 - sample_data
3980 * a1 - start_col
3981 * a2 - workspace
 *
 * Reads 8 samples from each of 8 rows (row pointers sample_data[0..7], each
 * offset by start_col), widens every byte to an unsigned 16-bit value, adds
 * 0xff80 to each halfword (i.e. subtracts 128 modulo 2^16) and stores the
 * 64 halfwords contiguously to workspace, 16 bytes per row.
 *
 * The code is fully unrolled and software-pipelined: the loads and byte
 * expansion for row N+1 are interleaved with the adds and stores for row N.
 * Unaligned word accesses (ulw/usw) are used for both input and output.
 * Only t0-t7 are modified; no stack is used.
3982 */
3983
3984 lw t0, 0(a0) // t0 = sample_data[0]
3985 li t7, 0xff80ff80 // -128 in each 16-bit lane
3986 addu t0, t0, a1 // t0 = sample_data[0] + start_col
3987 ulw t1, 0(t0) // row 0, bytes 0-3
3988 ulw t2, 4(t0) // row 0, bytes 4-7
3989 preceu.ph.qbr t3, t1 // zero-extend the two right (low-order) bytes of t1 to halfwords
3990 preceu.ph.qbl t4, t1 // zero-extend the two left (high-order) bytes of t1 to halfwords
3991 lw t0, 4(a0) // t0 = sample_data[1]
3992 preceu.ph.qbr t5, t2
3993 preceu.ph.qbl t6, t2
3994 addu t0, t0, a1
3995 addu.ph t3, t3, t7 // subtract 128 from each halfword
3996 addu.ph t4, t4, t7
3997 ulw t1, 0(t0) // row 1, bytes 0-3
3998 ulw t2, 4(t0) // row 1, bytes 4-7
3999 addu.ph t5, t5, t7
4000 addu.ph t6, t6, t7
4001 usw t3, 0(a2) // row 0 results -> workspace bytes 0-15
4002 usw t4, 4(a2)
4003 preceu.ph.qbr t3, t1
4004 preceu.ph.qbl t4, t1
4005 usw t5, 8(a2)
4006 usw t6, 12(a2)
4007
4008 lw t0, 8(a0) // t0 = sample_data[2]
4009 preceu.ph.qbr t5, t2
4010 preceu.ph.qbl t6, t2
4011 addu t0, t0, a1
4012 addu.ph t3, t3, t7
4013 addu.ph t4, t4, t7
4014 ulw t1, 0(t0) // row 2, bytes 0-3
4015 ulw t2, 4(t0) // row 2, bytes 4-7
4016 addu.ph t5, t5, t7
4017 addu.ph t6, t6, t7
4018 usw t3, 16(a2) // row 1 results -> workspace bytes 16-31
4019 usw t4, 20(a2)
4020 preceu.ph.qbr t3, t1
4021 preceu.ph.qbl t4, t1
4022 usw t5, 24(a2)
4023 usw t6, 28(a2)
4024
4025 lw t0, 12(a0) // t0 = sample_data[3]
4026 preceu.ph.qbr t5, t2
4027 preceu.ph.qbl t6, t2
4028 addu t0, t0, a1
4029 addu.ph t3, t3, t7
4030 addu.ph t4, t4, t7
4031 ulw t1, 0(t0) // row 3, bytes 0-3
4032 ulw t2, 4(t0) // row 3, bytes 4-7
4033 addu.ph t5, t5, t7
4034 addu.ph t6, t6, t7
4035 usw t3, 32(a2) // row 2 results -> workspace bytes 32-47
4036 usw t4, 36(a2)
4037 preceu.ph.qbr t3, t1
4038 preceu.ph.qbl t4, t1
4039 usw t5, 40(a2)
4040 usw t6, 44(a2)
4041
4042 lw t0, 16(a0) // t0 = sample_data[4]
4043 preceu.ph.qbr t5, t2
4044 preceu.ph.qbl t6, t2
4045 addu t0, t0, a1
4046 addu.ph t3, t3, t7
4047 addu.ph t4, t4, t7
4048 ulw t1, 0(t0) // row 4, bytes 0-3
4049 ulw t2, 4(t0) // row 4, bytes 4-7
4050 addu.ph t5, t5, t7
4051 addu.ph t6, t6, t7
4052 usw t3, 48(a2) // row 3 results -> workspace bytes 48-63
4053 usw t4, 52(a2)
4054 preceu.ph.qbr t3, t1
4055 preceu.ph.qbl t4, t1
4056 usw t5, 56(a2)
4057 usw t6, 60(a2)
4058
4059 lw t0, 20(a0) // t0 = sample_data[5]
4060 preceu.ph.qbr t5, t2
4061 preceu.ph.qbl t6, t2
4062 addu t0, t0, a1
4063 addu.ph t3, t3, t7
4064 addu.ph t4, t4, t7
4065 ulw t1, 0(t0) // row 5, bytes 0-3
4066 ulw t2, 4(t0) // row 5, bytes 4-7
4067 addu.ph t5, t5, t7
4068 addu.ph t6, t6, t7
4069 usw t3, 64(a2) // row 4 results -> workspace bytes 64-79
4070 usw t4, 68(a2)
4071 preceu.ph.qbr t3, t1
4072 preceu.ph.qbl t4, t1
4073 usw t5, 72(a2)
4074 usw t6, 76(a2)
4075
4076 lw t0, 24(a0) // t0 = sample_data[6]
4077 preceu.ph.qbr t5, t2
4078 preceu.ph.qbl t6, t2
4079 addu t0, t0, a1
4080 addu.ph t3, t3, t7
4081 addu.ph t4, t4, t7
4082 ulw t1, 0(t0) // row 6, bytes 0-3
4083 ulw t2, 4(t0) // row 6, bytes 4-7
4084 addu.ph t5, t5, t7
4085 addu.ph t6, t6, t7
4086 usw t3, 80(a2) // row 5 results -> workspace bytes 80-95
4087 usw t4, 84(a2)
4088 preceu.ph.qbr t3, t1
4089 preceu.ph.qbl t4, t1
4090 usw t5, 88(a2)
4091 usw t6, 92(a2)
4092
4093 lw t0, 28(a0) // t0 = sample_data[7]
4094 preceu.ph.qbr t5, t2
4095 preceu.ph.qbl t6, t2
4096 addu t0, t0, a1
4097 addu.ph t3, t3, t7
4098 addu.ph t4, t4, t7
4099 ulw t1, 0(t0) // row 7, bytes 0-3
4100 ulw t2, 4(t0) // row 7, bytes 4-7
4101 addu.ph t5, t5, t7
4102 addu.ph t6, t6, t7
4103 usw t3, 96(a2) // row 6 results -> workspace bytes 96-111
4104 usw t4, 100(a2)
4105 preceu.ph.qbr t3, t1
4106 preceu.ph.qbl t4, t1
4107 usw t5, 104(a2)
4108 usw t6, 108(a2)
4109 preceu.ph.qbr t5, t2 // pipeline drain: finish row 7
4110 preceu.ph.qbl t6, t2
4111 addu.ph t3, t3, t7
4112 addu.ph t4, t4, t7
4113 addu.ph t5, t5, t7
4114 addu.ph t6, t6, t7
4115 usw t3, 112(a2) // row 7 results -> workspace bytes 112-127
4116 usw t4, 116(a2)
4117 usw t5, 120(a2)
4118 usw t6, 124(a2)
4119
4120 j ra
4121 nop
4122
4123END(jsimd_convsamp_mips_dspr2)
4124
4125/*****************************************************************************/
DRC3d727282013-10-09 18:39:44 +00004126LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
4127/*
4128 * a0 - sample_data
4129 * a1 - start_col
4130 * a2 - workspace
 *
 * Reads 8 samples from each of 8 rows (row pointers sample_data[0..7], each
 * offset by start_col), subtracts 128 from every sample, converts it to
 * single-precision floating point and stores the 64 floats contiguously to
 * workspace, 32 bytes per row.
 *
 * The code is fully unrolled, one identical sequence per row. The pointer
 * for the next row is loaded while the current row's results are stored.
 * Modifies t0-t8 and the floating-point registers f1-f8, none of which are
 * saved; no stack is used.
4131 */
4132
4133 .set at
4134
 // row 0
4135 lw t0, 0(a0) // t0 = sample_data[0]
4136 addu t0, t0, a1 // t0 = sample_data[0] + start_col
4137 lbu t1, 0(t0) // load 8 unsigned samples
4138 lbu t2, 1(t0)
4139 lbu t3, 2(t0)
4140 lbu t4, 3(t0)
4141 lbu t5, 4(t0)
4142 lbu t6, 5(t0)
4143 lbu t7, 6(t0)
4144 lbu t8, 7(t0)
4145 addiu t1, t1, -128 // re-center each sample by -128
4146 addiu t2, t2, -128
4147 addiu t3, t3, -128
4148 addiu t4, t4, -128
4149 addiu t5, t5, -128
4150 addiu t6, t6, -128
4151 addiu t7, t7, -128
4152 addiu t8, t8, -128
4153 mtc1 t1, f1 // move the integer bit patterns into FP registers
4154 mtc1 t2, f2
4155 mtc1 t3, f3
4156 mtc1 t4, f4
4157 mtc1 t5, f5
4158 mtc1 t6, f6
4159 mtc1 t7, f7
4160 mtc1 t8, f8
4161 cvt.s.w f1, f1 // convert 32-bit integer -> single-precision float
4162 cvt.s.w f2, f2
4163 cvt.s.w f3, f3
4164 cvt.s.w f4, f4
4165 cvt.s.w f5, f5
4166 cvt.s.w f6, f6
4167 cvt.s.w f7, f7
4168 cvt.s.w f8, f8
4169 lw t0, 4(a0) // prefetch next row pointer: t0 = sample_data[1]
4170 swc1 f1, 0(a2) // row 0 results -> workspace bytes 0-31
4171 swc1 f2, 4(a2)
4172 swc1 f3, 8(a2)
4173 addu t0, t0, a1
4174 swc1 f4, 12(a2)
4175 swc1 f5, 16(a2)
4176 swc1 f6, 20(a2)
4177 swc1 f7, 24(a2)
4178 swc1 f8, 28(a2)
4179 // row 1 (same sequence as row 0)
4180 lbu t1, 0(t0)
4181 lbu t2, 1(t0)
4182 lbu t3, 2(t0)
4183 lbu t4, 3(t0)
4184 lbu t5, 4(t0)
4185 lbu t6, 5(t0)
4186 lbu t7, 6(t0)
4187 lbu t8, 7(t0)
4188 addiu t1, t1, -128
4189 addiu t2, t2, -128
4190 addiu t3, t3, -128
4191 addiu t4, t4, -128
4192 addiu t5, t5, -128
4193 addiu t6, t6, -128
4194 addiu t7, t7, -128
4195 addiu t8, t8, -128
4196 mtc1 t1, f1
4197 mtc1 t2, f2
4198 mtc1 t3, f3
4199 mtc1 t4, f4
4200 mtc1 t5, f5
4201 mtc1 t6, f6
4202 mtc1 t7, f7
4203 mtc1 t8, f8
4204 cvt.s.w f1, f1
4205 cvt.s.w f2, f2
4206 cvt.s.w f3, f3
4207 cvt.s.w f4, f4
4208 cvt.s.w f5, f5
4209 cvt.s.w f6, f6
4210 cvt.s.w f7, f7
4211 cvt.s.w f8, f8
4212 lw t0, 8(a0) // t0 = sample_data[2]
4213 swc1 f1, 32(a2) // row 1 results -> workspace bytes 32-63
4214 swc1 f2, 36(a2)
4215 swc1 f3, 40(a2)
4216 addu t0, t0, a1
4217 swc1 f4, 44(a2)
4218 swc1 f5, 48(a2)
4219 swc1 f6, 52(a2)
4220 swc1 f7, 56(a2)
4221 swc1 f8, 60(a2)
4222 // row 2
4223 lbu t1, 0(t0)
4224 lbu t2, 1(t0)
4225 lbu t3, 2(t0)
4226 lbu t4, 3(t0)
4227 lbu t5, 4(t0)
4228 lbu t6, 5(t0)
4229 lbu t7, 6(t0)
4230 lbu t8, 7(t0)
4231 addiu t1, t1, -128
4232 addiu t2, t2, -128
4233 addiu t3, t3, -128
4234 addiu t4, t4, -128
4235 addiu t5, t5, -128
4236 addiu t6, t6, -128
4237 addiu t7, t7, -128
4238 addiu t8, t8, -128
4239 mtc1 t1, f1
4240 mtc1 t2, f2
4241 mtc1 t3, f3
4242 mtc1 t4, f4
4243 mtc1 t5, f5
4244 mtc1 t6, f6
4245 mtc1 t7, f7
4246 mtc1 t8, f8
4247 cvt.s.w f1, f1
4248 cvt.s.w f2, f2
4249 cvt.s.w f3, f3
4250 cvt.s.w f4, f4
4251 cvt.s.w f5, f5
4252 cvt.s.w f6, f6
4253 cvt.s.w f7, f7
4254 cvt.s.w f8, f8
4255 lw t0, 12(a0) // t0 = sample_data[3]
4256 swc1 f1, 64(a2) // row 2 results -> workspace bytes 64-95
4257 swc1 f2, 68(a2)
4258 swc1 f3, 72(a2)
4259 addu t0, t0, a1
4260 swc1 f4, 76(a2)
4261 swc1 f5, 80(a2)
4262 swc1 f6, 84(a2)
4263 swc1 f7, 88(a2)
4264 swc1 f8, 92(a2)
4265 // row 3
4266 lbu t1, 0(t0)
4267 lbu t2, 1(t0)
4268 lbu t3, 2(t0)
4269 lbu t4, 3(t0)
4270 lbu t5, 4(t0)
4271 lbu t6, 5(t0)
4272 lbu t7, 6(t0)
4273 lbu t8, 7(t0)
4274 addiu t1, t1, -128
4275 addiu t2, t2, -128
4276 addiu t3, t3, -128
4277 addiu t4, t4, -128
4278 addiu t5, t5, -128
4279 addiu t6, t6, -128
4280 addiu t7, t7, -128
4281 addiu t8, t8, -128
4282 mtc1 t1, f1
4283 mtc1 t2, f2
4284 mtc1 t3, f3
4285 mtc1 t4, f4
4286 mtc1 t5, f5
4287 mtc1 t6, f6
4288 mtc1 t7, f7
4289 mtc1 t8, f8
4290 cvt.s.w f1, f1
4291 cvt.s.w f2, f2
4292 cvt.s.w f3, f3
4293 cvt.s.w f4, f4
4294 cvt.s.w f5, f5
4295 cvt.s.w f6, f6
4296 cvt.s.w f7, f7
4297 cvt.s.w f8, f8
4298 lw t0, 16(a0) // t0 = sample_data[4]
4299 swc1 f1, 96(a2) // row 3 results -> workspace bytes 96-127
4300 swc1 f2, 100(a2)
4301 swc1 f3, 104(a2)
4302 addu t0, t0, a1
4303 swc1 f4, 108(a2)
4304 swc1 f5, 112(a2)
4305 swc1 f6, 116(a2)
4306 swc1 f7, 120(a2)
4307 swc1 f8, 124(a2)
4308 // row 4
4309 lbu t1, 0(t0)
4310 lbu t2, 1(t0)
4311 lbu t3, 2(t0)
4312 lbu t4, 3(t0)
4313 lbu t5, 4(t0)
4314 lbu t6, 5(t0)
4315 lbu t7, 6(t0)
4316 lbu t8, 7(t0)
4317 addiu t1, t1, -128
4318 addiu t2, t2, -128
4319 addiu t3, t3, -128
4320 addiu t4, t4, -128
4321 addiu t5, t5, -128
4322 addiu t6, t6, -128
4323 addiu t7, t7, -128
4324 addiu t8, t8, -128
4325 mtc1 t1, f1
4326 mtc1 t2, f2
4327 mtc1 t3, f3
4328 mtc1 t4, f4
4329 mtc1 t5, f5
4330 mtc1 t6, f6
4331 mtc1 t7, f7
4332 mtc1 t8, f8
4333 cvt.s.w f1, f1
4334 cvt.s.w f2, f2
4335 cvt.s.w f3, f3
4336 cvt.s.w f4, f4
4337 cvt.s.w f5, f5
4338 cvt.s.w f6, f6
4339 cvt.s.w f7, f7
4340 cvt.s.w f8, f8
4341 lw t0, 20(a0) // t0 = sample_data[5]
4342 swc1 f1, 128(a2) // row 4 results -> workspace bytes 128-159
4343 swc1 f2, 132(a2)
4344 swc1 f3, 136(a2)
4345 addu t0, t0, a1
4346 swc1 f4, 140(a2)
4347 swc1 f5, 144(a2)
4348 swc1 f6, 148(a2)
4349 swc1 f7, 152(a2)
4350 swc1 f8, 156(a2)
4351 // row 5
4352 lbu t1, 0(t0)
4353 lbu t2, 1(t0)
4354 lbu t3, 2(t0)
4355 lbu t4, 3(t0)
4356 lbu t5, 4(t0)
4357 lbu t6, 5(t0)
4358 lbu t7, 6(t0)
4359 lbu t8, 7(t0)
4360 addiu t1, t1, -128
4361 addiu t2, t2, -128
4362 addiu t3, t3, -128
4363 addiu t4, t4, -128
4364 addiu t5, t5, -128
4365 addiu t6, t6, -128
4366 addiu t7, t7, -128
4367 addiu t8, t8, -128
4368 mtc1 t1, f1
4369 mtc1 t2, f2
4370 mtc1 t3, f3
4371 mtc1 t4, f4
4372 mtc1 t5, f5
4373 mtc1 t6, f6
4374 mtc1 t7, f7
4375 mtc1 t8, f8
4376 cvt.s.w f1, f1
4377 cvt.s.w f2, f2
4378 cvt.s.w f3, f3
4379 cvt.s.w f4, f4
4380 cvt.s.w f5, f5
4381 cvt.s.w f6, f6
4382 cvt.s.w f7, f7
4383 cvt.s.w f8, f8
4384 lw t0, 24(a0) // t0 = sample_data[6]
4385 swc1 f1, 160(a2) // row 5 results -> workspace bytes 160-191
4386 swc1 f2, 164(a2)
4387 swc1 f3, 168(a2)
4388 addu t0, t0, a1
4389 swc1 f4, 172(a2)
4390 swc1 f5, 176(a2)
4391 swc1 f6, 180(a2)
4392 swc1 f7, 184(a2)
4393 swc1 f8, 188(a2)
4394 // row 6
4395 lbu t1, 0(t0)
4396 lbu t2, 1(t0)
4397 lbu t3, 2(t0)
4398 lbu t4, 3(t0)
4399 lbu t5, 4(t0)
4400 lbu t6, 5(t0)
4401 lbu t7, 6(t0)
4402 lbu t8, 7(t0)
4403 addiu t1, t1, -128
4404 addiu t2, t2, -128
4405 addiu t3, t3, -128
4406 addiu t4, t4, -128
4407 addiu t5, t5, -128
4408 addiu t6, t6, -128
4409 addiu t7, t7, -128
4410 addiu t8, t8, -128
4411 mtc1 t1, f1
4412 mtc1 t2, f2
4413 mtc1 t3, f3
4414 mtc1 t4, f4
4415 mtc1 t5, f5
4416 mtc1 t6, f6
4417 mtc1 t7, f7
4418 mtc1 t8, f8
4419 cvt.s.w f1, f1
4420 cvt.s.w f2, f2
4421 cvt.s.w f3, f3
4422 cvt.s.w f4, f4
4423 cvt.s.w f5, f5
4424 cvt.s.w f6, f6
4425 cvt.s.w f7, f7
4426 cvt.s.w f8, f8
4427 lw t0, 28(a0) // t0 = sample_data[7]
4428 swc1 f1, 192(a2) // row 6 results -> workspace bytes 192-223
4429 swc1 f2, 196(a2)
4430 swc1 f3, 200(a2)
4431 addu t0, t0, a1
4432 swc1 f4, 204(a2)
4433 swc1 f5, 208(a2)
4434 swc1 f6, 212(a2)
4435 swc1 f7, 216(a2)
4436 swc1 f8, 220(a2)
4437 // row 7 (last row: no further row pointer is loaded)
4438 lbu t1, 0(t0)
4439 lbu t2, 1(t0)
4440 lbu t3, 2(t0)
4441 lbu t4, 3(t0)
4442 lbu t5, 4(t0)
4443 lbu t6, 5(t0)
4444 lbu t7, 6(t0)
4445 lbu t8, 7(t0)
4446 addiu t1, t1, -128
4447 addiu t2, t2, -128
4448 addiu t3, t3, -128
4449 addiu t4, t4, -128
4450 addiu t5, t5, -128
4451 addiu t6, t6, -128
4452 addiu t7, t7, -128
4453 addiu t8, t8, -128
4454 mtc1 t1, f1
4455 mtc1 t2, f2
4456 mtc1 t3, f3
4457 mtc1 t4, f4
4458 mtc1 t5, f5
4459 mtc1 t6, f6
4460 mtc1 t7, f7
4461 mtc1 t8, f8
4462 cvt.s.w f1, f1
4463 cvt.s.w f2, f2
4464 cvt.s.w f3, f3
4465 cvt.s.w f4, f4
4466 cvt.s.w f5, f5
4467 cvt.s.w f6, f6
4468 cvt.s.w f7, f7
4469 cvt.s.w f8, f8
4470 swc1 f1, 224(a2) // row 7 results -> workspace bytes 224-255
4471 swc1 f2, 228(a2)
4472 swc1 f3, 232(a2)
4473 swc1 f4, 236(a2)
4474 swc1 f5, 240(a2)
4475 swc1 f6, 244(a2)
4476 swc1 f7, 248(a2)
4477 swc1 f8, 252(a2)
4478
4479 j ra
4480 nop
4481
4482END(jsimd_convsamp_float_mips_dspr2)
4483
4484/*****************************************************************************/
4485