blob: bdb6ffa61d1e5830edcd4d8dc825f211e662108d [file] [log] [blame]
1#
2# Copyright (C) 2011 The Android Open Source Project
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16
17# IDCT implementation using the MIPS DSP ASE (little endian version)
18#
19# See MIPS Technologies Inc documents:
20# "JPEG Decoder Optimization for MIPS32(R) Cores" MD00483
21#
22# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
23# Application-Specific Extension to the MIPS32(R) Architecture" MD00374
24#
25
# Assembler modes required by the hand-scheduled code in this file:
#   noreorder - the assembler keeps the written instruction order, so the
#               instruction that follows each branch/jump is its delay slot
#               and executes whether or not the branch is taken.
#   nomacro   - the assembler warns if a statement would expand into more
#               than one machine instruction.
#   noat      - the assembler may not use $at implicitly; this file uses $at
#               as an ordinary register (it holds the coefficient table pointer).
26 .set noreorder
27 .set nomacro
28 .set noat
29
30# This table has been moved to mips_jidctfst.c to avoid having to mess
31# with the global pointer to make this code PIC.
32# .rdata
33#
34# mips_idct_coefs:
35# # Constant table of scaled IDCT coefficients.
36#
37# .word 0x45464546 # FIX( 1.082392200 / 2) = 17734 = 0x4546
38# .word 0x5A825A82 # FIX( 1.414213562 / 2) = 23170 = 0x5A82
39# .word 0x76427642 # FIX( 1.847759065 / 2) = 30274 = 0x7642
40# .word 0xAC61AC61 # FIX(-2.613125930 / 4) = -21407 = 0xAC61
41
42 .text
43
44 .global mips_idct_columns
45 .ent mips_idct_columns
46
47# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
48# DCTELEM * wsptr, const int * mips_idct_coefs);
#
# Column pass of the IDCT: dequantizes the coefficients of each column and
# runs the 1-D IDCT down the column, writing the result to wsptr.
#
# Every 32-bit load fetches two adjacent 16-bit elements, so each
# paired-halfword (.ph) instruction works on two columns at once.  Rows are
# 16 bytes apart in inptr, quantptr and wsptr.  The loop advances all three
# pointers by 4 bytes per iteration and stops once inptr has moved 16 bytes,
# i.e. after 4 iterations.
#
# Saves and restores $s0-$s7 on the stack; returns nothing.
# Overwrites $a0-$a2, $at, $t0-$t9, $v0 and $v1.
49
50mips_idct_columns:
51
52# $a0 - inptr (advanced 4 bytes = 2 columns per iteration)
53# $a1 - quantptr (advanced in step with inptr)
54# $a2 - wsptr (advanced 4 bytes per iteration)
55# $a3, $at - mips_idct_coefs ($a3 on entry, copied to $at)
56# $t0:7 - simd data (rows 0-7 of the current column pair)
57# $t8 - coefficients, temp
58# $t9 - loop end address (inptr + 16)
59# $s0:3 - simd quantization factors, later output sums/differences
60# $s4:7 - temp results
61# $v0:1 - temp results
62
63 addiu $sp, $sp, -32 # reserve stack space for s0-s7
64
65 sw $s0, 28($sp)
66 sw $s1, 24($sp)
67 sw $s2, 20($sp)
68 sw $s3, 16($sp)
69 sw $s4, 12($sp)
70 sw $s5, 8($sp)
71 sw $s6, 4($sp)
72 sw $s7, 0($sp)
73
74 addiu $t9, $a0, 16 # end address
75
76 #lui $at, %hi(mips_idct_coefs)
77 #ori $at, %lo(mips_idct_coefs)
78 # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
79 or $at, $a3, $zero
80
81loop_columns:
82
# Dequantize row 0 while the remaining rows are being loaded.  The two
# muleq_s.w.ph{l,r} multiply the upper and lower halfword pair separately;
# the ins below merges the low halfword of $v0 back into the upper half of $t0.
83 lw $s0, 0($a1) # quantptr[DCTSIZE*0]
84
85 lw $t0, 0($a0) # inptr[DCTSIZE*0]
86 lw $t1, 16($a0) # inptr[DCTSIZE*1]
87
88 muleq_s.w.phl $v0, $t0, $s0 # tmp0 ...
89
90 lw $t2, 32($a0) # inptr[DCTSIZE*2]
91 lw $t3, 48($a0) # inptr[DCTSIZE*3]
92 lw $t4, 64($a0) # inptr[DCTSIZE*4]
93 lw $t5, 80($a0) # inptr[DCTSIZE*5]
94
95 muleq_s.w.phr $t0, $t0, $s0 # ... tmp0 ...
96
97 lw $t6, 96($a0) # inptr[DCTSIZE*6]
98 lw $t7, 112($a0) # inptr[DCTSIZE*7]
99
# Shortcut test: if rows 1-7 are zero in both columns, only the dequantized
# row-0 term contributes and it is copied to all eight workspace rows.
100 or $s4, $t1, $t2 # nonzero if row 1 or 2 has data
101 or $s5, $t3, $t4 # nonzero if row 3 or 4 has data
102
103 bnez $s4, full_column
104 ins $t0, $v0, 16, 16 # ... tmp0 (delay slot: runs on both paths)
105
106 bnez $s5, full_column
107 or $s6, $t5, $t6 # delay slot
108 or $s6, $s6, $t7 # nonzero if row 5, 6 or 7 has data
109 bnez $s6, full_column
110
# The first store sits in the delay slot of the bnez above, so it also
# executes when the branch is taken; full_column overwrites wsptr[DCTSIZE*0].
111 sw $t0, 0($a2) # wsptr[DCTSIZE*0]
112 sw $t0, 16($a2) # wsptr[DCTSIZE*1]
113 sw $t0, 32($a2) # wsptr[DCTSIZE*2]
114 sw $t0, 48($a2) # wsptr[DCTSIZE*3]
115 sw $t0, 64($a2) # wsptr[DCTSIZE*4]
116 sw $t0, 80($a2) # wsptr[DCTSIZE*5]
117 sw $t0, 96($a2) # wsptr[DCTSIZE*6]
118 sw $t0, 112($a2) # wsptr[DCTSIZE*7]
119
120 addiu $a0, $a0, 4 # next column pair
121
122 b continue_columns
123 addiu $a1, $a1, 4 # delay slot: advance quantptr too
124
125
126full_column:
127
# Even part: dequantize rows 2, 4 and 6 (row 0 is already in $t0), with the
# loads for the odd part interleaved between the multiplies.
128 lw $s1, 32($a1) # quantptr[DCTSIZE*2]
129 lw $s2, 64($a1) # quantptr[DCTSIZE*4]
130
131 muleq_s.w.phl $v0, $t2, $s1 # tmp1 ...
132 muleq_s.w.phr $t2, $t2, $s1 # ... tmp1 ...
133
134 lw $s0, 16($a1) # quantptr[DCTSIZE*1]
135 lw $s1, 48($a1) # quantptr[DCTSIZE*3]
136 lw $s3, 96($a1) # quantptr[DCTSIZE*6]
137
138 muleq_s.w.phl $v1, $t4, $s2 # tmp2 ...
139 muleq_s.w.phr $t4, $t4, $s2 # ... tmp2 ...
140
141 lw $s2, 80($a1) # quantptr[DCTSIZE*5]
142 lw $t8, 4($at) # FIX(1.414213562)
143 ins $t2, $v0, 16, 16 # ... tmp1
144
145 muleq_s.w.phl $v0, $t6, $s3 # tmp3 ...
146 muleq_s.w.phr $t6, $t6, $s3 # ... tmp3 ...
147
148 ins $t4, $v1, 16, 16 # ... tmp2
149
150 addq.ph $s4, $t0, $t4 # tmp10
151 subq.ph $s5, $t0, $t4 # tmp11
152
153 ins $t6, $v0, 16, 16 # ... tmp3
154
155 subq.ph $s6, $t2, $t6 # tmp12 ...
156 addq.ph $s7, $t2, $t6 # tmp13
157
158 mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ...
159
160 addq.ph $t0, $s4, $s7 # tmp0
161 subq.ph $t6, $s4, $s7 # tmp3
162
163################
# Odd part: dequantize rows 1, 7, 5 and 3 while the even part finishes.
164
165 muleq_s.w.phl $v0, $t1, $s0 # tmp4 ...
166 muleq_s.w.phr $t1, $t1, $s0 # ... tmp4 ...
167
168 shll_s.ph $s6, $s6, 1 # x2
169
170 lw $s3, 112($a1) # quantptr[DCTSIZE*7]
171
172 subq.ph $s6, $s6, $s7 # ... tmp12
173
174 muleq_s.w.phl $v1, $t7, $s3 # tmp7 ...
175 muleq_s.w.phr $t7, $t7, $s3 # ... tmp7 ...
176
177 ins $t1, $v0, 16, 16 # ... tmp4
178
179 addq.ph $t2, $s5, $s6 # tmp1
180 subq.ph $t4, $s5, $s6 # tmp2
181
182 muleq_s.w.phl $v0, $t5, $s2 # tmp6 ...
183 muleq_s.w.phr $t5, $t5, $s2 # ... tmp6 ...
184
185 ins $t7, $v1, 16, 16 # ... tmp7
186
187 addq.ph $s5, $t1, $t7 # z11
188 subq.ph $s6, $t1, $t7 # z12
189
190 muleq_s.w.phl $v1, $t3, $s1 # tmp5 ...
191 muleq_s.w.phr $t3, $t3, $s1 # ... tmp5 ...
192
193 ins $t5, $v0, 16, 16 # ... tmp6
194
195# stalls
196
197 ins $t3, $v1, 16, 16 # ... tmp5
198
199
200 addq.ph $s7, $t5, $t3 # z13
201 subq.ph $v0, $t5, $t3 # z10
202
203 addq.ph $t7, $s5, $s7 # tmp7
204 subq.ph $s5, $s5, $s7 # tmp11 ...
205
206 addq.ph $v1, $v0, $s6 # z5 ...
207
208 mulq_rs.ph $s5, $s5, $t8 # ... tmp11
209
210 lw $t8, 8($at) # FIX(1.847759065)
211 lw $s4, 0($at) # FIX(1.082392200)
212
213 addq.ph $s0, $t0, $t7 # tmp0 + tmp7
214 subq.ph $s1, $t0, $t7 # tmp0 - tmp7
215
216 mulq_rs.ph $v1, $v1, $t8 # ... z5
217
218 shll_s.ph $s5, $s5, 1 # x2
219
220 lw $t8, 12($at) # FIX(-2.613125930)
221 sw $s0, 0($a2) # wsptr[DCTSIZE*0]
222
223 mulq_rs.ph $v0, $v0, $t8 # tmp12 ...
224 mulq_rs.ph $s4, $s6, $s4 # tmp10 ...
225
226 shll_s.ph $v1, $v1, 1 # x2
227
# inptr and quantptr are advanced here, before the remaining stores; only
# wsptr ($a2) is still used for addressing below.
228 addiu $a0, $a0, 4
229 addiu $a1, $a1, 4
230
231 sw $s1, 112($a2) # wsptr[DCTSIZE*7]
232
233 shll_s.ph $s6, $v0, 2 # x4
234 shll_s.ph $s4, $s4, 1 # x2
235 addq.ph $s6, $s6, $v1 # ... tmp12
236
237 subq.ph $t5, $s6, $t7 # tmp6
238 subq.ph $s4, $s4, $v1 # ... tmp10
239 subq.ph $t3, $s5, $t5 # tmp5
240 addq.ph $s2, $t2, $t5 # tmp1 + tmp6
241 addq.ph $t1, $s4, $t3 # tmp4
242 subq.ph $s3, $t2, $t5 # tmp1 - tmp6
243
244 sw $s2, 16($a2) # wsptr[DCTSIZE*1]
245 sw $s3, 96($a2) # wsptr[DCTSIZE*6]
246
247 addq.ph $v0, $t4, $t3 # tmp2 + tmp5
248 subq.ph $v1, $t4, $t3 # tmp2 - tmp5
249
250 sw $v0, 32($a2) # wsptr[DCTSIZE*2]
251 sw $v1, 80($a2) # wsptr[DCTSIZE*5]
252
253 addq.ph $v0, $t6, $t1 # tmp3 + tmp4
254 subq.ph $v1, $t6, $t1 # tmp3 - tmp4
255
256 sw $v0, 64($a2) # wsptr[DCTSIZE*4]
257 sw $v1, 48($a2) # wsptr[DCTSIZE*3]
258
259continue_columns:
260
# Both paths have already advanced $a0/$a1; wsptr is advanced in the delay slot.
261 bne $a0, $t9, loop_columns
262 addiu $a2, $a2, 4 # delay slot: next workspace column pair
263
264
265 lw $s0, 28($sp)
266 lw $s1, 24($sp)
267 lw $s2, 20($sp)
268 lw $s3, 16($sp)
269 lw $s4, 12($sp)
270 lw $s5, 8($sp)
271 lw $s6, 4($sp)
272 lw $s7, 0($sp)
273
274 jr $ra
275 addiu $sp, $sp, 32 # delay slot: release the stack frame
276
277
278 .end mips_idct_columns
279
280
281##################################################################
282
283
284 .global mips_idct_rows
285 .ent mips_idct_rows
286
287# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
288# JDIMENSION output_col, const int * mips_idct_coefs);
#
# Row pass of the IDCT: reads the workspace two rows (32 bytes) at a time,
# runs the 1-D IDCT along both rows in parallel and writes eight output
# bytes per row to output_buf[row] + output_col.
#
# The two rows are interleaved on load so that each register holds the same
# column of both rows: first row in the lower halfword (lower-case letters
# in the comments), second row in the upper halfword (upper-case letters).
# The loop stops once wsptr has advanced 128 bytes, i.e. after 4 iterations.
#
# Results are shifted left by SHIFT with saturation, the upper byte of each
# halfword is kept, and 0x80 is added to every byte
# (presumably the JPEG sample level shift - confirm against mips_jidctfst.c).
#
# Saves and restores $s0-$s8 on the stack; returns nothing.
# Overwrites $a0, $a1, $a3, $at, $t0-$t9, $v0 and $v1.
# NOTE(review): the results are stored with sw, which needs a word-aligned
# address; this assumes output_buf[row] + output_col is 4-byte aligned -
# confirm against callers.
289
290mips_idct_rows:
291
292# $a0 - wsptr (advanced 32 bytes = 2 rows per iteration)
293# $a1 - output_buf (advanced 8 bytes = 2 row pointers per iteration)
294# $a2 - output_col
295# $a3 - mips_idct_coefs on entry (saved at 36($sp)), then outptr
296# $at - mips_idct_coefs (reloaded every iteration); second outptr in the all-zero path
297# $t0:7 - simd data
298# $t8 - coefficients, temp
299# $t9 - loop end address (wsptr + 128)
300# $s0:3 - second-row simd data on load, then temp results / outputs
301# $s4:7 - second-row simd data on load, then temp results / outputs
302# $s8 - const 0x80808080 (added to every output byte)
303# $v0:1 - temp results
304
305SHIFT = 2
306
307 addiu $sp, $sp, -48 # reserve stack space for s0-s8
308
309 # save $a3 (mips_idct_coefs) because it might get clobbered below
310 sw $a3, 36($sp)
311
312 sw $s0, 32($sp)
313 sw $s1, 28($sp)
314 sw $s2, 24($sp)
315 sw $s3, 20($sp)
316 sw $s4, 16($sp)
317 sw $s5, 12($sp)
318 sw $s6, 8($sp)
319 sw $s7, 4($sp)
320 sw $s8, 0($sp)
321
322 addiu $t9, $a0, 128 # end address
323
324 lui $s8, 0x8080
325 ori $s8, $s8, 0x8080 # $s8 = 0x80808080
326
327loop_rows:
328
# $at is reloaded on every iteration because the all-zero path below reuses
# it as an output pointer.
329 lw $at, 36($sp) # restore saved $a3 (mips_idct_coefs)
330
331 lw $t0, 0+0($a0) # wsptr[DCTSIZE*0+0/1] b a
332 lw $s0, 16+0($a0) # wsptr[DCTSIZE*1+0/1] B A
333 lw $t2, 0+4($a0) # wsptr[DCTSIZE*0+2/3] d c
334 lw $s2, 16+4($a0) # wsptr[DCTSIZE*1+2/3] D C
335 lw $t4, 0+8($a0) # wsptr[DCTSIZE*0+4/5] f e
336 lw $s4, 16+8($a0) # wsptr[DCTSIZE*1+4/5] F E
337 lw $t6, 0+12($a0) # wsptr[DCTSIZE*0+6/7] h g
338 lw $s6, 16+12($a0) # wsptr[DCTSIZE*1+6/7] H G
339
340 precrq.ph.w $t1, $s0, $t0 # B b
341 ins $t0, $s0, 16, 16 # A a
342
# Shortcut test: take the short path only if elements 1-7 of both rows are
# zero.  Each "or" sits in the delay slot of the branch before it.
343 bnez $t1, full_row
344 or $s0, $t2, $s2 # delay slot: elements 2/3 of both rows
345 bnez $s0, full_row
346 or $s0, $t4, $s4 # delay slot: elements 4/5 of both rows
347 bnez $s0, full_row
348 or $s0, $t6, $s6 # delay slot: elements 6/7 of both rows
349 bnez $s0, full_row
350
# Delay slot of the last bnez: also runs when full_row is taken, which is
# harmless because full_row writes $s0 before reading it.
351 shll_s.ph $s0, $t0, SHIFT # A a
352
353 lw $a3, 0($a1) # output_buf[0] (overwrites the coefs pointer in $a3)
354 lw $at, 4($a1) # output_buf[1] (overwrites the coefs pointer in $at)
355
356 precrq.ph.w $t0, $s0, $s0 # A A
357 ins $s0, $s0, 16, 16 # a a
358
359 addu $a3, $a3, $a2 # outptr for the first row
360 addu $at, $at, $a2 # outptr for the second row
361
362 precrq.qb.ph $t0, $t0, $t0 # A A A A
363 precrq.qb.ph $s0, $s0, $s0 # a a a a
364
365
366 addu.qb $s0, $s0, $s8 # + 0x80 per byte
367 addu.qb $t0, $t0, $s8 # + 0x80 per byte
368
369
370 sw $s0, 0($a3) # first row, outptr[0/1/2/3]
371 sw $s0, 4($a3) # first row, outptr[4/5/6/7]
372
373 sw $t0, 0($at) # second row, outptr[0/1/2/3]
374 sw $t0, 4($at) # second row, outptr[4/5/6/7]
375
376
377 addiu $a0, $a0, 32 # next pair of workspace rows
378
379 bne $a0, $t9, loop_rows
380 addiu $a1, $a1, 8 # delay slot: next pair of output row pointers
381
382 b exit_rows
383 nop
384
385
386full_row:
387
# Finish interleaving the two rows: odd elements go to $t3/$t5/$t7, even
# elements stay in $t2/$t4/$t6 ($t1/$t0 were prepared before the zero test).
388 precrq.ph.w $t3, $s2, $t2 # D d
389 ins $t2, $s2, 16, 16 # C c
390
391 precrq.ph.w $t5, $s4, $t4 # F f
392 ins $t4, $s4, 16, 16 # E e
393
394 precrq.ph.w $t7, $s6, $t6 # H h
395 ins $t6, $s6, 16, 16 # G g
396
397
398 lw $t8, 4($at) # FIX(1.414213562)
399
400 addq.ph $s4, $t0, $t4 # tmp10
401 subq.ph $s5, $t0, $t4 # tmp11
402
403 subq.ph $s6, $t2, $t6 # tmp12 ...
404 addq.ph $s7, $t2, $t6 # tmp13
405
406 mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ...
407
408 addq.ph $t0, $s4, $s7 # tmp0
409 subq.ph $t6, $s4, $s7 # tmp3
410
411 shll_s.ph $s6, $s6, 1 # x2
412
413 subq.ph $s6, $s6, $s7 # ... tmp12
414
415 addq.ph $t2, $s5, $s6 # tmp1
416 subq.ph $t4, $s5, $s6 # tmp2
417
418################
419
420 addq.ph $s5, $t1, $t7 # z11
421 subq.ph $s6, $t1, $t7 # z12
422
423 addq.ph $s7, $t5, $t3 # z13
424 subq.ph $v0, $t5, $t3 # z10
425
426 addq.ph $t7, $s5, $s7 # tmp7
427 subq.ph $s5, $s5, $s7 # tmp11 ...
428
429 addq.ph $v1, $v0, $s6 # z5 ...
430
431 mulq_rs.ph $s5, $s5, $t8 # ... tmp11
432
433 lw $t8, 8($at) # FIX(1.847759065)
434 lw $s4, 0($at) # FIX(1.082392200)
435
436 addq.ph $s0, $t0, $t7 # tmp0 + tmp7
437 subq.ph $s7, $t0, $t7 # tmp0 - tmp7
438
439 mulq_rs.ph $v1, $v1, $t8 # ... z5
440
441 lw $a3, 0($a1) # output_buf[0] (overwrites the coefs pointer in $a3)
442 lw $t8, 12($at) # FIX(-2.613125930)
443
444 shll_s.ph $s5, $s5, 1 # x2
445
446 addu $a3, $a3, $a2 # outptr for the first row
447
448 mulq_rs.ph $v0, $v0, $t8 # tmp12 ...
449 mulq_rs.ph $s4, $s6, $s4 # tmp10 ...
450
451 shll_s.ph $v1, $v1, 1 # x2
452
# Pointers are advanced early; the second row's output pointer is therefore
# fetched from -4($a1) further down.
453 addiu $a0, $a0, 32
454 addiu $a1, $a1, 8
455
456
457 shll_s.ph $s6, $v0, 2 # x4
458 shll_s.ph $s4, $s4, 1 # x2
459 addq.ph $s6, $s6, $v1 # ... tmp12
460
461 shll_s.ph $s0, $s0, SHIFT # output element 0
462
463 subq.ph $t5, $s6, $t7 # tmp6
464 subq.ph $s4, $s4, $v1 # ... tmp10
465 subq.ph $t3, $s5, $t5 # tmp5
466
467 shll_s.ph $s7, $s7, SHIFT # output element 7
468
469 addq.ph $t1, $s4, $t3 # tmp4
470
471
472 addq.ph $s1, $t2, $t5 # tmp1 + tmp6
473 subq.ph $s6, $t2, $t5 # tmp1 - tmp6
474
475 addq.ph $s2, $t4, $t3 # tmp2 + tmp5
476 subq.ph $s5, $t4, $t3 # tmp2 - tmp5
477
478 addq.ph $s4, $t6, $t1 # tmp3 + tmp4
479 subq.ph $s3, $t6, $t1 # tmp3 - tmp4
480
481
# After these shifts $s0..$s7 hold output elements 0..7 of both rows.
482 shll_s.ph $s1, $s1, SHIFT
483 shll_s.ph $s2, $s2, SHIFT
484 shll_s.ph $s3, $s3, SHIFT
485 shll_s.ph $s4, $s4, SHIFT
486 shll_s.ph $s5, $s5, SHIFT
487 shll_s.ph $s6, $s6, SHIFT
488
489
# Undo the interleave (second row into $t0/$t2/$t4/$t6, first row into
# $s0/$s2/$s4/$s6), then keep the upper byte of every halfword.
490 precrq.ph.w $t0, $s1, $s0 # B A
491 ins $s0, $s1, 16, 16 # b a
492
493 precrq.ph.w $t2, $s3, $s2 # D C
494 ins $s2, $s3, 16, 16 # d c
495
496 precrq.ph.w $t4, $s5, $s4 # F E
497 ins $s4, $s5, 16, 16 # f e
498
499 precrq.ph.w $t6, $s7, $s6 # H G
500 ins $s6, $s7, 16, 16 # h g
501
502 precrq.qb.ph $t0, $t2, $t0 # D C B A
503 precrq.qb.ph $s0, $s2, $s0 # d c b a
504
505 precrq.qb.ph $t4, $t6, $t4 # H G F E
506 precrq.qb.ph $s4, $s6, $s4 # h g f e
507
508
509 addu.qb $s0, $s0, $s8 # + 0x80 per byte
510 addu.qb $s4, $s4, $s8 # + 0x80 per byte
511
512
513 sw $s0, 0($a3) # outptr[0/1/2/3] d c b a
514 sw $s4, 4($a3) # outptr[4/5/6/7] h g f e
515
516 lw $a3, -4($a1) # second row pointer ($a1 was already advanced by 8)
517
518 addu.qb $t0, $t0, $s8 # + 0x80 per byte
519
520 addu $a3, $a3, $a2 # outptr for the second row
521
522 addu.qb $t4, $t4, $s8 # + 0x80 per byte
523
524
525 sw $t0, 0($a3) # outptr[0/1/2/3] D C B A
526
527 bne $a0, $t9, loop_rows
528 sw $t4, 4($a3) # outptr[4/5/6/7] H G F E (delay slot)
529
530
531exit_rows:
532
# The slot at 36($sp) held the caller's $a3 and is deliberately not reloaded.
533 lw $s0, 32($sp)
534 lw $s1, 28($sp)
535 lw $s2, 24($sp)
536 lw $s3, 20($sp)
537 lw $s4, 16($sp)
538 lw $s5, 12($sp)
539 lw $s6, 8($sp)
540 lw $s7, 4($sp)
541 lw $s8, 0($sp)
542
543 jr $ra
544 addiu $sp, $sp, 48 # delay slot: release the stack frame
545
546
547 .end mips_idct_rows