Raghu Gandham | 71ee859 | 2012-11-06 10:17:40 -0800 | [diff] [blame] | 1 | # |
| 2 | # Copyright (C) 2011 The Android Open Source Project |
| 3 | # |
| 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
| 7 | # |
| 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | # |
| 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
| 15 | |
| 16 | |
| 17 | # IDCT implementation using the MIPS DSP ASE (little endian version) |
| 18 | # |
| 19 | # See MIPS Technologies Inc documents: |
| 20 | # "JPEG Decoder Optimization for MIPS32(R) Cores" MD00483 |
| 21 | # |
| 22 | # "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP |
| 23 | # Application Specific Extension to the MIPS32(R) Architecture" MD00374 |
| 24 | # |
| 25 | |
# Assembler modes that apply to all code in this file:
#   noreorder - the assembler does not reorder instructions or fill branch
#               delay slots; the instruction written directly after each
#               branch/jump is its delay slot and executes whether or not
#               the branch is taken.
#   nomacro   - the assembler warns if an instruction would expand into
#               more than one machine instruction.
#   noat      - the assembler does not use $at as its own temporary, so the
#               code below can use $at explicitly without warnings.
| 26 | .set noreorder |
| 27 | .set nomacro |
| 28 | .set noat |
| 29 | |
| 30 | # This table has been moved to mips_jidctfst.c to avoid having to mess |
| 31 | # with the global pointer to make this code PIC. |
| 32 | # .rdata |
| 33 | # |
| 34 | # mips_idct_coefs: |
| 35 | # # Constant table of scaled IDCT coefficients. |
| 36 | # |
| 37 | # .word 0x45464546 # FIX( 1.082392200 / 2) = 17734 = 0x4546 |
| 38 | # .word 0x5A825A82 # FIX( 1.414213562 / 2) = 23170 = 0x5A82 |
| 39 | # .word 0x76427642 # FIX( 1.847759065 / 2) = 30274 = 0x7642 |
| 40 | # .word 0xAC61AC61 # FIX(-2.613125930 / 4) = -21407 = 0xAC61 |
| 41 | |
| 42 | .text |
| 43 | |
| 44 | .global mips_idct_columns |
| 45 | .ent mips_idct_columns |
| 46 | |
| 47 | # void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr, |
| 48 | # DCTELEM * wsptr, const int * mips_idct_coefs); |
#
# Column pass of the IDCT.
#
# Each loop iteration loads one 32-bit word from each of the eight input rows
# (byte offsets 0, 16, ..., 112 from inptr) together with the matching
# quantptr words, and treats every word as a pair of 16-bit values that are
# processed in parallel by the paired-halfword (.ph) DSP instructions.
# inptr, quantptr and wsptr advance by 4 bytes per iteration and the loop
# ends when inptr has advanced by 16 bytes, i.e. after 4 iterations.
#
# If rows 1..7 of the current word pair are all zero, the dequantized row-0
# word is simply stored to all eight workspace rows (see before full_column).
#
# mips_idct_coefs is read at byte offsets 0, 4, 8 and 12; the commented-out
# table near the top of this file lists the expected contents.
#
# Leaf function: saves/restores $s0-$s7 on a 32-byte frame, clobbers $at,
# $v0-$v1, $a0-$a2, $t0-$t9. No return value is set up for the caller.
| 49 | |
| 50 | mips_idct_columns: |
| 51 | |
| 52 | # $a0 - inptr |
| 53 | # $a1 - quantptr |
| 54 | # $a2 - wsptr |
| 55 | # $a3, $at - mips_idct_coefs (arrives in $a3, copied to $at before the loop) |
| 56 | # $t0:7 - simd data (two 16-bit values per register) |
| 57 | # $t8 - coefficients, temp |
| 58 | # $t9 - loop end address |
| 59 | # $s0:3 - simd quantization factors; reused for the final sums near the end |
| 60 | # $s4:7 - temp results |
| 61 | # $v0:1 - temp results |
| 62 | |
| 63 | addiu $sp, $sp, -32 # reserve stack space for s0-s7 |
| 64 | |
| 65 | sw $s0, 28($sp) |
| 66 | sw $s1, 24($sp) |
| 67 | sw $s2, 20($sp) |
| 68 | sw $s3, 16($sp) |
| 69 | sw $s4, 12($sp) |
| 70 | sw $s5, 8($sp) |
| 71 | sw $s6, 4($sp) |
| 72 | sw $s7, 0($sp) |
| 73 | |
| 74 | addiu $t9, $a0, 16 # end address: inptr + 16 bytes (4 iterations of 4 bytes) |
| 75 | |
| 76 | #lui $at, %hi(mips_idct_coefs) |
| 77 | #ori $at, %lo(mips_idct_coefs) |
| 78 | # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it |
| 79 | or $at, $a3, $zero |
| 80 | |
| 81 | loop_columns: |
| 82 | |
| 83 | lw $s0, 0($a1) # quantptr[DCTSIZE*0] |
| 84 | |
| 85 | lw $t0, 0($a0) # inptr[DCTSIZE*0] |
| 86 | lw $t1, 16($a0) # inptr[DCTSIZE*1] |
| 87 | |
| 88 | muleq_s.w.phl $v0, $t0, $s0 # tmp0 ... (32-bit product of the upper halfwords) |
| 89 | |
| 90 | lw $t2, 32($a0) # inptr[DCTSIZE*2] |
| 91 | lw $t3, 48($a0) # inptr[DCTSIZE*3] |
| 92 | lw $t4, 64($a0) # inptr[DCTSIZE*4] |
| 93 | lw $t5, 80($a0) # inptr[DCTSIZE*5] |
| 94 | |
| 95 | muleq_s.w.phr $t0, $t0, $s0 # ... tmp0 ... (32-bit product of the lower halfwords) |
| 96 | |
| 97 | lw $t6, 96($a0) # inptr[DCTSIZE*6] |
| 98 | lw $t7, 112($a0) # inptr[DCTSIZE*7] |
| 99 | |
# Shortcut test: OR the raw input words of rows 1..7 together. Any non-zero
# bit in either halfword sends this iteration to full_column.
| 100 | or $s4, $t1, $t2 |
| 101 | or $s5, $t3, $t4 |
| 102 | |
| 103 | bnez $s4, full_column |
| 104 | ins $t0, $v0, 16, 16 # ... tmp0 (delay slot, runs on both paths): low 16 bits of $v0 become the upper halfword of $t0 |
| 105 | |
| 106 | bnez $s5, full_column |
# delay slot of the branch above
| 107 | or $s6, $t5, $t6 |
| 108 | or $s6, $s6, $t7 |
| 109 | bnez $s6, full_column |
| 110 | |
# The first store below sits in the delay slot of the bnez above, so it is
# also executed when the branch to full_column is taken; full_column later
# stores its own result to wsptr[DCTSIZE*0].
| 111 | sw $t0, 0($a2) # wsptr[DCTSIZE*0] |
| 112 | sw $t0, 16($a2) # wsptr[DCTSIZE*1] |
| 113 | sw $t0, 32($a2) # wsptr[DCTSIZE*2] |
| 114 | sw $t0, 48($a2) # wsptr[DCTSIZE*3] |
| 115 | sw $t0, 64($a2) # wsptr[DCTSIZE*4] |
| 116 | sw $t0, 80($a2) # wsptr[DCTSIZE*5] |
| 117 | sw $t0, 96($a2) # wsptr[DCTSIZE*6] |
| 118 | sw $t0, 112($a2) # wsptr[DCTSIZE*7] |
| 119 | |
| 120 | addiu $a0, $a0, 4 |
| 121 | |
| 122 | b continue_columns |
# delay slot: advance quantptr together with inptr
| 123 | addiu $a1, $a1, 4 |
| 124 | |
| 125 | |
| 126 | full_column: |
| 127 | |
# Part using rows 0, 2, 4, 6. Each row is multiplied by its quantptr word
# with the same phl / phr / ins sequence used for row 0 above. Loads for
# later steps are interleaved between the multiplies.
| 128 | lw $s1, 32($a1) # quantptr[DCTSIZE*2] |
| 129 | lw $s2, 64($a1) # quantptr[DCTSIZE*4] |
| 130 | |
| 131 | muleq_s.w.phl $v0, $t2, $s1 # tmp1 ... |
| 132 | muleq_s.w.phr $t2, $t2, $s1 # ... tmp1 ... |
| 133 | |
| 134 | lw $s0, 16($a1) # quantptr[DCTSIZE*1] |
| 135 | lw $s1, 48($a1) # quantptr[DCTSIZE*3] |
| 136 | lw $s3, 96($a1) # quantptr[DCTSIZE*6] |
| 137 | |
| 138 | muleq_s.w.phl $v1, $t4, $s2 # tmp2 ... |
| 139 | muleq_s.w.phr $t4, $t4, $s2 # ... tmp2 ... |
| 140 | |
| 141 | lw $s2, 80($a1) # quantptr[DCTSIZE*5] |
| 142 | lw $t8, 4($at) # FIX(1.414213562); the table comment lists it pre-divided by 2, hence the x2 shifts below |
| 143 | ins $t2, $v0, 16, 16 # ... tmp1 |
| 144 | |
| 145 | muleq_s.w.phl $v0, $t6, $s3 # tmp3 ... |
| 146 | muleq_s.w.phr $t6, $t6, $s3 # ... tmp3 ... |
| 147 | |
| 148 | ins $t4, $v1, 16, 16 # ... tmp2 |
| 149 | |
| 150 | addq.ph $s4, $t0, $t4 # tmp10 = tmp0 + tmp2 |
| 151 | subq.ph $s5, $t0, $t4 # tmp11 = tmp0 - tmp2 |
| 152 | |
| 153 | ins $t6, $v0, 16, 16 # ... tmp3 |
| 154 | |
| 155 | subq.ph $s6, $t2, $t6 # tmp12 ... (tmp1 - tmp3) |
| 156 | addq.ph $s7, $t2, $t6 # tmp13 = tmp1 + tmp3 |
| 157 | |
| 158 | mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ... (times the 4($at) coefficient) |
| 159 | |
| 160 | addq.ph $t0, $s4, $s7 # tmp0 = tmp10 + tmp13 |
| 161 | subq.ph $t6, $s4, $s7 # tmp3 = tmp10 - tmp13 |
| 162 | |
| 163 | ################ |
# Part using rows 1, 3, 5, 7. The tail of the tmp12 computation from the
# block above is interleaved with the first multiplies here.
| 164 | |
| 165 | muleq_s.w.phl $v0, $t1, $s0 # tmp4 ... |
| 166 | muleq_s.w.phr $t1, $t1, $s0 # ... tmp4 ... |
| 167 | |
| 168 | shll_s.ph $s6, $s6, 1 # x2 |
| 169 | |
| 170 | lw $s3, 112($a1) # quantptr[DCTSIZE*7] |
| 171 | |
| 172 | subq.ph $s6, $s6, $s7 # ... tmp12 (minus tmp13) |
| 173 | |
| 174 | muleq_s.w.phl $v1, $t7, $s3 # tmp7 ... |
| 175 | muleq_s.w.phr $t7, $t7, $s3 # ... tmp7 ... |
| 176 | |
| 177 | ins $t1, $v0, 16, 16 # ... tmp4 |
| 178 | |
| 179 | addq.ph $t2, $s5, $s6 # tmp1 = tmp11 + tmp12 |
| 180 | subq.ph $t4, $s5, $s6 # tmp2 = tmp11 - tmp12 |
| 181 | |
| 182 | muleq_s.w.phl $v0, $t5, $s2 # tmp6 ... |
| 183 | muleq_s.w.phr $t5, $t5, $s2 # ... tmp6 ... |
| 184 | |
| 185 | ins $t7, $v1, 16, 16 # ... tmp7 |
| 186 | |
| 187 | addq.ph $s5, $t1, $t7 # z11 = tmp4 + tmp7 |
| 188 | subq.ph $s6, $t1, $t7 # z12 = tmp4 - tmp7 |
| 189 | |
| 190 | muleq_s.w.phl $v1, $t3, $s1 # tmp5 ... |
| 191 | muleq_s.w.phr $t3, $t3, $s1 # ... tmp5 ... |
| 192 | |
| 193 | ins $t5, $v0, 16, 16 # ... tmp6 |
| 194 | |
| 195 | # stalls |
| 196 | |
| 197 | ins $t3, $v1, 16, 16 # ... tmp5 |
| 198 | |
| 199 | |
| 200 | addq.ph $s7, $t5, $t3 # z13 = tmp6 + tmp5 |
| 201 | subq.ph $v0, $t5, $t3 # z10 = tmp6 - tmp5 |
| 202 | |
| 203 | addq.ph $t7, $s5, $s7 # tmp7 = z11 + z13 |
| 204 | subq.ph $s5, $s5, $s7 # tmp11 ... (z11 - z13) |
| 205 | |
| 206 | addq.ph $v1, $v0, $s6 # z5 ... (z10 + z12) |
| 207 | |
| 208 | mulq_rs.ph $s5, $s5, $t8 # ... tmp11 ($t8 still holds the 4($at) coefficient) |
| 209 | |
| 210 | lw $t8, 8($at) # FIX(1.847759065) |
| 211 | lw $s4, 0($at) # FIX(1.082392200) |
| 212 | |
# tmp0 + tmp7 and tmp0 - tmp7, stored below to wsptr rows 0 and 7
| 213 | addq.ph $s0, $t0, $t7 |
| 214 | subq.ph $s1, $t0, $t7 |
| 215 | |
| 216 | mulq_rs.ph $v1, $v1, $t8 # ... z5 |
| 217 | |
| 218 | shll_s.ph $s5, $s5, 1 # x2 |
| 219 | |
| 220 | lw $t8, 12($at) # FIX(-2.613125930); the table comment lists it pre-divided by 4, hence the x4 shift below |
| 221 | sw $s0, 0($a2) # wsptr[DCTSIZE*0] |
| 222 | |
| 223 | mulq_rs.ph $v0, $v0, $t8 # tmp12 ... (z10 times the 12($at) coefficient) |
| 224 | mulq_rs.ph $s4, $s6, $s4 # tmp10 ... (z12 times the 0($at) coefficient) |
| 225 | |
| 226 | shll_s.ph $v1, $v1, 1 # x2 |
| 227 | |
# inptr and quantptr are advanced here, in the middle of the computation;
# all loads for this iteration are done. wsptr ($a2) is advanced in the delay
# slot of the loop branch at continue_columns.
| 228 | addiu $a0, $a0, 4 |
| 229 | addiu $a1, $a1, 4 |
| 230 | |
| 231 | sw $s1, 112($a2) # wsptr[DCTSIZE*7] |
| 232 | |
| 233 | shll_s.ph $s6, $v0, 2 # x4 |
| 234 | shll_s.ph $s4, $s4, 1 # x2 |
| 235 | addq.ph $s6, $s6, $v1 # ... tmp12 (plus z5) |
| 236 | |
| 237 | subq.ph $t5, $s6, $t7 # tmp6 = tmp12 - tmp7 |
| 238 | subq.ph $s4, $s4, $v1 # ... tmp10 (minus z5) |
| 239 | subq.ph $t3, $s5, $t5 # tmp5 = tmp11 - tmp6 |
# tmp1 + tmp6 (row 1); the matching difference for row 6 follows tmp4
| 240 | addq.ph $s2, $t2, $t5 |
| 241 | addq.ph $t1, $s4, $t3 # tmp4 = tmp10 + tmp5 |
| 242 | subq.ph $s3, $t2, $t5 |
| 243 | |
| 244 | sw $s2, 16($a2) # wsptr[DCTSIZE*1] |
| 245 | sw $s3, 96($a2) # wsptr[DCTSIZE*6] |
| 246 | |
# tmp2 + tmp5 and tmp2 - tmp5
| 247 | addq.ph $v0, $t4, $t3 |
| 248 | subq.ph $v1, $t4, $t3 |
| 249 | |
| 250 | sw $v0, 32($a2) # wsptr[DCTSIZE*2] |
| 251 | sw $v1, 80($a2) # wsptr[DCTSIZE*5] |
| 252 | |
# tmp3 + tmp4 goes to row 4 and tmp3 - tmp4 to row 3
| 253 | addq.ph $v0, $t6, $t1 |
| 254 | subq.ph $v1, $t6, $t1 |
| 255 | |
| 256 | sw $v0, 64($a2) # wsptr[DCTSIZE*4] |
| 257 | sw $v1, 48($a2) # wsptr[DCTSIZE*3] |
| 258 | |
| 259 | continue_columns: |
| 260 | |
# Both paths arrive here with $a0/$a1 already advanced; loop until inptr
# reaches the end address. The delay slot advances wsptr on every pass.
| 261 | bne $a0, $t9, loop_columns |
| 262 | addiu $a2, $a2, 4 |
| 263 | |
| 264 | |
# Epilogue: restore the callee-saved registers and return.
| 265 | lw $s0, 28($sp) |
| 266 | lw $s1, 24($sp) |
| 267 | lw $s2, 20($sp) |
| 268 | lw $s3, 16($sp) |
| 269 | lw $s4, 12($sp) |
| 270 | lw $s5, 8($sp) |
| 271 | lw $s6, 4($sp) |
| 272 | lw $s7, 0($sp) |
| 273 | |
| 274 | jr $ra |
# delay slot: release the 32-byte frame
| 275 | addiu $sp, $sp, 32 |
| 276 | |
| 277 | |
| 278 | .end mips_idct_columns |
| 279 | |
| 280 | |
| 281 | ################################################################## |
| 282 | |
| 283 | |
| 284 | .global mips_idct_rows |
| 285 | .ent mips_idct_rows |
| 286 | |
| 287 | # void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf, |
| 288 | # JDIMENSION output_col, const int * mips_idct_coefs); |
#
# Row pass of the IDCT.
#
# Each loop iteration reads two consecutive workspace rows (16 bytes each, at
# 0($a0) and 16($a0)), regroups them so that every register holds the same
# element of both rows as a pair of halfwords, and runs the same tmp/z
# arithmetic as mips_idct_columns on both rows at once. Each result is
# shifted left by SHIFT with saturation, reduced to the upper byte of its
# halfword, and 0x80 is added to every byte before 8 bytes are stored to each
# of the two output rows.
#
# Output row pointers are read as 4-byte entries from output_buf (two per
# iteration) and output_col is added to each. wsptr advances by 32 bytes per
# iteration and the loop ends after 128 bytes, i.e. 4 iterations.
#
# If elements 1..7 of both rows are zero, element 0 of each row is replicated
# across that row's 8 output bytes instead (code before full_row).
#
# Leaf function: saves/restores $s0-$s8 on a 48-byte frame that also holds
# the incoming $a3; clobbers $at, $v0-$v1, $a0, $a1, $a3, $t0-$t9.
| 289 | |
| 290 | mips_idct_rows: |
| 291 | |
| 292 | # $a0 - wsptr |
| 293 | # $a1 - output_buf |
| 294 | # $a2 - output_col |
| 295 | # $a3 - mips_idct_coefs on entry (saved at 36($sp)), then reused as outptr |
| 296 | # $at - mips_idct_coefs, reloaded from 36($sp) each iteration; second outptr on the short path |
| 297 | # $t0:7 - simd data |
| 298 | # $t8 - coefficients, temp |
| 299 | # $t9 - loop end address |
| 300 | # $s0:3 - simd data loaded from wsptr, then temp/final results (no quantization in this pass) |
| 301 | # $s4:7 - simd data loaded from wsptr, then temp/final results |
| 302 | # $s8 - const 0x80808080 |
| 303 | # $v0:1 - temp results |
| 304 | |
# Left-shift count applied (with saturation) to every result before the
# upper byte of each halfword is extracted.
| 305 | SHIFT = 2 |
| 306 | |
| 307 | addiu $sp, $sp, -48 # reserve stack space for s0-s8 and the saved $a3 |
| 308 | |
| 309 | # save $a3 (mips_idct_coefs) because $a3 is reused as the output pointer below |
| 310 | sw $a3, 36($sp) |
| 311 | |
| 312 | sw $s0, 32($sp) |
| 313 | sw $s1, 28($sp) |
| 314 | sw $s2, 24($sp) |
| 315 | sw $s3, 20($sp) |
| 316 | sw $s4, 16($sp) |
| 317 | sw $s5, 12($sp) |
| 318 | sw $s6, 8($sp) |
| 319 | sw $s7, 4($sp) |
| 320 | sw $s8, 0($sp) |
| 321 | |
| 322 | addiu $t9, $a0, 128 # end address: wsptr + 128 bytes (4 iterations of 32 bytes) |
| 323 | |
# $s8 = 0x80808080, added to every output byte with addu.qb
| 324 | lui $s8, 0x8080 |
| 325 | ori $s8, $s8, 0x8080 |
| 326 | |
| 327 | loop_rows: |
| 328 | |
| 329 | lw $at, 36($sp) # restore saved $a3 (mips_idct_coefs); needed every pass because the short path overwrites $at |
| 330 | |
# Lower-case letters are elements of the first row, upper-case letters the
# same elements of the second row; the right-hand letter is the low halfword.
| 331 | lw $t0, 0+0($a0) # wsptr[DCTSIZE*0+0/1] b a |
| 332 | lw $s0, 16+0($a0) # wsptr[DCTSIZE*1+0/1] B A |
| 333 | lw $t2, 0+4($a0) # wsptr[DCTSIZE*0+2/3] d c |
| 334 | lw $s2, 16+4($a0) # wsptr[DCTSIZE*1+2/3] D C |
| 335 | lw $t4, 0+8($a0) # wsptr[DCTSIZE*0+4/5] f e |
| 336 | lw $s4, 16+8($a0) # wsptr[DCTSIZE*1+4/5] F E |
| 337 | lw $t6, 0+12($a0) # wsptr[DCTSIZE*0+6/7] h g |
| 338 | lw $s6, 16+12($a0) # wsptr[DCTSIZE*1+6/7] H G |
| 339 | |
| 340 | precrq.ph.w $t1, $s0, $t0 # B b (upper halfwords: element 1 of both rows) |
| 341 | ins $t0, $s0, 16, 16 # A a (element 0 of both rows) |
| 342 | |
# Short-path test: any non-zero element 1..7 in either row goes to full_row.
# Every instruction directly after a bnez below is its delay slot and runs
# on both paths; they only write $s0, which full_row recomputes before use.
| 343 | bnez $t1, full_row |
| 344 | or $s0, $t2, $s2 |
| 345 | bnez $s0, full_row |
| 346 | or $s0, $t4, $s4 |
| 347 | bnez $s0, full_row |
| 348 | or $s0, $t6, $s6 |
| 349 | bnez $s0, full_row |
| 350 | |
| 351 | shll_s.ph $s0, $t0, SHIFT # A a (delay slot of the last bnez) |
| 352 | |
# Fetch both output row pointers. $at is overwritten here, which is why the
# coefficient pointer is reloaded at the top of loop_rows.
| 353 | lw $a3, 0($a1) |
| 354 | lw $at, 4($a1) |
| 355 | |
| 356 | precrq.ph.w $t0, $s0, $s0 # A A |
| 357 | ins $s0, $s0, 16, 16 # a a |
| 358 | |
# add output_col to both row pointers
| 359 | addu $a3, $a3, $a2 |
| 360 | addu $at, $at, $a2 |
| 361 | |
| 362 | precrq.qb.ph $t0, $t0, $t0 # A A A A (upper byte of each halfword) |
| 363 | precrq.qb.ph $s0, $s0, $s0 # a a a a |
| 364 | |
| 365 | |
# add 0x80 to each of the four bytes
| 366 | addu.qb $s0, $s0, $s8 |
| 367 | addu.qb $t0, $t0, $s8 |
| 368 | |
| 369 | |
# first output row: 8 identical bytes
| 370 | sw $s0, 0($a3) |
| 371 | sw $s0, 4($a3) |
| 372 | |
# second output row: 8 identical bytes
| 373 | sw $t0, 0($at) |
| 374 | sw $t0, 4($at) |
| 375 | |
| 376 | |
| 377 | addiu $a0, $a0, 32 |
| 378 | |
| 379 | bne $a0, $t9, loop_rows |
# delay slot: step output_buf past the two row pointers just used
| 380 | addiu $a1, $a1, 8 |
| 381 | |
# reached only when the final iteration took the short path
| 382 | b exit_rows |
| 383 | nop |
| 384 | |
| 385 | |
| 386 | full_row: |
| 387 | |
# Finish the regrouping started at loop_rows: afterwards $t0..$t7 hold
# elements 0..7, each as (second row : first row) in (upper : lower) halfword.
| 388 | precrq.ph.w $t3, $s2, $t2 |
| 389 | ins $t2, $s2, 16, 16 |
| 390 | |
| 391 | precrq.ph.w $t5, $s4, $t4 |
| 392 | ins $t4, $s4, 16, 16 |
| 393 | |
| 394 | precrq.ph.w $t7, $s6, $t6 |
| 395 | ins $t6, $s6, 16, 16 |
| 396 | |
| 397 | |
| 398 | lw $t8, 4($at) # FIX(1.414213562); the table comment lists it pre-divided by 2, hence the x2 shifts below |
| 399 | |
# Part using elements 0, 2, 4, 6.
| 400 | addq.ph $s4, $t0, $t4 # tmp10 |
| 401 | subq.ph $s5, $t0, $t4 # tmp11 |
| 402 | |
| 403 | subq.ph $s6, $t2, $t6 # tmp12 ... |
| 404 | addq.ph $s7, $t2, $t6 # tmp13 |
| 405 | |
| 406 | mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ... |
| 407 | |
| 408 | addq.ph $t0, $s4, $s7 # tmp0 |
| 409 | subq.ph $t6, $s4, $s7 # tmp3 |
| 410 | |
| 411 | shll_s.ph $s6, $s6, 1 # x2 |
| 412 | |
| 413 | subq.ph $s6, $s6, $s7 # ... tmp12 |
| 414 | |
| 415 | addq.ph $t2, $s5, $s6 # tmp1 |
| 416 | subq.ph $t4, $s5, $s6 # tmp2 |
| 417 | |
| 418 | ################ |
# Part using elements 1, 3, 5, 7.
| 419 | |
| 420 | addq.ph $s5, $t1, $t7 # z11 |
| 421 | subq.ph $s6, $t1, $t7 # z12 |
| 422 | |
| 423 | addq.ph $s7, $t5, $t3 # z13 |
| 424 | subq.ph $v0, $t5, $t3 # z10 |
| 425 | |
| 426 | addq.ph $t7, $s5, $s7 # tmp7 |
| 427 | subq.ph $s5, $s5, $s7 # tmp11 ... |
| 428 | |
| 429 | addq.ph $v1, $v0, $s6 # z5 ... |
| 430 | |
| 431 | mulq_rs.ph $s5, $s5, $t8 # ... tmp11 |
| 432 | |
| 433 | lw $t8, 8($at) # FIX(1.847759065) |
| 434 | lw $s4, 0($at) # FIX(1.082392200) |
| 435 | |
| 436 | addq.ph $s0, $t0, $t7 # tmp0 + tmp7 (output element 0) |
| 437 | subq.ph $s7, $t0, $t7 # tmp0 - tmp7 (output element 7) |
| 438 | |
| 439 | mulq_rs.ph $v1, $v1, $t8 # ... z5 |
| 440 | |
# pointer to the first output row of this pair (output_col is added below)
| 441 | lw $a3, 0($a1) |
| 442 | lw $t8, 12($at) # FIX(-2.613125930); the table comment lists it pre-divided by 4, hence the x4 shift below |
| 443 | |
| 444 | shll_s.ph $s5, $s5, 1 # x2 |
| 445 | |
| 446 | addu $a3, $a3, $a2 |
| 447 | |
| 448 | mulq_rs.ph $v0, $v0, $t8 # tmp12 ... |
| 449 | mulq_rs.ph $s4, $s6, $s4 # tmp10 ... |
| 450 | |
| 451 | shll_s.ph $v1, $v1, 1 # x2 |
| 452 | |
# Advance wsptr and output_buf now; the second row pointer of this pair is
# fetched later from -4($a1).
| 453 | addiu $a0, $a0, 32 |
| 454 | addiu $a1, $a1, 8 |
| 455 | |
| 456 | |
| 457 | shll_s.ph $s6, $v0, 2 # x4 |
| 458 | shll_s.ph $s4, $s4, 1 # x2 |
| 459 | addq.ph $s6, $s6, $v1 # ... tmp12 |
| 460 | |
# saturating shift of output element 0 by SHIFT
| 461 | shll_s.ph $s0, $s0, SHIFT |
| 462 | |
| 463 | subq.ph $t5, $s6, $t7 # tmp6 |
| 464 | subq.ph $s4, $s4, $v1 # ... tmp10 |
| 465 | subq.ph $t3, $s5, $t5 # tmp5 |
| 466 | |
# saturating shift of output element 7 by SHIFT
| 467 | shll_s.ph $s7, $s7, SHIFT |
| 468 | |
| 469 | addq.ph $t1, $s4, $t3 # tmp4 |
| 470 | |
| 471 | |
| 472 | addq.ph $s1, $t2, $t5 # tmp1 + tmp6 (output element 1) |
| 473 | subq.ph $s6, $t2, $t5 # tmp1 - tmp6 (output element 6) |
| 474 | |
| 475 | addq.ph $s2, $t4, $t3 # tmp2 + tmp5 (output element 2) |
| 476 | subq.ph $s5, $t4, $t3 # tmp2 - tmp5 (output element 5) |
| 477 | |
| 478 | addq.ph $s4, $t6, $t1 # tmp3 + tmp4 (output element 4) |
| 479 | subq.ph $s3, $t6, $t1 # tmp3 - tmp4 (output element 3) |
| 480 | |
| 481 | |
# same saturating shift for output elements 1..6; after this $s0..$s7 hold
# output elements 0..7 in order
| 482 | shll_s.ph $s1, $s1, SHIFT |
| 483 | shll_s.ph $s2, $s2, SHIFT |
| 484 | shll_s.ph $s3, $s3, SHIFT |
| 485 | shll_s.ph $s4, $s4, SHIFT |
| 486 | shll_s.ph $s5, $s5, SHIFT |
| 487 | shll_s.ph $s6, $s6, SHIFT |
| 488 | |
| 489 | |
# Regroup per row: precrq.ph.w collects the upper halfwords (second row),
# ins collects the lower halfwords (first row).
| 490 | precrq.ph.w $t0, $s1, $s0 # B A |
| 491 | ins $s0, $s1, 16, 16 # b a |
| 492 | |
| 493 | precrq.ph.w $t2, $s3, $s2 # D C |
| 494 | ins $s2, $s3, 16, 16 # d c |
| 495 | |
| 496 | precrq.ph.w $t4, $s5, $s4 # F E |
| 497 | ins $s4, $s5, 16, 16 # f e |
| 498 | |
| 499 | precrq.ph.w $t6, $s7, $s6 # H G |
| 500 | ins $s6, $s7, 16, 16 # h g |
| 501 | |
# keep the upper byte of each halfword, four bytes per word
| 502 | precrq.qb.ph $t0, $t2, $t0 # D C B A |
| 503 | precrq.qb.ph $s0, $s2, $s0 # d c b a |
| 504 | |
| 505 | precrq.qb.ph $t4, $t6, $t4 # H G F E |
| 506 | precrq.qb.ph $s4, $s6, $s4 # h g f e |
| 507 | |
| 508 | |
# add 0x80 to each byte of the first row
| 509 | addu.qb $s0, $s0, $s8 |
| 510 | addu.qb $s4, $s4, $s8 |
| 511 | |
| 512 | |
| 513 | sw $s0, 0($a3) # outptr[0/1/2/3] d c b a |
| 514 | sw $s4, 4($a3) # outptr[4/5/6/7] h g f e |
| 515 | |
# second row pointer of this pair: $a1 was already advanced by 8 above
| 516 | lw $a3, -4($a1) |
| 517 | |
| 518 | addu.qb $t0, $t0, $s8 |
| 519 | |
| 520 | addu $a3, $a3, $a2 |
| 521 | |
| 522 | addu.qb $t4, $t4, $s8 |
| 523 | |
| 524 | |
| 525 | sw $t0, 0($a3) # outptr[0/1/2/3] D C B A |
| 526 | |
| 527 | bne $a0, $t9, loop_rows |
| 528 | sw $t4, 4($a3) # outptr[4/5/6/7] H G F E (delay slot: stored on every pass) |
| 529 | |
| 530 | |
| 531 | exit_rows: |
| 532 | |
# Epilogue: restore the callee-saved registers and return.
| 533 | lw $s0, 32($sp) |
| 534 | lw $s1, 28($sp) |
| 535 | lw $s2, 24($sp) |
| 536 | lw $s3, 20($sp) |
| 537 | lw $s4, 16($sp) |
| 538 | lw $s5, 12($sp) |
| 539 | lw $s6, 8($sp) |
| 540 | lw $s7, 4($sp) |
| 541 | lw $s8, 0($sp) |
| 542 | |
| 543 | jr $ra |
# delay slot: release the 48-byte frame
| 544 | addiu $sp, $sp, 48 |
| 545 | |
| 546 | |
| 547 | .end mips_idct_rows |