Blame - files/source/row_mips.cc - platform/external/libyuv

Hangyu Kuang

f047e7c

2016-07-06 14:21:45 -0700

[diff] [blame]

/*
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

10

11

#include "libyuv/row.h"

#ifdef __cplusplus

namespace libyuv {

extern "C" {

#endif

// The following are available on Mips platforms:

19

#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \

20

(_MIPS_SIM == _MIPS_SIM_ABI32)

21

22

#ifdef HAS_COPYROW_MIPS
// Copies |count| bytes from |src| to |dst| using hand-scheduled MIPS32
// assembly.
//
// Strategy, as implemented below:
//   * count < 8: plain byte loop ($last8).
//   * src and dst share the same alignment within a word
//     (((src ^ dst) & 3) == 0): align both with one lwr/swr pair, then copy
//     64-byte chunks, at most one 32-byte chunk, whole words and finally the
//     remaining bytes using aligned lw/sw.
//   * otherwise ("unaligned"): dst is word-aligned first, then the same chunk
//     sizes are copied with lwr/lwl loads from src and aligned sw stores.
//
// NOTE(review): the lwr/swr head copy and the lwr(0)/lwl(3) pairing presumably
// assume a little-endian target - confirm before enabling on big-endian.
// NOTE(review): the routine returns with "j $ra" from inside the asm
// statement, bypassing any compiler-generated epilogue. That looks safe only
// while this function is never inlined and needs no stack frame - confirm.
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
  __asm__ __volatile__ (
    ".set noreorder \n"
    ".set noat \n"
    "slti $at, %[count], 8 \n"
    "bne $at ,$zero, $last8 \n"
    "xor $t8, %[src], %[dst] \n"  // (delay slot) alignment difference
    "andi $t8, $t8, 0x3 \n"

    "bne $t8, $zero, unaligned \n"
    "negu $a3, %[dst] \n"  // (delay slot) a3 = -dst
    // make dst/src aligned
    "andi $a3, $a3, 0x3 \n"
    "beq $a3, $zero, $chk16w \n"
    // once word-aligned, count is the remaining byte count
    "subu %[count], %[count], $a3 \n"

    "lwr $t8, 0(%[src]) \n"
    "addu %[src], %[src], $a3 \n"
    "swr $t8, 0(%[dst]) \n"
    "addu %[dst], %[dst], $a3 \n"

    // Now the dst/src are mutually word-aligned with word-aligned addresses
    "$chk16w: \n"
    "andi $t8, %[count], 0x3f \n"  // whole 64-B chunks?
    // t8 is the byte count after 64-byte chunks
    "beq %[count], $t8, chk8w \n"
    // There will be at most 1 32-byte chunk after it
    "subu $a3, %[count], $t8 \n"  // (delay slot) the remainder
    // Here a3 counts bytes in 16w chunks
    "addu $a3, %[dst], $a3 \n"
    // Now a3 is the final dst after 64-byte chunks
    "addu $t0, %[dst], %[count] \n"
    // t0 is the "past the end" address

    // When in the loop we exercise "pref 30,x(dst)", dst+x should not be
    // past the "t0-32" address.
    // This means: for x=128 the last "safe" dst address is "t0-160".
    // Alternatively, for x=64 the last "safe" dst address is "t0-96".
    // We will use "pref 30,128(dst)", so "t0-160" is the limit.
    "subu $t9, $t0, 160 \n"
    // t9 is the "last safe pref 30,128(dst)" address
    "pref 0, 0(%[src]) \n"  // first line of src
    "pref 0, 32(%[src]) \n"  // second line of src
    "pref 0, 64(%[src]) \n"
    "pref 30, 32(%[dst]) \n"
    // In case dst > t9 don't use "pref 30" at all
    "sgtu $v1, %[dst], $t9 \n"
    "bgtz $v1, $loop16w \n"
    "nop \n"
    // otherwise, start with using pref30
    "pref 30, 64(%[dst]) \n"
    "$loop16w: \n"
    "pref 0, 96(%[src]) \n"
    "lw $t0, 0(%[src]) \n"
    "bgtz $v1, $skip_pref30_96 \n"  // skip
    "lw $t1, 4(%[src]) \n"  // (delay slot)
    "pref 30, 96(%[dst]) \n"  // continue
    "$skip_pref30_96: \n"
    "lw $t2, 8(%[src]) \n"
    "lw $t3, 12(%[src]) \n"
    "lw $t4, 16(%[src]) \n"
    "lw $t5, 20(%[src]) \n"
    "lw $t6, 24(%[src]) \n"
    "lw $t7, 28(%[src]) \n"
    "pref 0, 128(%[src]) \n"
    // bring the next lines of src, addr 128
    "sw $t0, 0(%[dst]) \n"
    "sw $t1, 4(%[dst]) \n"
    "sw $t2, 8(%[dst]) \n"
    "sw $t3, 12(%[dst]) \n"
    "sw $t4, 16(%[dst]) \n"
    "sw $t5, 20(%[dst]) \n"
    "sw $t6, 24(%[dst]) \n"
    "sw $t7, 28(%[dst]) \n"
    "lw $t0, 32(%[src]) \n"
    "bgtz $v1, $skip_pref30_128 \n"  // skip pref 30,128(dst)
    "lw $t1, 36(%[src]) \n"  // (delay slot)
    "pref 30, 128(%[dst]) \n"  // set dest, addr 128
    "$skip_pref30_128: \n"
    "lw $t2, 40(%[src]) \n"
    "lw $t3, 44(%[src]) \n"
    "lw $t4, 48(%[src]) \n"
    "lw $t5, 52(%[src]) \n"
    "lw $t6, 56(%[src]) \n"
    "lw $t7, 60(%[src]) \n"
    "pref 0, 160(%[src]) \n"
    // bring the next lines of src, addr 160
    "sw $t0, 32(%[dst]) \n"
    "sw $t1, 36(%[dst]) \n"
    "sw $t2, 40(%[dst]) \n"
    "sw $t3, 44(%[dst]) \n"
    "sw $t4, 48(%[dst]) \n"
    "sw $t5, 52(%[dst]) \n"
    "sw $t6, 56(%[dst]) \n"
    "sw $t7, 60(%[dst]) \n"

    "addiu %[dst], %[dst], 64 \n"  // adding 64 to dest
    "sgtu $v1, %[dst], $t9 \n"
    "bne %[dst], $a3, $loop16w \n"
    " addiu %[src], %[src], 64 \n"  // adding 64 to src
    "move %[count], $t8 \n"

    // Here we have src and dest word-aligned but less than 64-bytes to go

    "chk8w: \n"
    "pref 0, 0x0(%[src]) \n"
    "andi $t8, %[count], 0x1f \n"  // 32-byte chunk?
    // t8 is the remainder count past 32-bytes
    "beq %[count], $t8, chk1w \n"
    // count=t8, no 32-byte chunk
    " nop \n"

    "lw $t0, 0(%[src]) \n"
    "lw $t1, 4(%[src]) \n"
    "lw $t2, 8(%[src]) \n"
    "lw $t3, 12(%[src]) \n"
    "lw $t4, 16(%[src]) \n"
    "lw $t5, 20(%[src]) \n"
    "lw $t6, 24(%[src]) \n"
    "lw $t7, 28(%[src]) \n"
    "addiu %[src], %[src], 32 \n"

    "sw $t0, 0(%[dst]) \n"
    "sw $t1, 4(%[dst]) \n"
    "sw $t2, 8(%[dst]) \n"
    "sw $t3, 12(%[dst]) \n"
    "sw $t4, 16(%[dst]) \n"
    "sw $t5, 20(%[dst]) \n"
    "sw $t6, 24(%[dst]) \n"
    "sw $t7, 28(%[dst]) \n"
    "addiu %[dst], %[dst], 32 \n"

    "chk1w: \n"
    "andi %[count], $t8, 0x3 \n"
    // now count is the remainder past 1w chunks
    "beq %[count], $t8, $last8 \n"
    " subu $a3, $t8, %[count] \n"
    // a3 is count of bytes in 1w chunks
    "addu $a3, %[dst], $a3 \n"
    // now a3 is the dst address past the 1w chunks
    // copying in words (4-byte chunks)
    "$wordCopy_loop: \n"
    "lw $t3, 0(%[src]) \n"
    // the first t3 may be equal t0 ... optimize?
    "addiu %[src], %[src],4 \n"
    "addiu %[dst], %[dst],4 \n"
    "bne %[dst], $a3,$wordCopy_loop \n"
    " sw $t3, -4(%[dst]) \n"

    // For the last (<8) bytes
    "$last8: \n"
    "blez %[count], leave \n"
    " addu $a3, %[dst], %[count] \n"  // a3 = last dst address
    "$last8loop: \n"
    "lb $v1, 0(%[src]) \n"
    "addiu %[src], %[src], 1 \n"
    "addiu %[dst], %[dst], 1 \n"
    "bne %[dst], $a3, $last8loop \n"
    " sb $v1, -1(%[dst]) \n"

    "leave: \n"
    " j $ra \n"
    " nop \n"

    //
    // UNALIGNED case
    //

    "unaligned: \n"
    // got here with a3 = "negu dst" (set in the branch delay slot above)
    "andi $a3, $a3, 0x3 \n"  // dst is word aligned?
    "beqz $a3, $ua_chk16w \n"
    " subu %[count], %[count], $a3 \n"
    // bytes left after initial a3 bytes
    "lwr $v1, 0(%[src]) \n"
    "lwl $v1, 3(%[src]) \n"
    "addu %[src], %[src], $a3 \n"  // a3 may be 1, 2 or 3
    "swr $v1, 0(%[dst]) \n"
    "addu %[dst], %[dst], $a3 \n"
    // below the dst will be word aligned (NOTE1)
    "$ua_chk16w: \n"
    "andi $t8, %[count], 0x3f \n"  // whole 64-B chunks?
    // t8 is the byte count after 64-byte chunks
    "beq %[count], $t8, ua_chk8w \n"
    // if count==t8, no 64-byte chunks
    // There will be at most 1 32-byte chunk after it
    "subu $a3, %[count], $t8 \n"  // (delay slot) the remainder
    // Here a3 counts bytes in 16w chunks
    "addu $a3, %[dst], $a3 \n"
    // Now a3 is the final dst after 64-byte chunks
    "addu $t0, %[dst], %[count] \n"  // t0 "past the end"
    "subu $t9, $t0, 160 \n"
    // t9 is the "last safe pref 30,128(dst)" address
    "pref 0, 0(%[src]) \n"  // first line of src
    "pref 0, 32(%[src]) \n"  // second line addr 32
    "pref 0, 64(%[src]) \n"
    "pref 30, 32(%[dst]) \n"
    // safe, as we have at least 64 bytes ahead
    // In case dst > t9 don't use "pref 30" at all
    "sgtu $v1, %[dst], $t9 \n"
    "bgtz $v1, $ua_loop16w \n"
    // skip "pref 30,64(dst)" for too short arrays
    " nop \n"
    // otherwise, start with using pref30
    "pref 30, 64(%[dst]) \n"
    "$ua_loop16w: \n"
    "pref 0, 96(%[src]) \n"
    "lwr $t0, 0(%[src]) \n"
    "lwl $t0, 3(%[src]) \n"
    "lwr $t1, 4(%[src]) \n"
    "bgtz $v1, $ua_skip_pref30_96 \n"
    " lwl $t1, 7(%[src]) \n"
    "pref 30, 96(%[dst]) \n"
    // continue setting up the dest, addr 96
    "$ua_skip_pref30_96: \n"
    "lwr $t2, 8(%[src]) \n"
    "lwl $t2, 11(%[src]) \n"
    "lwr $t3, 12(%[src]) \n"
    "lwl $t3, 15(%[src]) \n"
    "lwr $t4, 16(%[src]) \n"
    "lwl $t4, 19(%[src]) \n"
    "lwr $t5, 20(%[src]) \n"
    "lwl $t5, 23(%[src]) \n"
    "lwr $t6, 24(%[src]) \n"
    "lwl $t6, 27(%[src]) \n"
    "lwr $t7, 28(%[src]) \n"
    "lwl $t7, 31(%[src]) \n"
    "pref 0, 128(%[src]) \n"
    // bring the next lines of src, addr 128
    "sw $t0, 0(%[dst]) \n"
    "sw $t1, 4(%[dst]) \n"
    "sw $t2, 8(%[dst]) \n"
    "sw $t3, 12(%[dst]) \n"
    "sw $t4, 16(%[dst]) \n"
    "sw $t5, 20(%[dst]) \n"
    "sw $t6, 24(%[dst]) \n"
    "sw $t7, 28(%[dst]) \n"
    "lwr $t0, 32(%[src]) \n"
    "lwl $t0, 35(%[src]) \n"
    "lwr $t1, 36(%[src]) \n"
    "bgtz $v1, ua_skip_pref30_128 \n"
    " lwl $t1, 39(%[src]) \n"
    "pref 30, 128(%[dst]) \n"
    // continue setting up the dest, addr 128
    "ua_skip_pref30_128: \n"

    "lwr $t2, 40(%[src]) \n"
    "lwl $t2, 43(%[src]) \n"
    "lwr $t3, 44(%[src]) \n"
    "lwl $t3, 47(%[src]) \n"
    "lwr $t4, 48(%[src]) \n"
    "lwl $t4, 51(%[src]) \n"
    "lwr $t5, 52(%[src]) \n"
    "lwl $t5, 55(%[src]) \n"
    "lwr $t6, 56(%[src]) \n"
    "lwl $t6, 59(%[src]) \n"
    "lwr $t7, 60(%[src]) \n"
    "lwl $t7, 63(%[src]) \n"
    "pref 0, 160(%[src]) \n"
    // bring the next lines of src, addr 160
    "sw $t0, 32(%[dst]) \n"
    "sw $t1, 36(%[dst]) \n"
    "sw $t2, 40(%[dst]) \n"
    "sw $t3, 44(%[dst]) \n"
    "sw $t4, 48(%[dst]) \n"
    "sw $t5, 52(%[dst]) \n"
    "sw $t6, 56(%[dst]) \n"
    "sw $t7, 60(%[dst]) \n"

    "addiu %[dst],%[dst],64 \n"  // adding 64 to dest
    "sgtu $v1,%[dst],$t9 \n"
    "bne %[dst],$a3,$ua_loop16w \n"
    " addiu %[src],%[src],64 \n"  // adding 64 to src
    "move %[count],$t8 \n"

    // Here dst is word-aligned but there are less than 64 bytes to go

    "ua_chk8w: \n"
    "pref 0, 0x0(%[src]) \n"
    "andi $t8, %[count], 0x1f \n"  // 32-byte chunk?
    // t8 is the remainder count
    "beq %[count], $t8, $ua_chk1w \n"
    // when count==t8, no 32-byte chunk

    "lwr $t0, 0(%[src]) \n"  // (delay slot of the beq above)
    "lwl $t0, 3(%[src]) \n"
    "lwr $t1, 4(%[src]) \n"
    "lwl $t1, 7(%[src]) \n"
    "lwr $t2, 8(%[src]) \n"
    "lwl $t2, 11(%[src]) \n"
    "lwr $t3, 12(%[src]) \n"
    "lwl $t3, 15(%[src]) \n"
    "lwr $t4, 16(%[src]) \n"
    "lwl $t4, 19(%[src]) \n"
    "lwr $t5, 20(%[src]) \n"
    "lwl $t5, 23(%[src]) \n"
    "lwr $t6, 24(%[src]) \n"
    "lwl $t6, 27(%[src]) \n"
    "lwr $t7, 28(%[src]) \n"
    "lwl $t7, 31(%[src]) \n"
    "addiu %[src], %[src], 32 \n"

    "sw $t0, 0(%[dst]) \n"
    "sw $t1, 4(%[dst]) \n"
    "sw $t2, 8(%[dst]) \n"
    "sw $t3, 12(%[dst]) \n"
    "sw $t4, 16(%[dst]) \n"
    "sw $t5, 20(%[dst]) \n"
    "sw $t6, 24(%[dst]) \n"
    "sw $t7, 28(%[dst]) \n"
    "addiu %[dst], %[dst], 32 \n"

    "$ua_chk1w: \n"
    "andi %[count], $t8, 0x3 \n"
    // now count is the remainder past 1w chunks
    "beq %[count], $t8, ua_smallCopy \n"
    "subu $a3, $t8, %[count] \n"  // (delay slot)
    // a3 is count of bytes in 1w chunks
    "addu $a3, %[dst], $a3 \n"
    // now a3 is the dst address past the 1w chunks

    // copying in words (4-byte chunks)
    "$ua_wordCopy_loop: \n"
    "lwr $v1, 0(%[src]) \n"
    "lwl $v1, 3(%[src]) \n"
    "addiu %[src], %[src], 4 \n"
    "addiu %[dst], %[dst], 4 \n"
    // note: dst is word aligned here, see NOTE1
    "bne %[dst], $a3, $ua_wordCopy_loop \n"
    " sw $v1,-4(%[dst]) \n"

    // Now less than 4 bytes (value in count) left to copy
    "ua_smallCopy: \n"
    "beqz %[count], leave \n"
    " addu $a3, %[dst], %[count] \n"  // a3 = last dst address
    "$ua_smallCopy_loop: \n"
    "lb $v1, 0(%[src]) \n"
    "addiu %[src], %[src], 1 \n"
    "addiu %[dst], %[dst], 1 \n"
    "bne %[dst],$a3,$ua_smallCopy_loop \n"
    " sb $v1, -1(%[dst]) \n"

    "j $ra \n"
    " nop \n"

    ".set at \n"
    ".set reorder \n"
    // count is rewritten by the asm (subu/move/andi above), so it must be a
    // read-write operand rather than an input-only one.
    : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
    :
    // "memory": the asm reads and writes the buffers behind src/dst.
    : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
      "t8", "t9", "a3", "v1", "at", "memory"
  );
}
#endif  // HAS_COPYROW_MIPS

377

378

// DSPR2 functions

379

#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \

380

(__mips_dsp_rev >= 2) && \

381

(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)

382

383

// De-interleaves |width| UV pairs from |src_uv| into the separate planes
// |dst_u| (even bytes) and |dst_v| (odd bytes).
//
// 16 pairs (32 source bytes) are handled per iteration of the main loop with
// word loads/stores; the remaining (width & 15) pairs are copied one pair at a
// time. A width of 0 writes nothing.
//
// NOTE(review): the main loop uses lw/sw, so it presumably relies on callers
// passing 4-byte-aligned pointers - confirm against the dispatch code.
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "srl $t4, %[width], 4 \n"  // multiplies of 16
    "blez $t4, 2f \n"
    " andi %[width], %[width], 0xf \n"  // residual

    "1: \n"
    "addiu $t4, $t4, -1 \n"
    "lw $t0, 0(%[src_uv]) \n"  // V1 | U1 | V0 | U0
    "lw $t1, 4(%[src_uv]) \n"  // V3 | U3 | V2 | U2
    "lw $t2, 8(%[src_uv]) \n"  // V5 | U5 | V4 | U4
    "lw $t3, 12(%[src_uv]) \n"  // V7 | U7 | V6 | U6
    "lw $t5, 16(%[src_uv]) \n"  // V9 | U9 | V8 | U8
    "lw $t6, 20(%[src_uv]) \n"  // V11 | U11 | V10 | U10
    "lw $t7, 24(%[src_uv]) \n"  // V13 | U13 | V12 | U12
    "lw $t8, 28(%[src_uv]) \n"  // V15 | U15 | V14 | U14
    "addiu %[src_uv], %[src_uv], 32 \n"
    "precrq.qb.ph $t9, $t1, $t0 \n"  // V3 | V2 | V1 | V0
    "precr.qb.ph $t0, $t1, $t0 \n"  // U3 | U2 | U1 | U0
    "precrq.qb.ph $t1, $t3, $t2 \n"  // V7 | V6 | V5 | V4
    "precr.qb.ph $t2, $t3, $t2 \n"  // U7 | U6 | U5 | U4
    "precrq.qb.ph $t3, $t6, $t5 \n"  // V11 | V10 | V9 | V8
    "precr.qb.ph $t5, $t6, $t5 \n"  // U11 | U10 | U9 | U8
    "precrq.qb.ph $t6, $t8, $t7 \n"  // V15 | V14 | V13 | V12
    "precr.qb.ph $t7, $t8, $t7 \n"  // U15 | U14 | U13 | U12
    "sw $t9, 0(%[dst_v]) \n"
    "sw $t0, 0(%[dst_u]) \n"
    "sw $t1, 4(%[dst_v]) \n"
    "sw $t2, 4(%[dst_u]) \n"
    "sw $t3, 8(%[dst_v]) \n"
    "sw $t5, 8(%[dst_u]) \n"
    "sw $t6, 12(%[dst_v]) \n"
    "sw $t7, 12(%[dst_u]) \n"
    "addiu %[dst_v], %[dst_v], 16 \n"
    "bgtz $t4, 1b \n"
    " addiu %[dst_u], %[dst_u], 16 \n"

    // Residual check. Short rows (width < 16) enter here as well, so that a
    // zero width skips the do-while byte loop instead of copying one pair.
    "2: \n"
    "beqz %[width], 3f \n"
    " nop \n"

    "4: \n"
    "lbu $t0, 0(%[src_uv]) \n"
    "lbu $t1, 1(%[src_uv]) \n"
    "addiu %[src_uv], %[src_uv], 2 \n"
    "addiu %[width], %[width], -1 \n"
    "sb $t0, 0(%[dst_u]) \n"
    "sb $t1, 0(%[dst_v]) \n"
    "addiu %[dst_u], %[dst_u], 1 \n"
    "bgtz %[width], 4b \n"
    " addiu %[dst_v], %[dst_v], 1 \n"

    "3: \n"
    ".set pop \n"
    : [src_uv] "+r" (src_uv),
      [width] "+r" (width),
      [dst_u] "+r" (dst_u),
      [dst_v] "+r" (dst_v)
    :
    // "memory": the asm reads src_uv and writes dst_u/dst_v.
    : "t0", "t1", "t2", "t3",
      "t4", "t5", "t6", "t7", "t8", "t9", "memory"
  );
}

// Writes the |width| bytes of |src| to |dst| in reverse order.
//
// The source is walked backwards from src + width. 16 bytes per iteration are
// byte-reversed with wsbh + rotr and stored as words; the remaining
// (width & 15) bytes are copied one at a time. A width of 0 writes nothing.
//
// NOTE(review): the main loop uses lw on (src + width - 16) and sw on dst, so
// it presumably needs those addresses to be 4-byte aligned - confirm against
// the dispatch code.
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "srl $t4, %[width], 4 \n"  // multiplies of 16
    "andi $t5, %[width], 0xf \n"  // residual
    "blez $t4, 2f \n"
    " addu %[src], %[src], %[width] \n"  // src += width

    "1: \n"
    "lw $t0, -16(%[src]) \n"  // |3|2|1|0|
    "lw $t1, -12(%[src]) \n"  // |7|6|5|4|
    "lw $t2, -8(%[src]) \n"  // |11|10|9|8|
    "lw $t3, -4(%[src]) \n"  // |15|14|13|12|
    "wsbh $t0, $t0 \n"  // |2|3|0|1|
    "wsbh $t1, $t1 \n"  // |6|7|4|5|
    "wsbh $t2, $t2 \n"  // |10|11|8|9|
    "wsbh $t3, $t3 \n"  // |14|15|12|13|
    "rotr $t0, $t0, 16 \n"  // |0|1|2|3|
    "rotr $t1, $t1, 16 \n"  // |4|5|6|7|
    "rotr $t2, $t2, 16 \n"  // |8|9|10|11|
    "rotr $t3, $t3, 16 \n"  // |12|13|14|15|
    "addiu %[src], %[src], -16 \n"
    "addiu $t4, $t4, -1 \n"
    "sw $t3, 0(%[dst]) \n"  // |15|14|13|12|
    "sw $t2, 4(%[dst]) \n"  // |11|10|9|8|
    "sw $t1, 8(%[dst]) \n"  // |7|6|5|4|
    "sw $t0, 12(%[dst]) \n"  // |3|2|1|0|
    "bgtz $t4, 1b \n"
    " addiu %[dst], %[dst], 16 \n"

    // Residual check. Short rows (width < 16) enter here as well, so that a
    // zero width skips the do-while byte loop.
    "2: \n"
    "beqz $t5, 3f \n"
    " nop \n"

    "4: \n"
    "lbu $t0, -1(%[src]) \n"
    "addiu $t5, $t5, -1 \n"
    "addiu %[src], %[src], -1 \n"
    "sb $t0, 0(%[dst]) \n"
    // t5 was already decremented: loop only while bytes remain. (bgez here
    // would copy one byte too many: read src[-1], write dst[width].)
    "bgtz $t5, 4b \n"
    " addiu %[dst], %[dst], 1 \n"

    "3: \n"
    ".set pop \n"
    : [src] "+r" (src), [dst] "+r" (dst)
    : [width] "r" (width)
    // "memory": the asm reads src and writes dst.
    : "t0", "t1", "t2", "t3", "t4", "t5", "memory"
  );
}

// Mirrors a row of |width| interleaved UV pairs and splits it: the pairs of
// |src_uv| are read from last to first, with the even bytes going to |dst_u|
// and the odd bytes to |dst_v|.
//
// 16 pairs (32 source bytes) are handled per iteration of the main loop; the
// remaining (width & 15) pairs are copied one pair at a time. The destination
// stores use swr/swl pairs. A width of 0 writes nothing.
//
// NOTE(review): the main loop reads with lw from (src_uv + 2 * width - 32),
// so that address is presumably expected to be 4-byte aligned - confirm
// against the dispatch code.
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                       int width) {
  int x;  // number of 16-pair blocks left
  int y;  // number of residual pairs left
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "addu $t4, %[width], %[width] \n"  // byte length of the source row
    "srl %[x], %[width], 4 \n"
    "andi %[y], %[width], 0xf \n"
    "blez %[x], 2f \n"
    " addu %[src_uv], %[src_uv], $t4 \n"  // src_uv = end of the row

    "1: \n"
    "lw $t0, -32(%[src_uv]) \n"  // |3|2|1|0|
    "lw $t1, -28(%[src_uv]) \n"  // |7|6|5|4|
    "lw $t2, -24(%[src_uv]) \n"  // |11|10|9|8|
    "lw $t3, -20(%[src_uv]) \n"  // |15|14|13|12|
    "lw $t4, -16(%[src_uv]) \n"  // |19|18|17|16|
    "lw $t6, -12(%[src_uv]) \n"  // |23|22|21|20|
    "lw $t7, -8(%[src_uv]) \n"  // |27|26|25|24|
    "lw $t8, -4(%[src_uv]) \n"  // |31|30|29|28|

    "rotr $t0, $t0, 16 \n"  // |1|0|3|2|
    "rotr $t1, $t1, 16 \n"  // |5|4|7|6|
    "rotr $t2, $t2, 16 \n"  // |9|8|11|10|
    "rotr $t3, $t3, 16 \n"  // |13|12|15|14|
    "rotr $t4, $t4, 16 \n"  // |17|16|19|18|
    "rotr $t6, $t6, 16 \n"  // |21|20|23|22|
    "rotr $t7, $t7, 16 \n"  // |25|24|27|26|
    "rotr $t8, $t8, 16 \n"  // |29|28|31|30|
    "precr.qb.ph $t9, $t0, $t1 \n"  // |0|2|4|6|
    "precrq.qb.ph $t5, $t0, $t1 \n"  // |1|3|5|7|
    "precr.qb.ph $t0, $t2, $t3 \n"  // |8|10|12|14|
    "precrq.qb.ph $t1, $t2, $t3 \n"  // |9|11|13|15|
    "precr.qb.ph $t2, $t4, $t6 \n"  // |16|18|20|22|
    "precrq.qb.ph $t3, $t4, $t6 \n"  // |17|19|21|23|
    "precr.qb.ph $t4, $t7, $t8 \n"  // |24|26|28|30|
    "precrq.qb.ph $t6, $t7, $t8 \n"  // |25|27|29|31|
    "addiu %[src_uv], %[src_uv], -32 \n"
    "addiu %[x], %[x], -1 \n"
    "swr $t4, 0(%[dst_u]) \n"
    "swl $t4, 3(%[dst_u]) \n"  // |30|28|26|24|
    "swr $t6, 0(%[dst_v]) \n"
    "swl $t6, 3(%[dst_v]) \n"  // |31|29|27|25|
    "swr $t2, 4(%[dst_u]) \n"
    "swl $t2, 7(%[dst_u]) \n"  // |22|20|18|16|
    "swr $t3, 4(%[dst_v]) \n"
    "swl $t3, 7(%[dst_v]) \n"  // |23|21|19|17|
    "swr $t0, 8(%[dst_u]) \n"
    "swl $t0, 11(%[dst_u]) \n"  // |14|12|10|8|
    "swr $t1, 8(%[dst_v]) \n"
    "swl $t1, 11(%[dst_v]) \n"  // |15|13|11|9|
    "swr $t9, 12(%[dst_u]) \n"
    "swl $t9, 15(%[dst_u]) \n"  // |6|4|2|0|
    "swr $t5, 12(%[dst_v]) \n"
    "swl $t5, 15(%[dst_v]) \n"  // |7|5|3|1|
    "addiu %[dst_v], %[dst_v], 16 \n"
    "bgtz %[x], 1b \n"
    " addiu %[dst_u], %[dst_u], 16 \n"

    // Residual check. Short rows (width < 16) enter here as well, so that a
    // zero width skips the do-while byte loop instead of copying one pair.
    "2: \n"
    "beqz %[y], 3f \n"
    " nop \n"

    "4: \n"
    "lbu $t0, -2(%[src_uv]) \n"
    "lbu $t1, -1(%[src_uv]) \n"
    "addiu %[src_uv], %[src_uv], -2 \n"
    "addiu %[y], %[y], -1 \n"
    "sb $t0, 0(%[dst_u]) \n"
    "sb $t1, 0(%[dst_v]) \n"
    "addiu %[dst_u], %[dst_u], 1 \n"
    "bgtz %[y], 4b \n"
    " addiu %[dst_v], %[dst_v], 1 \n"

    "3: \n"
    ".set pop \n"
    : [src_uv] "+r" (src_uv),
      [dst_u] "+r" (dst_u),
      [dst_v] "+r" (dst_v),
      [x] "=&r" (x),
      [y] "=&r" (y)
    : [width] "r" (width)
    // t6 is written in the main loop and therefore has to be listed.
    // "memory": the asm reads src_uv and writes dst_u/dst_v.
    : "t0", "t1", "t2", "t3", "t4",
      "t5", "t6", "t7", "t8", "t9", "memory"
  );
}

// Convert (4 Y and 2 VU) I422 and arrange RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
//
// Assembly fragment meant to be pasted into an inline-asm template. It
// expects the operands %[y_buf], %[u_buf] and %[v_buf] and the registers
// $s0..$s5 to be set up by the enclosing asm, and it advances %[u_buf] and
// %[v_buf] by 2 bytes. $t0..$t6, $t8 and $t9 are overwritten.
#define YUVTORGB                                                               \
  "lw $t0, 0(%[y_buf]) \n"                                                     \
  "lhu $t1, 0(%[u_buf]) \n"                                                    \
  "lhu $t2, 0(%[v_buf]) \n"                                                    \
  "preceu.ph.qbr $t1, $t1 \n"                                                  \
  "preceu.ph.qbr $t2, $t2 \n"                                                  \
  "preceu.ph.qbra $t3, $t0 \n"                                                 \
  "preceu.ph.qbla $t0, $t0 \n"                                                 \
  "subu.ph $t1, $t1, $s5 \n"                                                   \
  "subu.ph $t2, $t2, $s5 \n"                                                   \
  "subu.ph $t3, $t3, $s4 \n"                                                   \
  "subu.ph $t0, $t0, $s4 \n"                                                   \
  "mul.ph $t3, $t3, $s0 \n"                                                    \
  "mul.ph $t0, $t0, $s0 \n"                                                    \
  "shll.ph $t4, $t1, 0x7 \n"                                                   \
  "subu.ph $t4, $t4, $t1 \n"                                                   \
  "mul.ph $t6, $t1, $s1 \n"                                                    \
  "mul.ph $t1, $t2, $s2 \n"                                                    \
  "addq_s.ph $t5, $t4, $t3 \n"                                                 \
  "addq_s.ph $t4, $t4, $t0 \n"                                                 \
  "shra.ph $t5, $t5, 6 \n"                                                     \
  "shra.ph $t4, $t4, 6 \n"                                                     \
  "addiu %[u_buf], 2 \n"                                                       \
  "addiu %[v_buf], 2 \n"                                                       \
  "addu.ph $t6, $t6, $t1 \n"                                                   \
  "mul.ph $t1, $t2, $s3 \n"                                                    \
  "addu.ph $t9, $t6, $t3 \n"                                                   \
  "addu.ph $t8, $t6, $t0 \n"                                                   \
  "shra.ph $t9, $t9, 6 \n"                                                     \
  "shra.ph $t8, $t8, 6 \n"                                                     \
  "addu.ph $t2, $t1, $t3 \n"                                                   \
  "addu.ph $t1, $t1, $t0 \n"                                                   \
  "shra.ph $t2, $t2, 6 \n"                                                     \
  "shra.ph $t1, $t1, 6 \n"                                                     \
  "subu.ph $t5, $t5, $s5 \n"                                                   \
  "subu.ph $t4, $t4, $s5 \n"                                                   \
  "subu.ph $t9, $t9, $s5 \n"                                                   \
  "subu.ph $t8, $t8, $s5 \n"                                                   \
  "subu.ph $t2, $t2, $s5 \n"                                                   \
  "subu.ph $t1, $t1, $s5 \n"                                                   \
  "shll_s.ph $t5, $t5, 8 \n"                                                   \
  "shll_s.ph $t4, $t4, 8 \n"                                                   \
  "shll_s.ph $t9, $t9, 8 \n"                                                   \
  "shll_s.ph $t8, $t8, 8 \n"                                                   \
  "shll_s.ph $t2, $t2, 8 \n"                                                   \
  "shll_s.ph $t1, $t1, 8 \n"                                                   \
  "shra.ph $t5, $t5, 8 \n"                                                     \
  "shra.ph $t4, $t4, 8 \n"                                                     \
  "shra.ph $t9, $t9, 8 \n"                                                     \
  "shra.ph $t8, $t8, 8 \n"                                                     \
  "shra.ph $t2, $t2, 8 \n"                                                     \
  "shra.ph $t1, $t1, 8 \n"                                                     \
  "addu.ph $t5, $t5, $s5 \n"                                                   \
  "addu.ph $t4, $t4, $s5 \n"                                                   \
  "addu.ph $t9, $t9, $s5 \n"                                                   \
  "addu.ph $t8, $t8, $s5 \n"                                                   \
  "addu.ph $t2, $t2, $s5 \n"                                                   \
  "addu.ph $t1, $t1, $s5 \n"

654

655

// TODO(fbarchard): accept yuv conversion constants.

656

void I422ToARGBRow_DSPR2(const uint8* y_buf,

const uint8* u_buf,

const uint8* v_buf,

uint8* rgb_buf,

const struct YuvConstants* yuvconstants,

661

int width) {

662

__asm__ __volatile__ (

663

".set push \n"

664

".set noreorder \n"

665

"beqz %[width], 2f \n"

666

" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|

667

"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|

668

"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|

669

"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|

670

"repl.ph $s4, 16 \n" // |0|16|0|16|

671

"repl.ph $s5, 128 \n" // |128|128| // clipping

672

"lui $s6, 0xff00 \n"

673

"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|

"1: \n"

YUVTORGB

// Arranging into argb format

678

"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|

679

"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|

680

"addiu %[width], -4 \n"

681

"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|

682

"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|

683

"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|

684

685

"addiu %[y_buf], 4 \n"

686

"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|

687

"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|

688

"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|

689

"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|

690

"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|

691

"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|

692

"sll $t9, $t9, 16 \n"

693

"sll $t8, $t8, 16 \n"

694

"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|

695

"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|

696

// Store results.

697

"sw $t2, 0(%[rgb_buf]) \n"

698

"sw $t0, 4(%[rgb_buf]) \n"

699

"sw $t1, 8(%[rgb_buf]) \n"

700

"sw $t3, 12(%[rgb_buf]) \n"

701

"bnez %[width], 1b \n"

702

" addiu %[rgb_buf], 16 \n"

703

"2: \n"

704

".set pop \n"

705

:[y_buf] "+r" (y_buf),

706

[u_buf] "+r" (u_buf),

707

[v_buf] "+r" (v_buf),

708

[width] "+r" (width),

709

[rgb_buf] "+r" (rgb_buf)

710

:

711

: "t0", "t1", "t2", "t3", "t4", "t5",

712

"t6", "t7", "t8", "t9",

713

"s0", "s1", "s2", "s3",

"s4", "s5", "s6"

);

}

// Bilinear filter 8x2 -> 8x1

719

void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

720

ptrdiff_t src_stride, int dst_width,

721

int source_y_fraction) {

722

int y0_fraction = 256 - source_y_fraction;

723

const uint8* src_ptr1 = src_ptr + src_stride;

724

725

__asm__ __volatile__ (

".set push \n"

".set noreorder \n"

"replv.ph $t0, %[y0_fraction] \n"

730

"replv.ph $t1, %[source_y_fraction] \n"

731

732

"1: \n"

733

"lw $t2, 0(%[src_ptr]) \n"

734

"lw $t3, 0(%[src_ptr1]) \n"

735

"lw $t4, 4(%[src_ptr]) \n"

736

"lw $t5, 4(%[src_ptr1]) \n"

737

"muleu_s.ph.qbl $t6, $t2, $t0 \n"

738

"muleu_s.ph.qbr $t7, $t2, $t0 \n"

739

"muleu_s.ph.qbl $t8, $t3, $t1 \n"

740

"muleu_s.ph.qbr $t9, $t3, $t1 \n"

741

"muleu_s.ph.qbl $t2, $t4, $t0 \n"

742

"muleu_s.ph.qbr $t3, $t4, $t0 \n"

743

"muleu_s.ph.qbl $t4, $t5, $t1 \n"

744

"muleu_s.ph.qbr $t5, $t5, $t1 \n"

745

"addq.ph $t6, $t6, $t8 \n"

746

"addq.ph $t7, $t7, $t9 \n"

747

"addq.ph $t2, $t2, $t4 \n"

748

"addq.ph $t3, $t3, $t5 \n"

749

"shra.ph $t6, $t6, 8 \n"

750

"shra.ph $t7, $t7, 8 \n"

751

"shra.ph $t2, $t2, 8 \n"

752

"shra.ph $t3, $t3, 8 \n"

753

"precr.qb.ph $t6, $t6, $t7 \n"

754

"precr.qb.ph $t2, $t2, $t3 \n"

755

"addiu %[src_ptr], %[src_ptr], 8 \n"

756

"addiu %[src_ptr1], %[src_ptr1], 8 \n"

757

"addiu %[dst_width], %[dst_width], -8 \n"

758

"sw $t6, 0(%[dst_ptr]) \n"

759

"sw $t2, 4(%[dst_ptr]) \n"

760

"bgtz %[dst_width], 1b \n"

761

" addiu %[dst_ptr], %[dst_ptr], 8 \n"

762

763

".set pop \n"

764

: [dst_ptr] "+r" (dst_ptr),

765

[src_ptr1] "+r" (src_ptr1),

766

[src_ptr] "+r" (src_ptr),

767

[dst_width] "+r" (dst_width)

768

: [source_y_fraction] "r" (source_y_fraction),

769

[y0_fraction] "r" (y0_fraction),

770

[src_stride] "r" (src_stride)

771

: "t0", "t1", "t2", "t3", "t4", "t5",

772

"t6", "t7", "t8", "t9"

773

);

774

}

775

#endif // __mips_dsp_rev >= 2

776

777

#endif // defined(__mips__)

#ifdef __cplusplus

} // extern "C"

} // namespace libyuv

782

#endif