/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register. This macro will be called from within several different wrapper
 * variants for different data layouts. Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively. U and V are in
 * v10 and v11. Working constants are pre-loaded into v24-v31, and v3 and v7
 * are pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
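
/* For orientation: the constants used below appear to correspond to the
 * usual fixed-point YUV-to-RGB equations with every coefficient halved
 * (298->149, 409->204.5, 100->50, 208->104, 516->258), i.e. roughly:
 *
 *     R = satu8((298 * (Y - 16)                   + 409 * (V - 128) + 128) >> 8)
 *     G = satu8((298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128) >> 8)
 *     B = satu8((298 * (Y - 16) + 516 * (U - 128)                   + 128) >> 8)
 *
 * Halving keeps each u8 * coefficient product within 16 bits; the halving
 * adds (uhadd), the (v >> 1) term standing in for the .5 of 204.5, and the
 * (u << 2) term completing 258 = 254 + 4 recover the lost bits, and the
 * final narrowing shifts are one bit smaller to compensate. (This is a
 * sketch inferred from the constants, not text from the original source.)
 */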
.macro yuvkern, regu=v10, regv=v11
        /* v0  out R_lo / even R_lo accumulator
         * v1  out G_lo / even G_lo accumulator
         * v2  out B_lo / even B_lo accumulator
         * v3  out A_lo / const 0xff alpha
         * v4  out R_hi / even R_hi accumulator
         * v5  out G_hi / even G_hi accumulator
         * v6  out B_hi / even B_hi accumulator
         * v7  out A_hi / const 0xff alpha
         * v8  even Y / G_lo chroma tmp
         * v9  odd Y  / G_hi chroma tmp
         * \regu in U
         * \regv in V
         * v12 R_lo chroma tmp
         * v13 B_lo chroma tmp
         * v14 R_hi chroma tmp
         * v15 B_hi chroma tmp
         * v16 odd R_lo accumulator
         * v17 odd G_lo accumulator
         * v18 odd B_lo accumulator
         * v19 multiplier extra bits low
         * v20 odd R_hi accumulator
         * v21 odd G_hi accumulator
         * v22 odd B_hi accumulator
         * v23 multiplier extra bits high
         * v24 constant 149
         * v25 constant 50
         * v26 constant 104
         * v27 constant 204
         * v28 constant 254
         * v29 constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30 constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31 constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */
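        /* Evaluated, the three bias constants come to v29 = 28560 >> 1 =
         * 14280, v30 = 17328, and v31 = 35408 >> 1 = 17704 (plain arithmetic
         * on the expressions above, shown for convenience). */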

        umull v1.8h, v8.8b, v24.8b          // g0 = y0 * 149
        umull v17.8h, v9.8b, v24.8b         // g1 = y1 * 149
        umull2 v5.8h, v8.16b, v24.16b       // g0_hi = y0_hi * 149
        umull2 v21.8h, v9.16b, v24.16b      // g1_hi = y1_hi * 149

        umull v8.8h, \regu\().8b, v25.8b    // g2 = u * 50 + v * 104
        umlal v8.8h, \regv\().8b, v26.8b
        umull2 v9.8h, \regu\().16b, v25.16b // g2_hi = u_hi * 50 + v_hi * 104
        umlal2 v9.8h, \regv\().16b, v26.16b

        ushr v19.16b, \regv\().16b, #1
        uaddw v0.8h, v1.8h, v19.8b          // r0 = g0 + (v >> 1)
        uaddw v16.8h, v17.8h, v19.8b        // r1 = g1 + (v >> 1)

        uaddw2 v4.8h, v5.8h, v19.16b        // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2 v20.8h, v21.8h, v19.16b      // r1_hi = g1_hi + (v_hi >> 1)

        ushll v19.8h, \regu\().8b, #2
        ushll2 v23.8h, \regu\().16b, #2
        add v2.8h, v1.8h, v19.8h            // b0 = g0 + (u << 2)
        add v18.8h, v17.8h, v19.8h          // b1 = g1 + (u << 2)

        add v6.8h, v5.8h, v23.8h            // b0_hi = g0_hi + (u_hi << 2)
        add v22.8h, v21.8h, v23.8h          // b1_hi = g1_hi + (u_hi << 2)

        umull v12.8h, \regv\().8b, v27.8b   // r2 = v * 204
        umull v13.8h, \regu\().8b, v28.8b   // b2 = u * 254

        umull2 v14.8h, \regv\().16b, v27.16b // r2_hi = v_hi * 204
        umull2 v15.8h, \regu\().16b, v28.16b // b2_hi = u_hi * 254

        uhadd v0.8h, v0.8h, v12.8h          // r0 = (r0 + r2) >> 1
        uhadd v16.8h, v16.8h, v12.8h        // r1 = (r1 + r2) >> 1
        uqadd v1.8h, v1.8h, v30.8h          // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd v17.8h, v17.8h, v30.8h        // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd v2.8h, v2.8h, v13.8h          // b0 = (b0 + b2) >> 1
        uhadd v18.8h, v18.8h, v13.8h        // b1 = (b1 + b2) >> 1

        uhadd v4.8h, v4.8h, v14.8h          // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd v20.8h, v20.8h, v14.8h        // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd v5.8h, v5.8h, v30.8h          // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd v21.8h, v21.8h, v30.8h        // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd v6.8h, v6.8h, v15.8h          // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd v22.8h, v22.8h, v15.8h        // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub v0.8h, v0.8h, v29.8h          // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v16.8h, v16.8h, v29.8h        // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v1.8h, v1.8h, v8.8h           // g0 = satu16(g0 - g2)
        uqsub v17.8h, v17.8h, v8.8h         // g1 = satu16(g1 - g2)
        uqsub v2.8h, v2.8h, v31.8h          // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub v18.8h, v18.8h, v31.8h        // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub v4.8h, v4.8h, v29.8h          // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v20.8h, v20.8h, v29.8h        // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v5.8h, v5.8h, v9.8h           // g0_hi = satu16(g0_hi - g2_hi)
        uqsub v21.8h, v21.8h, v9.8h         // g1_hi = satu16(g1_hi - g2_hi)
        uqsub v6.8h, v6.8h, v31.8h          // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub v22.8h, v22.8h, v31.8h        // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        /* Narrow to 8 bits with rounding; G shifts by 7 while R and B shift
         * by 6 because they were already halved once by uhadd above. */
        uqrshrn v0.8b, v0.8h, #6
        uqrshrn v16.8b, v16.8h, #6
        uqrshrn v1.8b, v1.8h, #7
        uqrshrn v17.8b, v17.8h, #7
        uqrshrn v2.8b, v2.8h, #6
        uqrshrn v18.8b, v18.8h, #6

        uqrshrn v4.8b, v4.8h, #6
        uqrshrn v20.8b, v20.8h, #6
        uqrshrn v5.8b, v5.8h, #7
        uqrshrn v21.8b, v21.8h, #7
        uqrshrn v6.8b, v6.8h, #6
        uqrshrn v22.8b, v22.8h, #6

        /* Re-interleave the even and odd pixels. */
        zip1 v0.16b, v0.16b, v16.16b
        zip1 v1.16b, v1.16b, v17.16b
        zip1 v2.16b, v2.16b, v18.16b

        zip1 v4.16b, v4.16b, v20.16b
        zip1 v5.16b, v5.16b, v21.16b
        zip1 v6.16b, v6.16b, v22.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop. Some sections of code are switched out depending on the data packing
 * being handled.
 */
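/* Roughly, the control flow below corresponds to this C-like sketch (names
 * are descriptive only; x2 holds the remaining pixel count):
 *
 *     n -= 32;
 *     while (n did not borrow) {       // main loop, 32 pixels per pass
 *         load 32 Y bytes and 32 chroma bytes; convert; store 128 RGBA bytes;
 *         n -= 32;
 *     }
 *     n += 32;
 *     if (n != 0)                      // 1..31 pixels left over
 *         load, convert and store the tail in power-of-two chunks;
 */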
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi v24.16b, #149
        movi v25.16b, #50
        movi v26.16b, #104
        movi v27.16b, #204
        movi v28.16b, #254
        mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup v29.8h, w5
        mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup v30.8h, w5
        mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup v31.8h, w5

        movi v3.16b, #0xff                  // constant alpha channel
        movi v7.16b, #0xff

        subs x2, x2, #32                    // enough for a full 32-pixel pass?
        bhs 1f
        b 2f

        .align 4
1:      ld2 {v8.16b,v9.16b}, [x1], #32      // 32 luma bytes, even/odd split
        .if \interleaved
        ld2 {v10.16b,v11.16b}, [x3], #32    // 16 interleaved chroma pairs
        .else
        ld1 {v10.16b}, [x3], #16
        ld1 {v11.16b}, [x4], #16
        .endif

        .if \swapuv
        \kernel regu=v11, regv=v10
        .else
        \kernel
        .endif

        subs x2, x2, #32

        st4 {v0.16b - v3.16b}, [x0], #64    // 32 interleaved RGBA pixels
        st4 {v4.16b - v7.16b}, [x0], #64

        bhs 1b
193
Simon Hosiee8814f72014-06-19 13:18:05 -07001942: adds x2, x2, #32
Simon Hosieccd7a462014-02-01 01:35:11 -0800195 beq 2f
196
Simon Hosiee8814f72014-06-19 13:18:05 -0700197 /* To handle the tail portion of the data (something less than 32
Simon Hosieccd7a462014-02-01 01:35:11 -0800198 * bytes) load small power-of-two chunks into working registers. It
199 * doesn't matter where they end up in the register; the same process
200 * will store them back out using the same positions and the
201 * interaction between neighbouring pixels is constrained to odd
202 * boundaries where the load operations don't interfere.
203 */
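        /* For example, a remainder of 22 pixels (0b10110) takes the 16-, 4-
         * and 2-pixel branches below and skips the others. */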
        movi v8.8b, #0
        movi v9.8b, #0
        movi v10.8b, #0
        movi v11.8b, #0

        tbz x2, #4, 1f
        ld1 {v9.16b}, [x1], #16
        .if \interleaved
        ld1 {v11.16b}, [x3], #16
        .else
        ld1 {v10.d}[1], [x3], #8
        ld1 {v11.d}[1], [x4], #8
        .endif
1:      tbz x2, #3, 1f
        ld1 {v8.d}[1], [x1], #8
        .if \interleaved
        ld1 {v10.d}[1], [x3], #8
        .else
        ld1 {v10.s}[1], [x3], #4
        ld1 {v11.s}[1], [x4], #4
        .endif
1:      tbz x2, #2, 1f
        ld1 {v8.s}[1], [x1], #4
        .if \interleaved
        ld1 {v10.s}[1], [x3], #4
        .else
        ld1 {v10.h}[1], [x3], #2
        ld1 {v11.h}[1], [x4], #2
        .endif
1:      tbz x2, #1, 1f
        ld1 {v8.h}[1], [x1], #2
        .if \interleaved
        ld1 {v10.h}[1], [x3], #2
        .else
        ld1 {v10.b}[1], [x3], #1
        ld1 {v11.b}[1], [x4], #1
        .endif
1:      tbz x2, #0, 1f
        ld1 {v8.b}[1], [x1], #1
        .if \interleaved
        ld1 {v10.h}[0], [x3], #2
        .else
        ld1 {v10.b}[0], [x3], #1
        ld1 {v11.b}[0], [x4], #1
        .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register. So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
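        /* uzp1/uzp2 below pick the even and odd bytes respectively of the
         * concatenated register pair, e.g. [y0 y1 y2 y3 ...] becomes
         * [y0 y2 ...] and [y1 y3 ...]. */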
1:      mov v12.16b, v8.16b
        uzp1 v8.16b, v12.16b, v9.16b
        uzp2 v9.16b, v12.16b, v9.16b
        .if \interleaved
        mov v12.16b, v10.16b
        uzp1 v10.16b, v12.16b, v11.16b
        uzp2 v11.16b, v12.16b, v11.16b
        .endif

        .if \swapuv
        \kernel regu=v11, regv=v10
        .else
        \kernel
        .endif

        /* As above, but with the output; structured stores for partial
         * vectors aren't available, so the data is re-packed first and stored
         * linearly.
         */
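        /* The zip1/zip2 sequence below interleaves the planar R, G, B and A
         * vectors of the low half into linear RGBA byte order, matching what
         * st4 would have produced. */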
        zip1 v16.16b, v0.16b, v2.16b
        zip2 v18.16b, v0.16b, v2.16b
        zip1 v17.16b, v1.16b, v3.16b
        zip2 v19.16b, v1.16b, v3.16b
        zip1 v0.16b, v16.16b, v17.16b
        zip2 v1.16b, v16.16b, v17.16b
        zip1 v2.16b, v18.16b, v19.16b
        zip2 v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be unzipped because the complete set of
         * four registers can be stored using st4. */

        tbz x2, #4, 1f
        st4 {v4.16b - v7.16b}, [x0], #64
1:      tbz x2, #3, 1f
        st1 {v2.16b,v3.16b}, [x0], #32
1:      tbz x2, #2, 1f
        st1 {v1.16b}, [x0], #16
1:      tbz x2, #1, 1f
        st1 {v0.d}[1], [x0], #8
1:      tbz x2, #0, 2f
        st1 {v0.s}[1], [x0], #4
2:
.endm


/* void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
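/* Planar-chroma variant. The chroma planes are subsampled by two, so the
 * prologue advances uin and vin by xstart/2 while yin advances by xstart and
 * out by 4*xstart; x3 and x4 end up as the U and V pointers that wrap_line
 * expects, and x2 becomes the pixel count (xend minus xstart rounded down to
 * even).
 */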
ENTRY(rsdIntrinsicYuv2_K)
        lsr x6, x4, #1
        add x0, x0, x4, LSL #2
        add x1, x1, x4
        add x4, x3, x6
        add x3, x2, x6
        sub x2, x5, x6, LSL #1

        sub x6, sp, #32
        sub sp, sp, #64
        st1 {v8.1d - v11.1d}, [sp]          // spill callee-saved d8-d15
        st1 {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1 {v8.1d - v11.1d}, [sp], #32     // restore d8-d15
        ld1 {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
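/* Interleaved-chroma variant. xstart is rounded down to even (bic) so the
 * chroma byte pairs stay aligned; wrap_line is instantiated with
 * interleaved=1, swapuv=1, which reads the chroma plane as V-then-U pairs.
 */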
ENTRY(rsdIntrinsicYuv_K)
        bic x5, x3, #1
        add x0, x0, x5, LSL #2
        add x1, x1, x5
        add x3, x2, x5
        sub x2, x4, x5

        sub x5, sp, #32
        sub sp, sp, #64
        st1 {v8.1d - v11.1d}, [sp]
        st1 {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1 {v8.1d - v11.1d}, [sp], #32
        ld1 {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
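/* As rsdIntrinsicYuv_K, but instantiated without swapuv, i.e. for the
 * opposite (U-then-V) chroma byte order.
 */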
ENTRY(rsdIntrinsicYuvR_K)
        bic x5, x3, #1
        add x0, x0, x5, LSL #2
        add x1, x1, x5
        add x3, x2, x5
        sub x2, x4, x5

        sub x5, sp, #32
        sub sp, sp, #64
        st1 {v8.1d - v11.1d}, [sp]
        st1 {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1 {v8.1d - v11.1d}, [sp], #32
        ld1 {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)