Blame - common/x86/ih264_deblk_chroma_ssse3.c - platform/external/libavc

Hamsalekha S

8d3d303

2015-03-13 21:24:58 +0530

[diff] [blame]

1

/******************************************************************************

*

*

* Licensed under the Apache License, Version 2.0 (the "License");

6

* you may not use this file except in compliance with the License.

7

* You may obtain a copy of the License at:

8

*

9

* http://www.apache.org/licenses/LICENSE-2.0

10

*

11

* Unless required by applicable law or agreed to in writing, software

12

* distributed under the License is distributed on an "AS IS" BASIS,

13

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

* See the License for the specific language governing permissions and

15

* limitations under the License.

16

*

17

*****************************************************************************

18

* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore

19

*/

20

/*****************************************************************************/

21

/* */

22

/* File Name : ih264_deblk_chroma_ssse3.c */

23

/* */

24

/* Description : Contains function definitions for deblocking */

25

/* */

26

/* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */

27

/* ih264_deblk_chroma_horz_bs4_ssse3() */

28

/* ih264_deblk_chroma_vert_bslt4_ssse3() */

29

/* ih264_deblk_chroma_horz_bslt4_ssse3() */

30

/* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */

31

/* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */

32

/* */

33

/* Issues / Problems : None */

34

/* */

35

/* Revision History : */

36

/* */

37

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

38

/* 12 02 2015 Naveen Kumar P Added chroma deblocking ssse3 */

39

/* intrinsics */

40

/* */

41

/*****************************************************************************/

42

43

/*****************************************************************************/

44

/* File Includes */

45

/*****************************************************************************/

46

47

/* System include files */

48

#include <stdio.h>

49

50

/* User include files */

51

#include "ih264_typedefs.h"

52

#include "ih264_platform_macros.h"

53

#include "ih264_deblk_edge_filters.h"

54

#include "ih264_macros.h"

55

56

/*****************************************************************************/

57

/* Function Definitions */

58

/*****************************************************************************/

59

60

/*****************************************************************************/

61

/* */

62

/* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */

63

/* */

64

/* Description : This function performs filtering of a chroma block */

65

/* vertical edge when the boundary strength is set to 4 in */

66

/* high profile. */

67

/* */

68

/* Inputs : pu1_src - pointer to the src sample q0 of U */

69

/* src_strd - source stride */

70

/* alpha_cb - alpha value for the boundary in U */

71

/* beta_cb - beta value for the boundary in U */

72

/* alpha_cr - alpha value for the boundary in V */

73

/* beta_cr - beta value for the boundary in V */

/* */

/* Globals : None */

/* */

/* Processing : This operation is described in Sec. 8.7.2.4 under the */

78

/* title "Filtering process for edges for bS equal to 4" in */

79

/* ITU T Rec H.264 with alpha and beta values different in */

/* U and V. */

/* */

/* Outputs : None */

/* */

/* Returns : None */

/* */

/* Issues : None */

/* */

/* Revision History: */

89

/* */

90

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

91

/* 12 02 2015 Naveen Kumar P Initial version */

92

/* */

93

/*****************************************************************************/

94

void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,

WORD32 src_strd,

WORD32 alpha_cb,

WORD32 beta_cb,

WORD32 alpha_cr,

WORD32 beta_cr)

{

UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/

102

WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;

103

WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

104

__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;

105

__m128i temp1, temp2, temp3, temp4;

106

107

__m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;

108

__m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;

109

__m128i flag1, flag2;

110

__m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;

111

__m128i zero = _mm_setzero_si128();

112

__m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

113

114

/* Load and transpose the pixel values */

115

linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));

116

lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));

117

linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));

118

lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

119

linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));

120

linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));

121

lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));

122

lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

123

124

temp1 = _mm_unpacklo_epi16(linea, lineb);

125

temp2 = _mm_unpacklo_epi16(linec, lined);

126

temp3 = _mm_unpacklo_epi16(linee, linef);

127

temp4 = _mm_unpacklo_epi16(lineg, lineh);

128

129

p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);

130

p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);

131

q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);

132

q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

133

134

p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);

135

p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);

136

q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);

137

q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);

138

/* End of transpose */

139

140

q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);

141

q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);

142

p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);

143

p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

144

145

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

146

diff = _mm_abs_epi16(diff);

147

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

148

flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

149

150

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

151

diff = _mm_abs_epi16(diff);

152

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

153

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

154

155

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

156

diff = _mm_abs_epi16(diff);

157

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

158

159

temp1 = _mm_slli_epi16(p1_uv_8x16, 1);

160

temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);

161

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

162

temp1 = _mm_add_epi16(temp1, temp2);

163

p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

164

165

temp1 = _mm_slli_epi16(q1_uv_8x16, 1);

166

temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);

167

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

168

temp1 = _mm_add_epi16(temp1, temp2);

169

q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

170

171

q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);

172

q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);

173

p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);

174

p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

175

176

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

177

diff = _mm_abs_epi16(diff);

178

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

179

flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

180

181

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

182

diff = _mm_abs_epi16(diff);

183

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

184

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

185

186

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

187

diff = _mm_abs_epi16(diff);

188

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

189

190

temp1 = _mm_slli_epi16(p1_uv_8x16, 1);

191

temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);

192

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

193

temp1 = _mm_add_epi16(temp1, temp2);

194

p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

195

196

temp1 = _mm_slli_epi16(q1_uv_8x16, 1);

197

temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);

198

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

199

temp1 = _mm_add_epi16(temp1, temp2);

200

q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

201

202

p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);

203

q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

204

205

flag1 = _mm_packs_epi16(flag1, flag2);

206

207

p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,

208

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

209

p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);

210

p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

211

212

q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,

213

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

214

q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);

215

q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

216

217

/* Inverse-transpose and store back */

218

temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);

219

temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);

220

temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

221

temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

222

223

linea = _mm_unpacklo_epi32(temp1, temp3);

224

lineb = _mm_srli_si128(linea, 8);

225

linec = _mm_unpackhi_epi32(temp1, temp3);

226

lined = _mm_srli_si128(linec, 8);

227

linee = _mm_unpacklo_epi32(temp2, temp4);

228

linef = _mm_srli_si128(linee, 8);

229

lineg = _mm_unpackhi_epi32(temp2, temp4);

230

lineh = _mm_srli_si128(lineg, 8);

231

232

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);

233

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);

234

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);

235

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

236

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);

237

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);

238

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);

239

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}

/*****************************************************************************/

244

/* */

245

/* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */

246

/* */

247

/* Description : This function performs filtering of a chroma block */

248

/* horizontal edge when the boundary strength is set to 4 */

249

/* in high profile. */

250

/* */

251

/* Inputs : pu1_src - pointer to the src sample q0 of U */

252

/* src_strd - source stride */

253

/* alpha_cb - alpha value for the boundary in U */

254

/* beta_cb - beta value for the boundary in U */

255

/* alpha_cr - alpha value for the boundary in V */

256

/* beta_cr - beta value for the boundary in V */

/* */

/* Globals : None */

/* */

/* Processing : This operation is described in Sec. 8.7.2.4 under the */

261

/* title "Filtering process for edges for bS equal to 4" in */

262

/* ITU T Rec H.264 with alpha and beta values different in */

/* U and V. */

/* */

/* Outputs : None */

/* */

/* Returns : None */

/* */

/* Issues : None */

/* */

/* Revision History: */

272

/* */

273

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

274

/* 12 02 2015 Naveen Kumar P Initial version */

275

/* */

276

/*****************************************************************************/

277

void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,

WORD32 src_strd,

WORD32 alpha_cb,

WORD32 beta_cb,

WORD32 alpha_cr,

WORD32 beta_cr)

{

UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/

285

WORD16 i16_posP1, i16_posP0, i16_posQ1;

286

287

UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */

288

WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;

289

WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

290

__m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;

291

__m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;

292

__m128i flag1, flag2;

293

__m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;

294

__m128i zero = _mm_setzero_si128();

295

__m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

296

__m128i temp1, temp2;

297

298

pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

299

300

i16_posQ1 = src_strd;

301

i16_posP0 = src_strd;

302

i16_posP1 = 0;

303

304

q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));

305

q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));

306

p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));

307

p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

308

309

q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);

310

q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);

311

p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);

312

p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

313

314

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

315

diff = _mm_abs_epi16(diff);

316

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

317

flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

318

319

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

320

diff = _mm_abs_epi16(diff);

321

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

322

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

323

324

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

325

diff = _mm_abs_epi16(diff);

326

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

327

328

temp1 = _mm_slli_epi16(p1_uv_8x16, 1);

329

temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);

330

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

331

temp1 = _mm_add_epi16(temp1, temp2);

332

p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

333

334

temp1 = _mm_slli_epi16(q1_uv_8x16, 1);

335

temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);

336

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

337

temp1 = _mm_add_epi16(temp1, temp2);

338

q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

339

340

q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);

341

q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);

342

p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);

343

p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

344

345

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

346

diff = _mm_abs_epi16(diff);

347

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

348

flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

349

350

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

351

diff = _mm_abs_epi16(diff);

352

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

353

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

354

355

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

356

diff = _mm_abs_epi16(diff);

357

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

358

359

temp1 = _mm_slli_epi16(p1_uv_8x16, 1);

360

temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);

361

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

362

temp1 = _mm_add_epi16(temp1, temp2);

363

p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

364

365

temp1 = _mm_slli_epi16(q1_uv_8x16, 1);

366

temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);

367

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

368

temp1 = _mm_add_epi16(temp1, temp2);

369

q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

370

371

p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);

372

q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

373

374

flag1 = _mm_packs_epi16(flag1, flag2);

375

376

p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,

377

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

378

p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);

379

p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

380

_mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

381

382

q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,

383

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

384

q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);

385

q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

386

_mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/

391

/* */

392

/* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */

393

/* */

394

/* Description : This function performs filtering of a chroma block */

395

/* vertical edge when the boundary strength is less than 4 */

396

/* in high profile. */

397

/* */

398

/* Inputs : pu1_src - pointer to the src sample q0 of U */

399

/* src_strd - source stride */

400

/* alpha_cb - alpha value for the boundary in U */

401

/* beta_cb - beta value for the boundary in U */

402

/* alpha_cr - alpha value for the boundary in V */

403

/* beta_cr - beta value for the boundary in V */

404

/* u4_bs - packed Boundary strength array */

405

/* pu1_cliptab_cb - tc0_table for U */

406

/* pu1_cliptab_cr - tc0_table for V */

/* */

/* Globals : None */

/* */

/* Processing : This operation is described in Sec. 8.7.2.3 under the */

411

/* title "Filtering process for edges for bS less than 4" */

412

/* in ITU T Rec H.264 with alpha and beta values different */

/* in U and V. */

/* */

/* Outputs : None */

/* */

/* Returns : None */

/* */

/* Issues : None */

/* */

/* Revision History: */

422

/* */

423

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

424

/* 12 02 2015 Naveen Kumar P Initial version */

425

/* */

426

/*****************************************************************************/

427

void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,

WORD32 src_strd,

WORD32 alpha_cb,

WORD32 beta_cb,

WORD32 alpha_cr,

WORD32 beta_cr,

UWORD32 u4_bs,

const UWORD8 *pu1_cliptab_cb,

435

const UWORD8 *pu1_cliptab_cr)

436

{

437

UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/

438

UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;

439

WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;

440

WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

441

__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;

442

__m128i temp1, temp2, temp3, temp4;

443

444

__m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;

445

__m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;

446

__m128i flag_bs, flag1, flag2;

447

__m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;

448

__m128i zero = _mm_setzero_si128();

449

__m128i C0_uv_8x16;

450

__m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

451

452

u1_Bs0 = (u4_bs >> 24) & 0xff;

453

u1_Bs1 = (u4_bs >> 16) & 0xff;

454

u1_Bs2 = (u4_bs >> 8) & 0xff;

455

u1_Bs3 = (u4_bs >> 0) & 0xff;

456

457

flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,

458

u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,

459

u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);

460

flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s

461

flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

462

463

/* Load and transpose the pixel values */

464

linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));

465

lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));

466

linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));

467

lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

468

linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));

469

linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));

470

lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));

471

lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

472

473

temp1 = _mm_unpacklo_epi16(linea, lineb);

474

temp2 = _mm_unpacklo_epi16(linec, lined);

475

temp3 = _mm_unpacklo_epi16(linee, linef);

476

temp4 = _mm_unpacklo_epi16(lineg, lineh);

477

478

p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);

479

p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);

480

q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);

481

q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

482

483

p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);

484

p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);

485

q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);

486

q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);

487

/* End of transpose */

488

489

q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);

490

q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);

491

p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);

492

p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

493

494

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

495

diff = _mm_abs_epi16(diff);

496

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

497

flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

498

499

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

500

diff = _mm_abs_epi16(diff);

501

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

502

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

503

504

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

505

diff = _mm_abs_epi16(diff);

506

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

507

508

diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);

509

diff = _mm_slli_epi16(diff, 2);

510

diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);

511

diff = _mm_add_epi16(diff, diff1);

512

diff = _mm_add_epi16(diff, _mm_set1_epi16(4));

513

in_macro = _mm_srai_epi16(diff, 3);

514

515

C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],

516

pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],

517

pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],

518

pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

519

520

C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

521

522

in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3

523

C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);

524

in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

525

526

p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);

527

q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

528

529

q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);

530

q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);

531

p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);

532

p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

533

534

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

535

diff = _mm_abs_epi16(diff);

536

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

537

flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

538

539

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

540

diff = _mm_abs_epi16(diff);

541

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

542

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

543

544

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

545

diff = _mm_abs_epi16(diff);

546

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

547

548

diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);

549

diff = _mm_slli_epi16(diff, 2);

550

diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);

551

diff = _mm_add_epi16(diff, diff1);

552

diff = _mm_add_epi16(diff, _mm_set1_epi16(4));

553

in_macro = _mm_srai_epi16(diff, 3);

554

555

C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],

556

pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],

557

pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],

558

pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

559

560

C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

561

562

in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3

563

C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);

564

in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

565

566

p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);

567

q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);

568

569

p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);

570

q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

571

572

flag1 = _mm_packs_epi16(flag1, flag2);

573

flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

574

575

p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,

576

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

577

p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);

578

p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

579

580

q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,

581

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

582

q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);

583

q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

584

585

/* Inverse-transpose and store back */

586

temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);

587

temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);

588

temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

589

temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

590

591

linea = _mm_unpacklo_epi32(temp1, temp3);

592

lineb = _mm_srli_si128(linea, 8);

593

linec = _mm_unpackhi_epi32(temp1, temp3);

594

lined = _mm_srli_si128(linec, 8);

595

linee = _mm_unpacklo_epi32(temp2, temp4);

596

linef = _mm_srli_si128(linee, 8);

597

lineg = _mm_unpackhi_epi32(temp2, temp4);

598

lineh = _mm_srli_si128(lineg, 8);

599

600

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);

601

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);

602

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);

603

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

604

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);

605

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);

606

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);

607

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}

/*****************************************************************************/

612

/* */

613

/* Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */

614

/* */

615

/* Description : This function performs filtering of a chroma block */

616

/* horizontal edge when the boundary strength is less than */

617

/* 4 in high profile. */

618

/* */

619

/* Inputs : pu1_src - pointer to the src sample q0 of U */

620

/* src_strd - source stride */

621

/* alpha_cb - alpha value for the boundary in U */

622

/* beta_cb - beta value for the boundary in U */

623

/* alpha_cr - alpha value for the boundary in V */

624

/* beta_cr - beta value for the boundary in V */

625

/* u4_bs - packed Boundary strength array */

626

/* pu1_cliptab_cb - tc0_table for U */

627

/* pu1_cliptab_cr - tc0_table for V */

/* */

/* Globals : None */

/* */

/* Processing : This operation is described in Sec. 8.7.2.3 under the */

632

/* title "Filtering process for edges for bS less than 4" */

633

/* in ITU T Rec H.264 with alpha and beta values different */

/* in U and V. */

/* */

/* Outputs : None */

/* */

/* Returns : None */

/* */

/* Issues : None */

/* */

/* Revision History: */

643

/* */

644

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

645

/* 12 02 2015 Naveen Kumar P Initial version */

646

/* */

647

/*****************************************************************************/

648

void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,

WORD32 src_strd,

WORD32 alpha_cb,

WORD32 beta_cb,

WORD32 alpha_cr,

WORD32 beta_cr,

UWORD32 u4_bs,

const UWORD8 *pu1_cliptab_cb,

656

const UWORD8 *pu1_cliptab_cr)

657

{

658

UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/

659

WORD16 i16_posP1, i16_posP0, i16_posQ1;

660

UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;

661

662

UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */

663

WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;

664

WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

665

__m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;

666

__m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;

667

__m128i flag_bs, flag1, flag2;

668

__m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;

669

__m128i zero = _mm_setzero_si128();

670

__m128i C0_uv_8x16;

671

__m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

672

673

pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

674

675

i16_posQ1 = src_strd;

676

i16_posP0 = src_strd;

677

i16_posP1 = 0;

678

679

u1_Bs0 = (u4_bs >> 24) & 0xff;

680

u1_Bs1 = (u4_bs >> 16) & 0xff;

681

u1_Bs2 = (u4_bs >> 8) & 0xff;

682

u1_Bs3 = (u4_bs >> 0) & 0xff;

683

684

flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,

685

u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,

686

u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);

687

flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s

688

flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

689

690

q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));

691

q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));

692

p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));

693

p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

694

695

q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);

696

q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);

697

p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);

698

p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

699

700

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

701

diff = _mm_abs_epi16(diff);

702

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

703

flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

704

705

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

706

diff = _mm_abs_epi16(diff);

707

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

708

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

709

710

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

711

diff = _mm_abs_epi16(diff);

712

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

713

714

diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);

715

diff = _mm_slli_epi16(diff, 2);

716

diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);

717

diff = _mm_add_epi16(diff, diff1);

718

diff = _mm_add_epi16(diff, _mm_set1_epi16(4));

719

in_macro = _mm_srai_epi16(diff, 3);

720

721

C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],

722

pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],

723

pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],

724

pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

725

726

C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

727

728

in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3

729

C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);

730

in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

731

732

p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);

733

q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

734

735

q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);

736

q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);

737

p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);

738

p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

739

740

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

741

diff = _mm_abs_epi16(diff);

742

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

743

flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

744

745

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

746

diff = _mm_abs_epi16(diff);

747

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

748

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

749

750

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

751

diff = _mm_abs_epi16(diff);

752

flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

753

754

diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);

755

diff = _mm_slli_epi16(diff, 2);

756

diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);

757

diff = _mm_add_epi16(diff, diff1);

758

diff = _mm_add_epi16(diff, _mm_set1_epi16(4));

759

in_macro = _mm_srai_epi16(diff, 3);

760

761

C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],

762

pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],

763

pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],

764

pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

765

766

C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

767

768

in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3

769

C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);

770

in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

771

772

p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);

773

q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);

774

775

p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);

776

q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

777

778

flag1 = _mm_packs_epi16(flag1, flag2);

779

flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

780

781

p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,

782

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

783

p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);

784

p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

785

_mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

786

787

q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,

788

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

789

q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);

790

q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

791

_mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/

796

/* */

797

/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */

798

/* */

799

/* Description : This function performs filtering of a chroma block */

800

/* vertical edge when boundary strength is set to 4 in high */

801

/* profile. */

802

/* */

803

/* Inputs : pu1_src - pointer to the src sample q0 of U */

804

/* src_strd - source stride */

805

/* alpha_cb - alpha value for the boundary in U */

806

/* beta_cb - beta value for the boundary in U */

807

/* alpha_cr - alpha value for the boundary in V */

808

/* beta_cr - beta value for the boundary in V */

/* */

/* Globals : None */

/* */

/* Processing : When the function is called twice, this operation is as */

816

/* described in Sec. 8.7.2.4 under the title "Filtering */

817

/* process for edges for bS equal to 4" in ITU T Rec H.264 */

818

/* with alpha and beta values different in U and V. */

/* */

/* Outputs : None */

/* */

/* Returns : None */

/* */

/* Issues : None */

/* */

/* Revision History: */

827

/* */

828

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

829

/* 12 02 2015 Naveen Kumar P Initial version */

830

/* */

831

/*****************************************************************************/

832

void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,

WORD32 src_strd,

WORD32 alpha_cb,

WORD32 beta_cb,

WORD32 alpha_cr,

WORD32 beta_cr)

{

UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/

840

WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;

841

WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

842

__m128i linea, lineb, linec, lined;

843

__m128i temp1, temp2;

844

845

__m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;

846

__m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;

847

__m128i flag1;

848

__m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;

849

__m128i zero = _mm_setzero_si128();

850

__m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

851

852

/* Load and transpose the pixel values */

853

linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));

854

lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));

855

linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));

856

lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

857

858

temp1 = _mm_unpacklo_epi16(linea, lineb);

859

temp2 = _mm_unpacklo_epi16(linec, lined);

860

861

p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);

862

p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);

863

q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);

864

q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);

865

/* End of transpose */

866

867

q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);

868

q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);

869

p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);

870

p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

871

872

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

873

diff = _mm_abs_epi16(diff);

874

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

875

flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

876

877

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

878

diff = _mm_abs_epi16(diff);

879

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

880

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

881

882

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

883

diff = _mm_abs_epi16(diff);

884

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

885

886

temp1 = _mm_slli_epi16(p1_uv_8x16, 1);

887

temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);

888

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

889

temp1 = _mm_add_epi16(temp1, temp2);

890

p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

891

892

temp1 = _mm_slli_epi16(q1_uv_8x16, 1);

893

temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);

894

temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));

895

temp1 = _mm_add_epi16(temp1, temp2);

896

q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

897

898

p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);

899

q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

900

901

flag1 = _mm_packs_epi16(flag1, flag1);

902

903

p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,

904

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

905

p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);

906

p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

907

908

q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,

909

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

910

q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);

911

q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

912

913

/* Inverse-transpose and store back */

914

temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);

915

temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

916

917

linea = _mm_unpacklo_epi32(temp1, temp2);

918

lineb = _mm_srli_si128(linea, 8);

919

linec = _mm_unpackhi_epi32(temp1, temp2);

920

lined = _mm_srli_si128(linec, 8);

921

922

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);

923

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);

924

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);

925

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

}

/*****************************************************************************/

930

/* */

931

/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */

932

/* */

933

/* Description : This function performs filtering of a chroma block */

934

/* vertical edge when boundary strength is less than 4 in */

935

/* high profile. */

936

/* */

937

/* Inputs : pu1_src - pointer to the src sample q0 of U */

938

/* src_strd - source stride */

939

/* alpha_cb - alpha value for the boundary in U */

940

/* beta_cb - beta value for the boundary in U */

941

/* alpha_cr - alpha value for the boundary in V */

942

/* beta_cr - beta value for the boundary in V */

943

/* u4_bs - packed Boundary strength array */

944

/* pu1_cliptab_cb - tc0_table for U */

945

/* pu1_cliptab_cr - tc0_table for V */

/* */

/* Globals : None */

/* */

/* Processing : When the function is called twice, this operation is as */

950

/* described in Sec. 8.7.2.4 under the title "Filtering */

951

/* process for edges for bS less than 4" in ITU T Rec H.264 */

952

/* with alpha and beta values different in U and V. */

/* */

/* Outputs : None */

/* */

/* Returns : None */

/* */

/* Issues : None */

/* */

/* Revision History: */

961

/* */

962

/* DD MM YYYY Author(s) Changes (Describe the changes made) */

963

/* 12 02 2015 Naveen Kumar P Initial version */

964

/* */

965

/*****************************************************************************/

966

void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,

WORD32 src_strd,

WORD32 alpha_cb,

WORD32 beta_cb,

WORD32 alpha_cr,

WORD32 beta_cr,

UWORD32 u4_bs,

const UWORD8 *pu1_cliptab_cb,

974

const UWORD8 *pu1_cliptab_cr)

975

{

976

UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/

977

UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;

978

WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;

979

WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

980

__m128i linea, lineb, linec, lined;

981

__m128i temp1, temp2;

982

983

__m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;

984

__m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;

985

__m128i flag_bs, flag1;

986

__m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;

987

__m128i zero = _mm_setzero_si128();

988

__m128i C0_uv_8x16;

989

__m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

990

991

u1_Bs0 = (u4_bs >> 24) & 0xff;

992

u1_Bs1 = (u4_bs >> 16) & 0xff;

993

u1_Bs2 = (u4_bs >> 8) & 0xff;

994

u1_Bs3 = (u4_bs >> 0) & 0xff;

995

996

flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,

997

u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);

998

flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s

999

flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

1000

1001

/* Load and transpose the pixel values */

1002

linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));

1003

lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));

1004

linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));

1005

lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

1006

1007

temp1 = _mm_unpacklo_epi16(linea, lineb);

1008

temp2 = _mm_unpacklo_epi16(linec, lined);

1009

1010

p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);

1011

p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);

1012

q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);

1013

q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);

1014

/* End of transpose */

1015

1016

q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);

1017

q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);

1018

p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);

1019

p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

1020

1021

diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1

1022

diff = _mm_abs_epi16(diff);

1023

alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);

1024

flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

1025

1026

diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2

1027

diff = _mm_abs_epi16(diff);

1028

beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

1029

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

1030

1031

diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3

1032

diff = _mm_abs_epi16(diff);

1033

flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

1034

1035

diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);

1036

diff = _mm_slli_epi16(diff, 2);

1037

diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);

1038

diff = _mm_add_epi16(diff, diff1);

1039

diff = _mm_add_epi16(diff, _mm_set1_epi16(4));

1040

in_macro = _mm_srai_epi16(diff, 3);

1041

1042

C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],

1043

pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],

1044

pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],

1045

pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

1046

1047

C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

1048

1049

in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3

1050

C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);

1051

in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

1052

1053

p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);

1054

q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

1055

1056

p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);

1057

q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

1058

1059

flag1 = _mm_packs_epi16(flag1, flag1);

1060

flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

1061

1062

p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,

1063

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

1064

p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);

1065

p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

1066

1067

q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,

1068

_mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));

1069

q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);

1070

q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

1071

1072

/* Inverse-transpose and store back */

1073

temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);

1074

temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

1075

1076

linea = _mm_unpacklo_epi32(temp1, temp2);

1077

lineb = _mm_srli_si128(linea, 8);

1078

linec = _mm_unpackhi_epi32(temp1, temp2);

1079

lined = _mm_srli_si128(linec, 8);

1080

1081

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);

1082

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);

1083

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);

1084

_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

1085

1086

}

1087