blob: 05ccfd643fdf34a90e46c9a4e4b0a0707b380379 [file] [log] [blame]
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20#include <sys/syscall.h>
21#include "cutils/atomic.h"
22
23#ifdef RS_COMPATIBILITY_LIB
24#include "rsCompatibilityLib.h"
25#endif
26
27#ifndef RS_COMPATIBILITY_LIB
28#include "hardware/gralloc.h"
29#endif
30
31
// Inlining keyword used by the small filter helpers in this file.
#define INLINE inline

// Classic min/max macros. Each argument may be evaluated twice, so do not
// pass expressions with side effects.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))

// Divides value by 2^n, adding half of the divisor (1 << (n - 1)) before the
// shift. n must be at least 1.
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
39
40
// Mode-info ("mi") unit geometry.
#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2)  // 64 = 2^6

#define MI_SIZE (1 << MI_SIZE_LOG2)              // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  // mi-units per max block

// Mask selecting the mi position inside a max-size block.
#define MI_MASK (MI_BLOCK_SIZE - 1)

// Array dimensions and limits used by the loop-filter structures below.
#define SIMD_WIDTH 16
#define MAX_LOOP_FILTER 63
#define MAX_SEGMENTS 8
#define MAX_REF_FRAMES 4
#define MAX_MODE_LF_DELTAS 2
#define MB_MODE_COUNT 14
#define BLOCK_SIZES 13
56
57
// DECLARE_ALIGNED(n, typ, val): declares `typ val` with n-byte alignment using
// whichever alignment syntax the compiler supports. Falls back to a plain
// declaration (with a build warning) when none is known.
#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
#define DECLARE_ALIGNED(n,typ,val)  typ val __attribute__ ((aligned (n)))
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n,typ,val)  __declspec(align(n)) typ val
#else
#warning No alignment directives known for this compiler.
#define DECLARE_ALIGNED(n,typ,val)  typ val
#endif
66
// Transform block sizes, numbered consecutively from 0. TX_SIZES is the
// number of real entries.
typedef enum {
    TX_4X4,    // 0: 4x4 transform
    TX_8X8,    // 1: 8x8 transform
    TX_16X16,  // 2: 16x16 transform
    TX_32X32,  // 3: 32x32 transform
    TX_SIZES
} TX_SIZE;
75
// Plane category. Code in this file tests the value for zero / non-zero, so
// the luma entry must stay 0.
typedef enum {
    PLANE_TYPE_Y_WITH_DC = 0,
    PLANE_TYPE_UV = 1,
} PLANE_TYPE;
80
// Bit masks covering every 8x8 block of one 64x64 region. A set bit marks a
// position where the loop filter is to be applied.
//   left_*    : filter the edge on the left side of the block
//   above_*   : filter the edge on the top side of the block
//   int_4x4_* : filter the 4x4 edges inside the 8x8 block
// Different transform sizes may need a different kind of loop filter, so the
// left_/above_ masks hold one entry per transform size.
struct LoopFilterMask {
    uint64_t left_y[4];
    uint64_t above_y[4];
    uint64_t int_4x4_y;
    unsigned short left_uv[4];
    unsigned short above_uv[4];
    unsigned short int_4x4_uv;
    unsigned char lfl_y[64];   // index into the threshold table, per luma block
    unsigned char lfl_uv[16];  // same, per chroma block
};
100
101// Need to align this structure so when it is declared and
102// passed it can be loaded into vector registers.
103struct LoopFilterThresh {
104 DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
105 DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
106 DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
107};
108
109struct LoopFilterInfoN {
110 LoopFilterThresh lfthr[MAX_LOOP_FILTER + 1];
111 uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
112 uint8_t mode_lf_lut[MB_MODE_COUNT];
113};
114
// Plane offsets and strides of a frame buffer. setGlobalVar() copies this
// structure byte-for-byte from caller data, so the field order and types are
// part of the interface.
struct BufferInfo {
    int y_offset;   // Y plane offset
    int u_offset;   // U plane offset
    int v_offset;   // V plane offset
    int y_stride;   // Y plane stride
    int uv_stride;  // stride shared by the U and V planes
};
122
123#define MAX_CPU_CORES 32
124#define MAX_MB_PLANE 3
Matthieu Delahayeadab8492014-04-01 16:33:24 -0500125#define MAX_SB_ROW 64
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600126
127struct LoopFilterProgressChart {
128 int start;
129 int stop;
130 int num_planes;
131 int mi_rows;
132 int mi_cols;
133 BufferInfo buf_info;
134 uint8_t *buffer_alloc;
135 LoopFilterInfoN *lf_info;
136 LoopFilterMask *lfms;
137
138 int wid;
139 int quit;
140 int doing;
Matthieu Delahayeadab8492014-04-01 16:33:24 -0500141 volatile int32_t chart[MAX_SB_ROW];
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600142 int32_t sb_row_pro;
143 pthread_t *tid;
144 pthread_mutex_t *mutex;
145 pthread_cond_t *start_cond;
146 pthread_mutex_t *hmutex;
147 pthread_cond_t *finish;
148};
149
150using namespace android;
151using namespace android::renderscript;
152
153namespace android {
154namespace renderscript {
155
156
157class RsdCpuScriptIntrinsicLoopFilter : public RsdCpuScriptIntrinsic {
158private:
159 LoopFilterProgressChart mPrch;
160 int mWorkerCount;
161
162public:
163 virtual void populateScript(Script *);
164 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
165 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
166
167 virtual ~RsdCpuScriptIntrinsicLoopFilter();
168 RsdCpuScriptIntrinsicLoopFilter(RsdCpuReferenceImpl *ctx, const Script *s,
169 const Element *e);
170
171protected:
172 ObjectBaseRef<Allocation> mLfInfo;
173 ObjectBaseRef<Allocation> mLfMasks;
174 ObjectBaseRef<Allocation> mFrameBuffer;
175
176 void doLoopFilter();
Chris Wailes80ef6932014-07-08 11:22:18 -0700177 static void kernel(const RsExpandKernelParams *p,
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600178 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700179 uint32_t outstep);
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600180};
181
182}
183}
184
Chris Wailes80ef6932014-07-08 11:22:18 -0700185void RsdCpuScriptIntrinsicLoopFilter::kernel(const RsExpandKernelParams *p,
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600186 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700187 uint32_t outstep) {
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600188 RsdCpuScriptIntrinsicLoopFilter *cp = (RsdCpuScriptIntrinsicLoopFilter*)p->usr;
189 memset((void*)&cp->mPrch.chart, 0, sizeof(cp->mPrch.chart));
190 cp->mPrch.chart[0] = 0x0fffffff;
191 cp->mPrch.sb_row_pro = 0;
192 cp->mPrch.doing = cp->mWorkerCount;
193
194 int i = 0;
195 for (i = 0; i < cp->mWorkerCount; ++i) {
196 pthread_cond_signal(&cp->mPrch.start_cond[i]);
197 }
198 pthread_mutex_lock(cp->mPrch.hmutex);
199 if (cp->mPrch.doing) {
200 pthread_cond_wait(cp->mPrch.finish, cp->mPrch.hmutex);
201 }
202 pthread_mutex_unlock(cp->mPrch.hmutex);
203}
204
205
206void RsdCpuScriptIntrinsicLoopFilter::setGlobalVar(uint32_t slot,
207 const void *data,
208 size_t dataLength) {
209 rsAssert(slot >= 0 && slot < 2);
210 const int *dptr = (const int *)data;
211 switch (slot) {
212 case 0:
213 rsAssert(dataLength == sizeof(int) * 5);
214 mPrch.start = dptr[0];
215 mPrch.stop = dptr[1];
216 mPrch.num_planes = dptr[2];
217 mPrch.mi_rows = dptr[3];
218 mPrch.mi_cols = dptr[4];
219 break;
220 case 1:
221 rsAssert(dataLength == sizeof(BufferInfo));
222 mPrch.buf_info = *((BufferInfo*)data);
223 break;
224 default:
225 ALOGE("Non-exist global value slot: %d", slot);
226 rsAssert(0);
227 }
228}
229
230void RsdCpuScriptIntrinsicLoopFilter::setGlobalObj(uint32_t slot, ObjectBase *data) {
231 rsAssert(slot > 1 && slot < 5);
232 if (slot == 2) {
233 mLfInfo.set(static_cast<Allocation *>(data));
234 mPrch.lf_info = (LoopFilterInfoN *)mLfInfo->mHal.state.userProvidedPtr;
235 } else if (slot == 3) {
236 mLfMasks.set(static_cast<Allocation *>(data));
237 mPrch.lfms = (LoopFilterMask *)mLfMasks->mHal.state.userProvidedPtr;
238 } else {
239 mFrameBuffer.set(static_cast<Allocation *>(data));
240 mPrch.buffer_alloc = (uint8_t *)mFrameBuffer->mHal.state.userProvidedPtr;
241 }
242}
243
244RsdCpuScriptIntrinsicLoopFilter::~RsdCpuScriptIntrinsicLoopFilter() {
245 android_atomic_inc(&mPrch.quit);
246 int i = 0;
247 for (i = 0; i < mWorkerCount; ++i) {
248 pthread_cond_signal(&mPrch.start_cond[i]);
249 }
250 for (i = 0; i < mWorkerCount; ++i) {
251 pthread_join(mPrch.tid[i], NULL);
252 }
253 free(mPrch.tid);
254}
255
256void RsdCpuScriptIntrinsicLoopFilter::populateScript(Script *s) {
257 s->mHal.info.exportedVariableCount = 9;
258 s->mHal.info.exportedFunctionCount = 1;
259}
260
261RsdCpuScriptImpl * rsdIntrinsic_LoopFilter(RsdCpuReferenceImpl *ctx,
262 const Script *s, const Element *e) {
263 return new RsdCpuScriptIntrinsicLoopFilter(ctx, s, e);
264}
265
// C-linkage prototypes of the edge filters, one `_c` and one `_neon` variant
// each. Three signature shapes occur:
//   plain : (s, pitch, blimit, limit, thresh)
//   count : plain plus `int count`
//   dual  : two threshold triples (blimit0/limit0/thresh0, blimit1/...)
extern "C" {

// Vertical edges, 16 wide.
void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh);
void vp9_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh);
void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh);
void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int pitch,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh);

// Vertical edges, 8 wide.
void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh,
                          int count);
void vp9_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count);
void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch,
                               const uint8_t *blimit0, const uint8_t *limit0,
                               const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1);
void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int pitch,
                                  const uint8_t *blimit0, const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1);

// Vertical edges, 4 wide.
void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh,
                          int count);
void vp9_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count);
void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch,
                               const uint8_t *blimit0, const uint8_t *limit0,
                               const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1);
void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int pitch,
                                  const uint8_t *blimit0, const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1);

// Horizontal edges, 16 wide.
void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count);
void vp9_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh,
                                int count);

// Horizontal edges, 8 wide.
void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count);
void vp9_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count);
void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch,
                                 const uint8_t *blimit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1);
void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1);

// Horizontal edges, 4 wide.
void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count);
void vp9_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count);
void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch,
                                 const uint8_t *blimit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1);
void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1);

}  // extern "C"
378
379
// Selects the implementation behind the generic vp9_lpf_* names.
// remove ARM64 statement when ARM64 asm available
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)

// 32-bit ARM with intrinsics enabled: every generic name maps to _neon.
#define vp9_lpf_vertical_16        vp9_lpf_vertical_16_neon
#define vp9_lpf_vertical_16_dual   vp9_lpf_vertical_16_dual_neon
#define vp9_lpf_vertical_8         vp9_lpf_vertical_8_neon
#define vp9_lpf_vertical_8_dual    vp9_lpf_vertical_8_dual_neon
#define vp9_lpf_vertical_4         vp9_lpf_vertical_4_neon
#define vp9_lpf_vertical_4_dual    vp9_lpf_vertical_4_dual_neon
#define vp9_lpf_horizontal_16      vp9_lpf_horizontal_16_neon
#define vp9_lpf_horizontal_8       vp9_lpf_horizontal_8_neon
#define vp9_lpf_horizontal_8_dual  vp9_lpf_horizontal_8_dual_neon
#define vp9_lpf_horizontal_4       vp9_lpf_horizontal_4_neon
#define vp9_lpf_horizontal_4_dual  vp9_lpf_horizontal_4_dual_neon

// The dual NEON entry points below are built from two calls to the matching
// single-edge NEON routine: horizontally adjacent edges are 8 pixels apart,
// vertically adjacent edges are 8 rows apart.

void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
    uint8_t *const right = s + 8;
    vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
    vp9_lpf_horizontal_8_neon(right, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
    uint8_t *const lower = s + 8 * p;
    vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
    vp9_lpf_vertical_4_neon(lower, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
    uint8_t *const lower = s + 8 * p;
    vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
    vp9_lpf_vertical_8_neon(lower, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
    uint8_t *const lower = s + 8 * p;
    vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
    vp9_lpf_vertical_16_neon(lower, p, blimit, limit, thresh);
}

#else

// Every other build: the generic names map to the portable C versions.
#define vp9_lpf_vertical_16        vp9_lpf_vertical_16_c
#define vp9_lpf_vertical_16_dual   vp9_lpf_vertical_16_dual_c
#define vp9_lpf_vertical_8         vp9_lpf_vertical_8_c
#define vp9_lpf_vertical_8_dual    vp9_lpf_vertical_8_dual_c
#define vp9_lpf_vertical_4         vp9_lpf_vertical_4_c
#define vp9_lpf_vertical_4_dual    vp9_lpf_vertical_4_dual_c
#define vp9_lpf_horizontal_16      vp9_lpf_horizontal_16_c
#define vp9_lpf_horizontal_8       vp9_lpf_horizontal_8_c
#define vp9_lpf_horizontal_8_dual  vp9_lpf_horizontal_8_dual_c
#define vp9_lpf_horizontal_4       vp9_lpf_horizontal_4_c
#define vp9_lpf_horizontal_4_dual  vp9_lpf_horizontal_4_dual_c

#endif  // ARCH_ARM_USE_INTRINSICS && !ARCH_ARM64_USE_INTRINSICS
Matthieu Delahaye6fc3e122014-03-04 11:05:49 -0600451
452
453
454
455static INLINE int8_t signed_char_clamp(int t) {
456 return (int8_t)clamp(t, -128, 127);
457}
458
// Decides whether any filtering should happen at this position.
// Returns -1 (all bits set) for yes and 0 for no.
//
// The answer is "no" as soon as one neighbouring-pixel step on either side
// exceeds `limit`, or the weighted step across the edge
// (2 * |p0 - q0| + |p1 - q1| / 2) exceeds `blimit`.
static inline int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                 uint8_t p3, uint8_t p2,
                                 uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1,
                                 uint8_t q2, uint8_t q3) {
    const int too_rough =
            abs(p3 - p2) > limit ||
            abs(p2 - p1) > limit ||
            abs(p1 - p0) > limit ||
            abs(q1 - q0) > limit ||
            abs(q2 - q1) > limit ||
            abs(q3 - q2) > limit ||
            abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
    return (int8_t)(too_rough ? 0 : -1);
}
475
// Flatness test over four pixels on each side of the edge.
// Returns -1 (all bits set) when p1..p3 are all within `thresh` of p0 and
// q1..q3 are all within `thresh` of q0; returns 0 otherwise.
static inline int8_t flat_mask4(uint8_t thresh,
                                uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
    const int not_flat =
            abs(p1 - p0) > thresh ||
            abs(q1 - q0) > thresh ||
            abs(p2 - p0) > thresh ||
            abs(q2 - q0) > thresh ||
            abs(p3 - p0) > thresh ||
            abs(q3 - q0) > thresh;
    return (int8_t)(not_flat ? 0 : -1);
}
490
491static INLINE int8_t flat_mask5(uint8_t thresh,
492 uint8_t p4, uint8_t p3,
493 uint8_t p2, uint8_t p1,
494 uint8_t p0, uint8_t q0,
495 uint8_t q1, uint8_t q2,
496 uint8_t q3, uint8_t q4) {
497 int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
498 mask |= (abs(p4 - p0) > thresh) * -1;
499 mask |= (abs(q4 - q0) > thresh) * -1;
500 return ~mask;
501}
502
// High-edge-variance test: returns -1 (all bits set) when |p1 - p0| or
// |q1 - q0| exceeds `thresh`, 0 otherwise.
static inline int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
    const int high_variance =
            abs(p1 - p0) > thresh ||
            abs(q1 - q0) > thresh;
    return (int8_t)(high_variance ? -1 : 0);
}
511
512static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
513 uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
514 int8_t filter1, filter2;
515
516 const int8_t ps1 = (int8_t) *op1 ^ 0x80;
517 const int8_t ps0 = (int8_t) *op0 ^ 0x80;
518 const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
519 const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
520 const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
521
522 // add outer taps if we have high edge variance
523 int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
524
525 // inner taps
526 filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
527
528 // save bottom 3 bits so that we round one side +4 and the other +3
529 // if it equals 4 we'll set to adjust by -1 to account for the fact
530 // we'd round 3 the other way
531 filter1 = signed_char_clamp(filter + 4) >> 3;
532 filter2 = signed_char_clamp(filter + 3) >> 3;
533
534 *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
535 *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
536
537 // outer tap adjustments
538 filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
539
540 *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
541 *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
542}
543
544void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
545 const uint8_t *blimit, const uint8_t *limit,
546 const uint8_t *thresh, int count) {
547 int i;
548
549 // loop filter designed to work using chars so that we can make maximum use
550 // of 8 bit simd instructions.
551 for (i = 0; i < 8 * count; ++i) {
552 const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
553 const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
554 const int8_t mask = filter_mask(*limit, *blimit,
555 p3, p2, p1, p0, q0, q1, q2, q3);
556 filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
557 ++s;
558 }
559}
560
561void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
562 const uint8_t *limit0, const uint8_t *thresh0,
563 const uint8_t *blimit1, const uint8_t *limit1,
564 const uint8_t *thresh1) {
565 vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
566 vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
567}
568
569void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
570 const uint8_t *limit, const uint8_t *thresh,
571 int count) {
572 int i;
573
574 // loop filter designed to work using chars so that we can make maximum use
575 // of 8 bit simd instructions.
576 for (i = 0; i < 8 * count; ++i) {
577 const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
578 const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
579 const int8_t mask = filter_mask(*limit, *blimit,
580 p3, p2, p1, p0, q0, q1, q2, q3);
581 filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
582 s += pitch;
583 }
584}
585
586void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
587 const uint8_t *limit0, const uint8_t *thresh0,
588 const uint8_t *blimit1, const uint8_t *limit1,
589 const uint8_t *thresh1) {
590 vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
591 vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, 1);
592}
593
594static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
595 uint8_t *op3, uint8_t *op2,
596 uint8_t *op1, uint8_t *op0,
597 uint8_t *oq0, uint8_t *oq1,
598 uint8_t *oq2, uint8_t *oq3) {
599 if (flat && mask) {
600 const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
601 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
602
603 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
604 *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
605 *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
606 *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
607 *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
608 *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
609 *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
610 } else {
611 filter4(mask, thresh, op1, op0, oq0, oq1);
612 }
613}
614
615void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
616 const uint8_t *limit, const uint8_t *thresh,
617 int count) {
618 int i;
619
620 // loop filter designed to work using chars so that we can make maximum use
621 // of 8 bit simd instructions.
622 for (i = 0; i < 8 * count; ++i) {
623 const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
624 const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
625
626 const int8_t mask = filter_mask(*limit, *blimit,
627 p3, p2, p1, p0, q0, q1, q2, q3);
628 const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
629 filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
630 s, s + 1 * p, s + 2 * p, s + 3 * p);
631 ++s;
632 }
633}
634
635void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
636 const uint8_t *limit0, const uint8_t *thresh0,
637 const uint8_t *blimit1, const uint8_t *limit1,
638 const uint8_t *thresh1) {
639 vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
640 vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
641}
642
643void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
644 const uint8_t *limit, const uint8_t *thresh,
645 int count) {
646 int i;
647
648 for (i = 0; i < 8 * count; ++i) {
649 const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
650 const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
651 const int8_t mask = filter_mask(*limit, *blimit,
652 p3, p2, p1, p0, q0, q1, q2, q3);
653 const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
654 filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
655 s, s + 1, s + 2, s + 3);
656 s += pitch;
657 }
658}
659
660void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
661 const uint8_t *limit0, const uint8_t *thresh0,
662 const uint8_t *blimit1, const uint8_t *limit1,
663 const uint8_t *thresh1) {
664 vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
665 vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, 1);
666}
667
668static INLINE void filter16(int8_t mask, uint8_t thresh,
669 uint8_t flat, uint8_t flat2,
670 uint8_t *op7, uint8_t *op6,
671 uint8_t *op5, uint8_t *op4,
672 uint8_t *op3, uint8_t *op2,
673 uint8_t *op1, uint8_t *op0,
674 uint8_t *oq0, uint8_t *oq1,
675 uint8_t *oq2, uint8_t *oq3,
676 uint8_t *oq4, uint8_t *oq5,
677 uint8_t *oq6, uint8_t *oq7) {
678 if (flat2 && flat && mask) {
679 const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
680 p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
681
682 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
683 q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
684
685 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
686 *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
687 q0, 4);
688 *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
689 q0 + q1, 4);
690 *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
691 q0 + q1 + q2, 4);
692 *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
693 q0 + q1 + q2 + q3, 4);
694 *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
695 q0 + q1 + q2 + q3 + q4, 4);
696 *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
697 q0 + q1 + q2 + q3 + q4 + q5, 4);
698 *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
699 q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
700 *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
701 q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
702 *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
703 q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
704 *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
705 q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
706 *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
707 q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
708 *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
709 q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
710 *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
711 q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
712 *oq6 = ROUND_POWER_OF_TWO(p0 +
713 q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
714 } else {
715 filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
716 }
717}
718
719void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
720 const uint8_t *limit, const uint8_t *thresh,
721 int count) {
722 int i;
723
724 // loop filter designed to work using chars so that we can make maximum use
725 // of 8 bit simd instructions.
726 for (i = 0; i < 8 * count; ++i) {
727 const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
728 const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
729 const int8_t mask = filter_mask(*limit, *blimit,
730 p3, p2, p1, p0, q0, q1, q2, q3);
731 const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
732 const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
733 q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
734
735 filter16(mask, *thresh, flat, flat2,
736 s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
737 s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
738 s, s + 1 * p, s + 2 * p, s + 3 * p,
739 s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
740 ++s;
741 }
742}
743
744static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
745 const uint8_t *blimit,
746 const uint8_t *limit,
747 const uint8_t *thresh,
748 int count) {
749 int i;
750
751 for (i = 0; i < count; ++i) {
752 const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
753 const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
754 const int8_t mask = filter_mask(*limit, *blimit,
755 p3, p2, p1, p0, q0, q1, q2, q3);
756 const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
757 const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
758 q0, s[4], s[5], s[6], s[7]);
759
760 filter16(mask, *thresh, flat, flat2,
761 s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
762 s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
763 s += p;
764 }
765}
766
767void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
768 const uint8_t *limit, const uint8_t *thresh) {
769 mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
770}
771
772void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
773 const uint8_t *limit, const uint8_t *thresh) {
774 mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
775}
776
777
778static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
779 uint8_t *s, int pitch,
780 unsigned int mask_16x16_l,
781 unsigned int mask_8x8_l,
782 unsigned int mask_4x4_l,
783 unsigned int mask_4x4_int_l,
784 const LoopFilterInfoN *lfi_n,
785 const uint8_t *lfl) {
786 const int mask_shift = plane_type ? 4 : 8;
787 const int mask_cutoff = plane_type ? 0xf : 0xff;
788 const int lfl_forward = plane_type ? 4 : 8;
789
790 unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
791 unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
792 unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
793 unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
794 unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
795 unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
796 unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
797 unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
798 unsigned int mask;
799
800 for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
801 mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
802 mask; mask >>= 1) {
803 const LoopFilterThresh *lfi0 = lfi_n->lfthr + *lfl;
804 const LoopFilterThresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
805
806 // TODO(yunqingwang): count in loopfilter functions should be removed.
807 if (mask & 1) {
808 if ((mask_16x16_0 | mask_16x16_1) & 1) {
809 if ((mask_16x16_0 & mask_16x16_1) & 1) {
810 vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
811 lfi0->hev_thr);
812 } else if (mask_16x16_0 & 1) {
813 vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
814 lfi0->hev_thr);
815 } else {
816 vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
817 lfi1->lim, lfi1->hev_thr);
818 }
819 }
820
821 if ((mask_8x8_0 | mask_8x8_1) & 1) {
822 if ((mask_8x8_0 & mask_8x8_1) & 1) {
823 vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
824 lfi0->hev_thr, lfi1->mblim, lfi1->lim,
825 lfi1->hev_thr);
826 } else if (mask_8x8_0 & 1) {
827 vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
828 lfi0->hev_thr, 1);
829 } else {
830 vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
831 lfi1->hev_thr, 1);
832 }
833 }
834
835 if ((mask_4x4_0 | mask_4x4_1) & 1) {
836 if ((mask_4x4_0 & mask_4x4_1) & 1) {
837 vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
838 lfi0->hev_thr, lfi1->mblim, lfi1->lim,
839 lfi1->hev_thr);
840 } else if (mask_4x4_0 & 1) {
841 vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
842 lfi0->hev_thr, 1);
843 } else {
844 vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
845 lfi1->hev_thr, 1);
846 }
847 }
848
849 if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
850 if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
851 vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
852 lfi0->hev_thr, lfi1->mblim, lfi1->lim,
853 lfi1->hev_thr);
854 } else if (mask_4x4_int_0 & 1) {
855 vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
856 lfi0->hev_thr, 1);
857 } else {
858 vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
859 lfi1->lim, lfi1->hev_thr, 1);
860 }
861 }
862 }
863
864 s += 8;
865 lfl += 1;
866 mask_16x16_0 >>= 1;
867 mask_8x8_0 >>= 1;
868 mask_4x4_0 >>= 1;
869 mask_4x4_int_0 >>= 1;
870 mask_16x16_1 >>= 1;
871 mask_8x8_1 >>= 1;
872 mask_4x4_1 >>= 1;
873 mask_4x4_int_1 >>= 1;
874 }
875}
876
877static void filter_selectively_horiz(uint8_t *s, int pitch,
878 unsigned int mask_16x16,
879 unsigned int mask_8x8,
880 unsigned int mask_4x4,
881 unsigned int mask_4x4_int,
882 const LoopFilterInfoN *lfi_n,
883 const uint8_t *lfl) {
884 unsigned int mask;
885 int count;
886
887 for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
888 mask; mask >>= count) {
889 const LoopFilterThresh *lfi = lfi_n->lfthr + *lfl;
890
891 count = 1;
892 if (mask & 1) {
893 if (mask_16x16 & 1) {
894 if ((mask_16x16 & 3) == 3) {
895 vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
896 lfi->hev_thr, 2);
897 count = 2;
898 } else {
899 vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
900 lfi->hev_thr, 1);
901 }
902 } else if (mask_8x8 & 1) {
903 if ((mask_8x8 & 3) == 3) {
904 // Next block's thresholds
905 const LoopFilterThresh *lfin = lfi_n->lfthr + *(lfl + 1);
906
907 vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
908 lfi->hev_thr, lfin->mblim, lfin->lim,
909 lfin->hev_thr);
910
911 if ((mask_4x4_int & 3) == 3) {
912 vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
913 lfi->lim, lfi->hev_thr, lfin->mblim,
914 lfin->lim, lfin->hev_thr);
915 } else {
916 if (mask_4x4_int & 1)
917 vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
918 lfi->lim, lfi->hev_thr, 1);
919 else if (mask_4x4_int & 2)
920 vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
921 lfin->lim, lfin->hev_thr, 1);
922 }
923 count = 2;
924 } else {
925 vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
926
927 if (mask_4x4_int & 1)
928 vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
929 lfi->lim, lfi->hev_thr, 1);
930 }
931 } else if (mask_4x4 & 1) {
932 if ((mask_4x4 & 3) == 3) {
933 // Next block's thresholds
934 const LoopFilterThresh *lfin = lfi_n->lfthr + *(lfl + 1);
935
936 vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
937 lfi->hev_thr, lfin->mblim, lfin->lim,
938 lfin->hev_thr);
939 if ((mask_4x4_int & 3) == 3) {
940 vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
941 lfi->lim, lfi->hev_thr, lfin->mblim,
942 lfin->lim, lfin->hev_thr);
943 } else {
944 if (mask_4x4_int & 1)
945 vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
946 lfi->lim, lfi->hev_thr, 1);
947 else if (mask_4x4_int & 2)
948 vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
949 lfin->lim, lfin->hev_thr, 1);
950 }
951 count = 2;
952 } else {
953 vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
954
955 if (mask_4x4_int & 1)
956 vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
957 lfi->hev_thr, 1);
958 }
959 } else if (mask_4x4_int & 1) {
960 vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
961 lfi->hev_thr, 1);
962 }
963 }
964 s += 8 * count;
965 lfl += count;
966 mask_16x16 >>= count;
967 mask_8x8 >>= count;
968 mask_4x4 >>= count;
969 mask_4x4_int >>= count;
970 }
971}
972
973static void filter_block_plane_y(LoopFilterInfoN *lf_info,
974 LoopFilterMask *lfm,
975 int stride,
976 uint8_t *buf,
977 int mi_rows,
978 int mi_row) {
979 uint8_t* dst0 = buf;
980 int r; //, c;
981
982 uint64_t mask_16x16 = lfm->left_y[TX_16X16];
983 uint64_t mask_8x8 = lfm->left_y[TX_8X8];
984 uint64_t mask_4x4 = lfm->left_y[TX_4X4];
985 uint64_t mask_4x4_int = lfm->int_4x4_y;
986
987 // Vertical pass: do 2 rows at one time
988 for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r += 2) {
989 unsigned int mask_16x16_l = mask_16x16 & 0xffff;
990 unsigned int mask_8x8_l = mask_8x8 & 0xffff;
991 unsigned int mask_4x4_l = mask_4x4 & 0xffff;
992 unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
993
994 // Disable filtering on the leftmost column
995 filter_selectively_vert_row2(PLANE_TYPE_Y_WITH_DC, buf, stride,
996 mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, lf_info,
997 &lfm->lfl_y[r << 3]);
998
999 buf += 16 * stride;
1000 mask_16x16 >>= 16;
1001 mask_8x8 >>= 16;
1002 mask_4x4 >>= 16;
1003 mask_4x4_int >>= 16;
1004 }
1005
1006 // Horizontal pass
1007 buf = dst0;
1008 mask_16x16 = lfm->above_y[TX_16X16];
1009 mask_8x8 = lfm->above_y[TX_8X8];
1010 mask_4x4 = lfm->above_y[TX_4X4];
1011 mask_4x4_int = lfm->int_4x4_y;
1012
1013 for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r++) {
1014 unsigned int mask_16x16_r;
1015 unsigned int mask_8x8_r;
1016 unsigned int mask_4x4_r;
1017
1018 if (mi_row + r == 0) {
1019 mask_16x16_r = 0;
1020 mask_8x8_r = 0;
1021 mask_4x4_r = 0;
1022 } else {
1023 mask_16x16_r = mask_16x16 & 0xff;
1024 mask_8x8_r = mask_8x8 & 0xff;
1025 mask_4x4_r = mask_4x4 & 0xff;
1026 }
1027
1028 filter_selectively_horiz(buf, stride, mask_16x16_r, mask_8x8_r,
1029 mask_4x4_r, mask_4x4_int & 0xff, lf_info, &lfm->lfl_y[r << 3]);
1030
1031 buf += 8 * stride;
1032 mask_16x16 >>= 8;
1033 mask_8x8 >>= 8;
1034 mask_4x4 >>= 8;
1035 mask_4x4_int >>= 8;
1036 }
1037}
1038
1039static void filter_block_plane_uv(LoopFilterInfoN *lf_info,
1040 LoopFilterMask *lfm,
1041 int stride,
1042 uint8_t *buf,
1043 int mi_rows,
1044 int mi_row) {
1045 uint8_t* dst0 = buf;
1046 int r, c;
1047
1048 uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
1049 uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
1050 uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
1051 uint16_t mask_4x4_int = lfm->int_4x4_uv;
1052
1053 // Vertical pass: do 2 rows at one time
1054 for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r += 4) {
1055
1056 for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
1057 lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
1058 lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
1059 }
1060
1061 {
1062 unsigned int mask_16x16_l = mask_16x16 & 0xff;
1063 unsigned int mask_8x8_l = mask_8x8 & 0xff;
1064 unsigned int mask_4x4_l = mask_4x4 & 0xff;
1065 unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
1066
1067 // Disable filtering on the leftmost column
1068 filter_selectively_vert_row2(PLANE_TYPE_UV, buf, stride,
1069 mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l,
1070 lf_info, &lfm->lfl_uv[r << 1]);
1071
1072 buf += 16 * stride;
1073 mask_16x16 >>= 8;
1074 mask_8x8 >>= 8;
1075 mask_4x4 >>= 8;
1076 mask_4x4_int >>= 8;
1077 }
1078 }
1079
1080 // Horizontal pass
1081 buf = dst0;
1082 mask_16x16 = lfm->above_uv[TX_16X16];
1083 mask_8x8 = lfm->above_uv[TX_8X8];
1084 mask_4x4 = lfm->above_uv[TX_4X4];
1085 mask_4x4_int = lfm->int_4x4_uv;
1086
1087 for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r += 2) {
1088 int skip_border_4x4_r = mi_row + r == mi_rows - 1;
1089 unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
1090 unsigned int mask_16x16_r;
1091 unsigned int mask_8x8_r;
1092 unsigned int mask_4x4_r;
1093
1094 if (mi_row + r == 0) {
1095 mask_16x16_r = 0;
1096 mask_8x8_r = 0;
1097 mask_4x4_r = 0;
1098 } else {
1099 mask_16x16_r = mask_16x16 & 0xf;
1100 mask_8x8_r = mask_8x8 & 0xf;
1101 mask_4x4_r = mask_4x4 & 0xf;
1102 }
1103
1104 filter_selectively_horiz(buf, stride, mask_16x16_r, mask_8x8_r,
1105 mask_4x4_r, mask_4x4_int_r, lf_info, &lfm->lfl_uv[r << 1]);
1106
1107 buf += 8 * stride;
1108 mask_16x16 >>= 4;
1109 mask_8x8 >>= 4;
1110 mask_4x4 >>= 4;
1111 mask_4x4_int >>= 4;
1112 }
1113}
1114
1115static void *vp9_loop_filter_rows_work_proc(void *data) {
1116 LoopFilterProgressChart *param = (LoopFilterProgressChart *)data;
1117 int wid = android_atomic_inc(&param->wid);
1118 int sb_row;
1119 int mi_row, mi_col;
1120 int lfm_idx;
1121 uint8_t *buf_start[MAX_MB_PLANE];
1122 uint8_t *buf[MAX_MB_PLANE];
1123 BufferInfo *buf_info = &param->buf_info;
1124
1125 while (!android_atomic_release_load(&param->quit)) {
1126 pthread_mutex_lock(&param->mutex[wid]);
1127 pthread_cond_wait(&param->start_cond[wid], &param->mutex[wid]);
1128 pthread_mutex_unlock(&param->mutex[wid]);
1129
1130 if (android_atomic_release_load(&param->quit)) return NULL;
1131
1132 buf_start[0] = param->buffer_alloc + buf_info->y_offset;
1133 buf_start[1] = param->buffer_alloc + buf_info->u_offset;
1134 buf_start[2] = param->buffer_alloc + buf_info->v_offset;
1135 sb_row = android_atomic_inc(&param->sb_row_pro);
1136 mi_row = (sb_row * MI_BLOCK_SIZE) + param->start;
1137
1138 while (mi_row < param->stop) {
1139 buf[0] = buf_start[0] + (mi_row * buf_info->y_stride << 3);
1140 buf[1] = buf_start[1] + (mi_row * buf_info->uv_stride << 2);
1141 buf[2] = buf_start[2] + (mi_row * buf_info->uv_stride << 2);
1142 lfm_idx = sb_row * ((param->mi_cols + 7) >> 3);
1143 for (mi_col = 0; mi_col < param->mi_cols; mi_col += MI_BLOCK_SIZE) {
1144
1145 while (param->chart[sb_row+1] + 2 > android_atomic_release_load(&param->chart[sb_row])) {
1146 usleep(1);
1147 }
1148
1149 filter_block_plane_y(param->lf_info, param->lfms + lfm_idx,
1150 buf_info->y_stride, buf[0], param->mi_rows,
1151 mi_row);
1152 mi_col += MI_BLOCK_SIZE;
1153 if (mi_col < param->mi_cols) {
1154 lfm_idx++;
1155 buf[0] += MI_BLOCK_SIZE * MI_BLOCK_SIZE;
1156 filter_block_plane_y(param->lf_info, param->lfms + lfm_idx,
1157 buf_info->y_stride, buf[0],
1158 param->mi_rows, mi_row);
1159 }
1160 buf[0] += MI_BLOCK_SIZE * MI_BLOCK_SIZE;
1161 if (param->num_planes > 1) {
1162 lfm_idx--;
1163 filter_block_plane_uv(param->lf_info, param->lfms + lfm_idx,
1164 buf_info->uv_stride, buf[1],
1165 param->mi_rows, mi_row);
1166 filter_block_plane_uv(param->lf_info, param->lfms + lfm_idx,
1167 buf_info->uv_stride, buf[2],
1168 param->mi_rows, mi_row);
1169 if (mi_col < param->mi_cols) {
1170 lfm_idx++;
1171 buf[1] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
1172 buf[2] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
1173 filter_block_plane_uv(param->lf_info,
1174 param->lfms + lfm_idx,
1175 buf_info->uv_stride, buf[1],
1176 param->mi_rows, mi_row);
1177 filter_block_plane_uv(param->lf_info,
1178 param->lfms + lfm_idx,
1179 buf_info->uv_stride, buf[2],
1180 param->mi_rows, mi_row);
1181 }
1182 buf[1] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
1183 buf[2] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
1184 }
1185 lfm_idx++;
1186 android_atomic_inc(&param->chart[sb_row+1]);
1187 }
1188 android_atomic_inc(&param->chart[sb_row+1]);
1189 sb_row = android_atomic_inc(&param->sb_row_pro);
1190 mi_row = (sb_row << 3) + param->start;
1191 }
1192
1193 pthread_mutex_lock(param->hmutex);
1194 if ((--param->doing) == 0)
1195 pthread_cond_signal(param->finish);
1196 pthread_mutex_unlock(param->hmutex);
1197 }
1198
1199 return NULL;
1200}
1201
1202RsdCpuScriptIntrinsicLoopFilter::RsdCpuScriptIntrinsicLoopFilter(
1203 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1204 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
1205 mRootPtr = &kernel;
1206 mWorkerCount = sysconf(_SC_NPROCESSORS_ONLN);
1207 mPrch.quit = 0;
1208 mPrch.wid = 0;
1209 mPrch.sb_row_pro = 0;
1210 mPrch.doing = mWorkerCount;
1211 int size = mWorkerCount * sizeof(pthread_t) +
1212 mWorkerCount * sizeof(pthread_mutex_t) +
1213 mWorkerCount * sizeof(pthread_cond_t) +
1214 sizeof(pthread_mutex_t) + sizeof(pthread_cond_t);
1215 uint8_t *ptr = (uint8_t *)malloc(size);
1216 rsAssert(ptr);
1217 mPrch.tid = (pthread_t *)ptr;
1218 mPrch.mutex = (pthread_mutex_t *) (mPrch.tid + mWorkerCount);
1219 mPrch.start_cond = (pthread_cond_t *) (mPrch.mutex + mWorkerCount);
1220 mPrch.hmutex = (pthread_mutex_t *) (mPrch.start_cond + mWorkerCount);
1221 mPrch.finish = (pthread_cond_t *) (mPrch.hmutex + 1);
1222 int i = 0;
1223 int rv = 0;
1224 pthread_mutex_init(mPrch.hmutex, NULL);
1225 pthread_cond_init(mPrch.finish, NULL);
1226 for (i = 0; i < mWorkerCount; ++i) {
1227 pthread_mutex_init(&mPrch.mutex[i], NULL);
1228 pthread_cond_init(&mPrch.start_cond[i], NULL);
1229 }
1230 for (i = 0; i < mWorkerCount; ++i) {
1231 rv = pthread_create(&mPrch.tid[i], NULL, &vp9_loop_filter_rows_work_proc, &mPrch);
1232 rsAssert(rv == 0);
1233 }
1234}