Blame - cpu_ref/rsCpuIntrinsicLoopFilter.cpp - platform/frameworks/rs

blob: 05ccfd643fdf34a90e46c9a4e4b0a0707b380379 [file] [log] [blame]

Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17
				18	#include "rsCpuIntrinsic.h"
				19	#include "rsCpuIntrinsicInlines.h"
				20	#include <sys/syscall.h>
				21	#include "cutils/atomic.h"
				22
				23	#ifdef RS_COMPATIBILITY_LIB
				24	#include "rsCompatibilityLib.h"
				25	#endif
				26
				27	#ifndef RS_COMPATIBILITY_LIB
				28	#include "hardware/gralloc.h"
				29	#endif
				30
				31
				32	#define INLINE inline
				33
				34	#define MIN(x, y) (((x) < (y)) ? (x) : (y))
				35	#define MAX(x, y) (((x) > (y)) ? (x) : (y))
				36
				37	#define ROUND_POWER_OF_TWO(value, n) \
				38	(((value) + (1 << ((n) - 1))) >> (n))
				39
				40
				41	#define MI_SIZE_LOG2 3
				42	#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6
				43
				44	#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit
				45	#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block
				46
				47	#define MI_MASK (MI_BLOCK_SIZE - 1)
				48
				49	#define SIMD_WIDTH 16
				50	#define MAX_LOOP_FILTER 63
				51	#define MAX_SEGMENTS 8
				52	#define MAX_REF_FRAMES 4
				53	#define MAX_MODE_LF_DELTAS 2
				54	#define MB_MODE_COUNT 14
				55	#define BLOCK_SIZES 13
				56
				57
				58	#if (defined(__GNUC__) && __GNUC__) \|\| defined(__SUNPRO_C)
				59	#define DECLARE_ALIGNED(n,typ,val) typ val __attribute__ ((aligned (n)))
				60	#elif defined(_MSC_VER)
				61	#define DECLARE_ALIGNED(n,typ,val) __declspec(align(n)) typ val
				62	#else
				63	#warning No alignment directives known for this compiler.
				64	#define DECLARE_ALIGNED(n,typ,val) typ val
				65	#endif
				66
				67	// block transform size
				68	typedef enum {
				69	TX_4X4 = 0, // 4x4 transform
				70	TX_8X8 = 1, // 8x8 transform
				71	TX_16X16 = 2, // 16x16 transform
				72	TX_32X32 = 3, // 32x32 transform
				73	TX_SIZES
				74	} TX_SIZE;
				75
				76	typedef enum {
				77	PLANE_TYPE_Y_WITH_DC,
				78	PLANE_TYPE_UV,
				79	} PLANE_TYPE;
				80
				81	// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
				82	// Each 1 bit represents a position in which we want to apply the loop filter.
				83	// Left_ entries refer to whether we apply a filter on the border to the
				84	// left of the block. Above_ entries refer to whether or not to apply a
				85	// filter on the above border. Int_ entries refer to whether or not to
				86	// apply borders on the 4x4 edges within the 8x8 block that each bit
				87	// represents.
				88	// Since each transform is accompanied by a potentially different type of
				89	// loop filter there is a different entry in the array for each transform size.
				90	struct LoopFilterMask {
				91	uint64_t left_y[4];
				92	uint64_t above_y[4];
				93	uint64_t int_4x4_y;
				94	unsigned short left_uv[4];
				95	unsigned short above_uv[4];
				96	unsigned short int_4x4_uv;
				97	unsigned char lfl_y[64];
				98	unsigned char lfl_uv[16];
				99	};
				100
				101	// Need to align this structure so when it is declared and
				102	// passed it can be loaded into vector registers.
				103	struct LoopFilterThresh {
				104	DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
				105	DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
				106	DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
				107	};
				108
				109	struct LoopFilterInfoN {
				110	LoopFilterThresh lfthr[MAX_LOOP_FILTER + 1];
				111	uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
				112	uint8_t mode_lf_lut[MB_MODE_COUNT];
				113	};
				114
				115	struct BufferInfo {
				116	int y_offset;
				117	int u_offset;
				118	int v_offset;
				119	int y_stride;
				120	int uv_stride;
				121	};
				122
				123	#define MAX_CPU_CORES 32
				124	#define MAX_MB_PLANE 3
Matthieu Delahaye	adab849	2014-04-01 16:33:24 -0500	[diff] [blame]	125	#define MAX_SB_ROW 64
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	126
				127	struct LoopFilterProgressChart {
				128	int start;
				129	int stop;
				130	int num_planes;
				131	int mi_rows;
				132	int mi_cols;
				133	BufferInfo buf_info;
				134	uint8_t *buffer_alloc;
				135	LoopFilterInfoN *lf_info;
				136	LoopFilterMask *lfms;
				137
				138	int wid;
				139	int quit;
				140	int doing;
Matthieu Delahaye	adab849	2014-04-01 16:33:24 -0500	[diff] [blame]	141	volatile int32_t chart[MAX_SB_ROW];
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	142	int32_t sb_row_pro;
				143	pthread_t *tid;
				144	pthread_mutex_t *mutex;
				145	pthread_cond_t *start_cond;
				146	pthread_mutex_t *hmutex;
				147	pthread_cond_t *finish;
				148	};
				149
				150	using namespace android;
				151	using namespace android::renderscript;
				152
				153	namespace android {
				154	namespace renderscript {
				155
				156
				157	class RsdCpuScriptIntrinsicLoopFilter : public RsdCpuScriptIntrinsic {
				158	private:
				159	LoopFilterProgressChart mPrch;
				160	int mWorkerCount;
				161
				162	public:
				163	virtual void populateScript(Script *);
				164	virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
				165	virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
				166
				167	virtual ~RsdCpuScriptIntrinsicLoopFilter();
				168	RsdCpuScriptIntrinsicLoopFilter(RsdCpuReferenceImpl ctx, const Script s,
				169	const Element *e);
				170
				171	protected:
				172	ObjectBaseRef<Allocation> mLfInfo;
				173	ObjectBaseRef<Allocation> mLfMasks;
				174	ObjectBaseRef<Allocation> mFrameBuffer;
				175
				176	void doLoopFilter();
Chris Wailes	80ef693	2014-07-08 11:22:18 -0700	[diff] [blame]	177	static void kernel(const RsExpandKernelParams *p,
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	178	uint32_t xstart, uint32_t xend,
Chris Wailes	9ed7910	2014-07-25 15:53:28 -0700	[diff] [blame]	179	uint32_t outstep);
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	180	};
				181
				182	}
				183	}
				184
Chris Wailes	80ef693	2014-07-08 11:22:18 -0700	[diff] [blame]	185	void RsdCpuScriptIntrinsicLoopFilter::kernel(const RsExpandKernelParams *p,
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	186	uint32_t xstart, uint32_t xend,
Chris Wailes	9ed7910	2014-07-25 15:53:28 -0700	[diff] [blame]	187	uint32_t outstep) {
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	188	RsdCpuScriptIntrinsicLoopFilter cp = (RsdCpuScriptIntrinsicLoopFilter)p->usr;
				189	memset((void*)&cp->mPrch.chart, 0, sizeof(cp->mPrch.chart));
				190	cp->mPrch.chart[0] = 0x0fffffff;
				191	cp->mPrch.sb_row_pro = 0;
				192	cp->mPrch.doing = cp->mWorkerCount;
				193
				194	int i = 0;
				195	for (i = 0; i < cp->mWorkerCount; ++i) {
				196	pthread_cond_signal(&cp->mPrch.start_cond[i]);
				197	}
				198	pthread_mutex_lock(cp->mPrch.hmutex);
				199	if (cp->mPrch.doing) {
				200	pthread_cond_wait(cp->mPrch.finish, cp->mPrch.hmutex);
				201	}
				202	pthread_mutex_unlock(cp->mPrch.hmutex);
				203	}
				204
				205
				206	void RsdCpuScriptIntrinsicLoopFilter::setGlobalVar(uint32_t slot,
				207	const void *data,
				208	size_t dataLength) {
				209	rsAssert(slot >= 0 && slot < 2);
				210	const int dptr = (const int )data;
				211	switch (slot) {
				212	case 0:
				213	rsAssert(dataLength == sizeof(int) * 5);
				214	mPrch.start = dptr[0];
				215	mPrch.stop = dptr[1];
				216	mPrch.num_planes = dptr[2];
				217	mPrch.mi_rows = dptr[3];
				218	mPrch.mi_cols = dptr[4];
				219	break;
				220	case 1:
				221	rsAssert(dataLength == sizeof(BufferInfo));
				222	mPrch.buf_info = ((BufferInfo)data);
				223	break;
				224	default:
				225	ALOGE("Non-exist global value slot: %d", slot);
				226	rsAssert(0);
				227	}
				228	}
				229
				230	void RsdCpuScriptIntrinsicLoopFilter::setGlobalObj(uint32_t slot, ObjectBase *data) {
				231	rsAssert(slot > 1 && slot < 5);
				232	if (slot == 2) {
				233	mLfInfo.set(static_cast<Allocation *>(data));
				234	mPrch.lf_info = (LoopFilterInfoN *)mLfInfo->mHal.state.userProvidedPtr;
				235	} else if (slot == 3) {
				236	mLfMasks.set(static_cast<Allocation *>(data));
				237	mPrch.lfms = (LoopFilterMask *)mLfMasks->mHal.state.userProvidedPtr;
				238	} else {
				239	mFrameBuffer.set(static_cast<Allocation *>(data));
				240	mPrch.buffer_alloc = (uint8_t *)mFrameBuffer->mHal.state.userProvidedPtr;
				241	}
				242	}
				243
				244	RsdCpuScriptIntrinsicLoopFilter::~RsdCpuScriptIntrinsicLoopFilter() {
				245	android_atomic_inc(&mPrch.quit);
				246	int i = 0;
				247	for (i = 0; i < mWorkerCount; ++i) {
				248	pthread_cond_signal(&mPrch.start_cond[i]);
				249	}
				250	for (i = 0; i < mWorkerCount; ++i) {
				251	pthread_join(mPrch.tid[i], NULL);
				252	}
				253	free(mPrch.tid);
				254	}
				255
				256	void RsdCpuScriptIntrinsicLoopFilter::populateScript(Script *s) {
				257	s->mHal.info.exportedVariableCount = 9;
				258	s->mHal.info.exportedFunctionCount = 1;
				259	}
				260
				261	RsdCpuScriptImpl * rsdIntrinsic_LoopFilter(RsdCpuReferenceImpl *ctx,
				262	const Script s, const Element e) {
				263	return new RsdCpuScriptIntrinsicLoopFilter(ctx, s, e);
				264	}
				265
				266	extern "C" void vp9_lpf_vertical_16_c(uint8_t *s, int pitch,
				267	const uint8_t *blimit,
				268	const uint8_t *limit,
				269	const uint8_t *thresh);
				270	extern "C" void vp9_lpf_vertical_16_neon(uint8_t *s, int pitch,
				271	const uint8_t *blimit,
				272	const uint8_t *limit,
				273	const uint8_t *thresh);
				274	extern "C" void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch,
				275	const uint8_t *blimit,
				276	const uint8_t *limit,
				277	const uint8_t *thresh);
				278	extern "C" void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int pitch,
				279	const uint8_t *blimit,
				280	const uint8_t *limit,
				281	const uint8_t *thresh);
				282	extern "C" void vp9_lpf_vertical_8_c(uint8_t *s, int pitch,
				283	const uint8_t *blimit,
				284	const uint8_t *limit,
				285	const uint8_t *thresh,
				286	int count);
				287	extern "C" void vp9_lpf_vertical_8_neon(uint8_t *s, int pitch,
				288	const uint8_t *blimit,
				289	const uint8_t *limit,
				290	const uint8_t *thresh, int count);
				291	extern "C" void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch,
				292	const uint8_t *blimit0,
				293	const uint8_t *limit0,
				294	const uint8_t *thresh0,
				295	const uint8_t *blimit1,
				296	const uint8_t *limit1,
				297	const uint8_t *thresh1);
				298	extern "C" void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int pitch,
				299	const uint8_t *blimit0,
				300	const uint8_t *limit0,
				301	const uint8_t *thresh0,
				302	const uint8_t *blimit1,
				303	const uint8_t *limit1,
				304	const uint8_t *thresh1);
				305	extern "C" void vp9_lpf_vertical_4_c(uint8_t s, int pitch, const uint8_t blimit,
				306	const uint8_t limit, const uint8_t thresh,
				307	int count);
				308	extern "C" void vp9_lpf_vertical_4_neon(uint8_t *s, int pitch,
				309	const uint8_t *blimit,
				310	const uint8_t *limit,
				311	const uint8_t *thresh, int count);
				312	extern "C" void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch,
				313	const uint8_t *blimit0,
				314	const uint8_t *limit0,
				315	const uint8_t *thresh0,
				316	const uint8_t *blimit1,
				317	const uint8_t *limit1,
				318	const uint8_t *thresh1);
				319	extern "C" void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int pitch,
				320	const uint8_t *blimit0,
				321	const uint8_t *limit0,
				322	const uint8_t *thresh0,
				323	const uint8_t *blimit1,
				324	const uint8_t *limit1,
				325	const uint8_t *thresh1);
				326	extern "C" void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch,
				327	const uint8_t *blimit,
				328	const uint8_t *limit,
				329	const uint8_t *thresh, int count);
				330	extern "C" void vp9_lpf_horizontal_16_neon(uint8_t *s, int pitch,
				331	const uint8_t *blimit,
				332	const uint8_t *limit,
				333	const uint8_t *thresh, int count);
				334	extern "C" void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch,
				335	const uint8_t *blimit,
				336	const uint8_t *limit,
				337	const uint8_t *thresh, int count);
				338	extern "C" void vp9_lpf_horizontal_8_neon(uint8_t *s, int pitch,
				339	const uint8_t *blimit,
				340	const uint8_t *limit,
				341	const uint8_t *thresh, int count);
				342	extern "C" void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch,
				343	const uint8_t *blimit0,
				344	const uint8_t *limit0,
				345	const uint8_t *thresh0,
				346	const uint8_t *blimit1,
				347	const uint8_t *limit1,
				348	const uint8_t *thresh1);
				349	extern "C" void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch,
				350	const uint8_t *blimit0,
				351	const uint8_t *limit0,
				352	const uint8_t *thresh0,
				353	const uint8_t *blimit1,
				354	const uint8_t *limit1,
				355	const uint8_t *thresh1);
				356	extern "C" void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch,
				357	const uint8_t *blimit,
				358	const uint8_t *limit,
				359	const uint8_t *thresh, int count);
				360	extern "C" void vp9_lpf_horizontal_4_neon(uint8_t *s, int pitch,
				361	const uint8_t *blimit,
				362	const uint8_t *limit,
				363	const uint8_t *thresh, int count);
				364	extern "C" void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch,
				365	const uint8_t *blimit0,
				366	const uint8_t *limit0,
				367	const uint8_t *thresh0,
				368	const uint8_t *blimit1,
				369	const uint8_t *limit1,
				370	const uint8_t *thresh1);
				371	extern "C" void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch,
				372	const uint8_t *blimit0,
				373	const uint8_t *limit0,
				374	const uint8_t *thresh0,
				375	const uint8_t *blimit1,
				376	const uint8_t *limit1,
				377	const uint8_t *thresh1);
				378
				379
Jason Sams	074424a	2014-05-22 13:30:03 -0700	[diff] [blame]	380	// remove ARM64 statement when ARM64 asm available
				381	#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	382
				383	#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_neon
				384	#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_neon
				385	#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_neon
				386	#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_neon
				387	#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_neon
				388	#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_neon
				389	#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_neon
				390	#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_neon
				391	#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_neon
				392	#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_neon
				393	#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_neon
				394
				395	void vp9_lpf_horizontal_8_dual_neon(uint8_t s, int p / pitch */,
				396	const uint8_t *blimit0,
				397	const uint8_t *limit0,
				398	const uint8_t *thresh0,
				399	const uint8_t *blimit1,
				400	const uint8_t *limit1,
				401	const uint8_t *thresh1) {
				402	vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1);
				403	vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1);
				404	}
				405
				406	void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
				407	const uint8_t *blimit0,
				408	const uint8_t *limit0,
				409	const uint8_t *thresh0,
				410	const uint8_t *blimit1,
				411	const uint8_t *limit1,
				412	const uint8_t *thresh1) {
				413	vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
				414	vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
				415	}
				416
				417	void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
				418	const uint8_t *blimit0,
				419	const uint8_t *limit0,
				420	const uint8_t *thresh0,
				421	const uint8_t *blimit1,
				422	const uint8_t *limit1,
				423	const uint8_t *thresh1) {
				424	vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
				425	vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
				426	}
				427
				428	void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
				429	const uint8_t *blimit,
				430	const uint8_t *limit,
				431	const uint8_t *thresh) {
				432	vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
				433	vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
				434	}
				435
				436	#else
				437
				438	#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_c
				439	#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_c
				440	#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_c
				441	#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_c
				442	#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_c
				443	#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_c
				444	#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_c
				445	#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_c
				446	#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_c
				447	#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c
				448	#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c
				449
Jason Sams	074424a	2014-05-22 13:30:03 -0700	[diff] [blame]	450	#endif // ARCH_ARM_USE_INTRINSICS && !ARCH_ARM64_USE_INTRINSICS
Matthieu Delahaye	6fc3e12	2014-03-04 11:05:49 -0600	[diff] [blame]	451
				452
				453
				454
				455	static INLINE int8_t signed_char_clamp(int t) {
				456	return (int8_t)clamp(t, -128, 127);
				457	}
				458
				459	// should we apply any filter at all: 11111111 yes, 00000000 no
				460	static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
				461	uint8_t p3, uint8_t p2,
				462	uint8_t p1, uint8_t p0,
				463	uint8_t q0, uint8_t q1,
				464	uint8_t q2, uint8_t q3) {
				465	int8_t mask = 0;
				466	mask \|= (abs(p3 - p2) > limit) * -1;
				467	mask \|= (abs(p2 - p1) > limit) * -1;
				468	mask \|= (abs(p1 - p0) > limit) * -1;
				469	mask \|= (abs(q1 - q0) > limit) * -1;
				470	mask \|= (abs(q2 - q1) > limit) * -1;
				471	mask \|= (abs(q3 - q2) > limit) * -1;
				472	mask \|= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
				473	return ~mask;
				474	}
				475
				476	static INLINE int8_t flat_mask4(uint8_t thresh,
				477	uint8_t p3, uint8_t p2,
				478	uint8_t p1, uint8_t p0,
				479	uint8_t q0, uint8_t q1,
				480	uint8_t q2, uint8_t q3) {
				481	int8_t mask = 0;
				482	mask \|= (abs(p1 - p0) > thresh) * -1;
				483	mask \|= (abs(q1 - q0) > thresh) * -1;
				484	mask \|= (abs(p2 - p0) > thresh) * -1;
				485	mask \|= (abs(q2 - q0) > thresh) * -1;
				486	mask \|= (abs(p3 - p0) > thresh) * -1;
				487	mask \|= (abs(q3 - q0) > thresh) * -1;
				488	return ~mask;
				489	}
				490
				491	static INLINE int8_t flat_mask5(uint8_t thresh,
				492	uint8_t p4, uint8_t p3,
				493	uint8_t p2, uint8_t p1,
				494	uint8_t p0, uint8_t q0,
				495	uint8_t q1, uint8_t q2,
				496	uint8_t q3, uint8_t q4) {
				497	int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
				498	mask \|= (abs(p4 - p0) > thresh) * -1;
				499	mask \|= (abs(q4 - q0) > thresh) * -1;
				500	return ~mask;
				501	}
				502
				503	// is there high edge variance internal edge: 11111111 yes, 00000000 no
				504	static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
				505	uint8_t q0, uint8_t q1) {
				506	int8_t hev = 0;
				507	hev \|= (abs(p1 - p0) > thresh) * -1;
				508	hev \|= (abs(q1 - q0) > thresh) * -1;
				509	return hev;
				510	}
				511
				512	static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
				513	uint8_t op0, uint8_t oq0, uint8_t *oq1) {
				514	int8_t filter1, filter2;
				515
				516	const int8_t ps1 = (int8_t) *op1 ^ 0x80;
				517	const int8_t ps0 = (int8_t) *op0 ^ 0x80;
				518	const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
				519	const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
				520	const uint8_t hev = hev_mask(thresh, op1, op0, oq0, oq1);
				521
				522	// add outer taps if we have high edge variance
				523	int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
				524
				525	// inner taps
				526	filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
				527
				528	// save bottom 3 bits so that we round one side +4 and the other +3
				529	// if it equals 4 we'll set to adjust by -1 to account for the fact
				530	// we'd round 3 the other way
				531	filter1 = signed_char_clamp(filter + 4) >> 3;
				532	filter2 = signed_char_clamp(filter + 3) >> 3;
				533
				534	*oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
				535	*op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
				536
				537	// outer tap adjustments
				538	filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
				539
				540	*oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
				541	*op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
				542	}
				543
				544	void vp9_lpf_horizontal_4_c(uint8_t s, int p / pitch */,
				545	const uint8_t blimit, const uint8_t limit,
				546	const uint8_t *thresh, int count) {
				547	int i;
				548
				549	// loop filter designed to work using chars so that we can make maximum use
				550	// of 8 bit simd instructions.
				551	for (i = 0; i < 8 * count; ++i) {
				552	const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
				553	const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
				554	const int8_t mask = filter_mask(limit, blimit,
				555	p3, p2, p1, p0, q0, q1, q2, q3);
				556	filter4(mask, thresh, s - 2 p, s - 1 * p, s, s + 1 * p);
				557	++s;
				558	}
				559	}
				560
				561	void vp9_lpf_horizontal_4_dual_c(uint8_t s, int p, const uint8_t blimit0,
				562	const uint8_t limit0, const uint8_t thresh0,
				563	const uint8_t blimit1, const uint8_t limit1,
				564	const uint8_t *thresh1) {
				565	vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
				566	vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
				567	}
				568
				569	void vp9_lpf_vertical_4_c(uint8_t s, int pitch, const uint8_t blimit,
				570	const uint8_t limit, const uint8_t thresh,
				571	int count) {
				572	int i;
				573
				574	// loop filter designed to work using chars so that we can make maximum use
				575	// of 8 bit simd instructions.
				576	for (i = 0; i < 8 * count; ++i) {
				577	const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
				578	const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
				579	const int8_t mask = filter_mask(limit, blimit,
				580	p3, p2, p1, p0, q0, q1, q2, q3);
				581	filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
				582	s += pitch;
				583	}
				584	}
				585
				586	void vp9_lpf_vertical_4_dual_c(uint8_t s, int pitch, const uint8_t blimit0,
				587	const uint8_t limit0, const uint8_t thresh0,
				588	const uint8_t blimit1, const uint8_t limit1,
				589	const uint8_t *thresh1) {
				590	vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
				591	vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, 1);
				592	}
				593
				594	static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
				595	uint8_t op3, uint8_t op2,
				596	uint8_t op1, uint8_t op0,
				597	uint8_t oq0, uint8_t oq1,
				598	uint8_t oq2, uint8_t oq3) {
				599	if (flat && mask) {
				600	const uint8_t p3 = op3, p2 = op2, p1 = op1, p0 = op0;
				601	const uint8_t q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3;
				602
				603	// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
				604	op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 p2 + p1 + p0 + q0, 3);
				605	op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 p1 + p0 + q0 + q1, 3);
				606	op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 p0 + q0 + q1 + q2, 3);
				607	oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 q0 + q1 + q2 + q3, 3);
				608	oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 q1 + q2 + q3 + q3, 3);
				609	oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 q2 + q3 + q3 + q3, 3);
				610	} else {
				611	filter4(mask, thresh, op1, op0, oq0, oq1);
				612	}
				613	}
				614
				615	void vp9_lpf_horizontal_8_c(uint8_t s, int p, const uint8_t blimit,
				616	const uint8_t limit, const uint8_t thresh,
				617	int count) {
				618	int i;
				619
				620	// loop filter designed to work using chars so that we can make maximum use
				621	// of 8 bit simd instructions.
				622	for (i = 0; i < 8 * count; ++i) {
				623	const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
				624	const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
				625
				626	const int8_t mask = filter_mask(limit, blimit,
				627	p3, p2, p1, p0, q0, q1, q2, q3);
				628	const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
				629	filter8(mask, thresh, flat, s - 4 p, s - 3 * p, s - 2 * p, s - 1 * p,
				630	s, s + 1 * p, s + 2 * p, s + 3 * p);
				631	++s;
				632	}
				633	}
				634
				635	void vp9_lpf_horizontal_8_dual_c(uint8_t s, int p, const uint8_t blimit0,
				636	const uint8_t limit0, const uint8_t thresh0,
				637	const uint8_t blimit1, const uint8_t limit1,
				638	const uint8_t *thresh1) {
				639	vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
				640	vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
				641	}
				642
				643	void vp9_lpf_vertical_8_c(uint8_t s, int pitch, const uint8_t blimit,
				644	const uint8_t limit, const uint8_t thresh,
				645	int count) {
				646	int i;
				647
				648	for (i = 0; i < 8 * count; ++i) {
				649	const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
				650	const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
				651	const int8_t mask = filter_mask(limit, blimit,
				652	p3, p2, p1, p0, q0, q1, q2, q3);
				653	const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
				654	filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
				655	s, s + 1, s + 2, s + 3);
				656	s += pitch;
				657	}
				658	}
				659
				660	void vp9_lpf_vertical_8_dual_c(uint8_t s, int pitch, const uint8_t blimit0,
				661	const uint8_t limit0, const uint8_t thresh0,
				662	const uint8_t blimit1, const uint8_t limit1,
				663	const uint8_t *thresh1) {
				664	vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
				665	vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, 1);
				666	}
				667
				668	static INLINE void filter16(int8_t mask, uint8_t thresh,
				669	uint8_t flat, uint8_t flat2,
				670	uint8_t op7, uint8_t op6,
				671	uint8_t op5, uint8_t op4,
				672	uint8_t op3, uint8_t op2,
				673	uint8_t op1, uint8_t op0,
				674	uint8_t oq0, uint8_t oq1,
				675	uint8_t oq2, uint8_t oq3,
				676	uint8_t oq4, uint8_t oq5,
				677	uint8_t oq6, uint8_t oq7) {
				678	if (flat2 && flat && mask) {
				679	const uint8_t p7 = op7, p6 = op6, p5 = op5, p4 = op4,
				680	p3 = op3, p2 = op2, p1 = op1, p0 = op0;
				681
				682	const uint8_t q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3,
				683	q4 = oq4, q5 = oq5, q6 = oq6, q7 = oq7;
				684
				685	// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
				686	op6 = ROUND_POWER_OF_TWO(p7 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
				687	q0, 4);
				688	op5 = ROUND_POWER_OF_TWO(p7 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
				689	q0 + q1, 4);
				690	op4 = ROUND_POWER_OF_TWO(p7 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
				691	q0 + q1 + q2, 4);
				692	op3 = ROUND_POWER_OF_TWO(p7 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
				693	q0 + q1 + q2 + q3, 4);
				694	op2 = ROUND_POWER_OF_TWO(p7 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
				695	q0 + q1 + q2 + q3 + q4, 4);
				696	op1 = ROUND_POWER_OF_TWO(p7 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
				697	q0 + q1 + q2 + q3 + q4 + q5, 4);
				698	op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 2 +
				699	q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
				700	*oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
				701	q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
				702	*oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
				703	q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
				704	*oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
				705	q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
				706	*oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
				707	q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
				708	*oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
				709	q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
				710	*oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
				711	q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
				712	*oq6 = ROUND_POWER_OF_TWO(p0 +
				713	q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
				714	} else {
				715	filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
				716	}
				717	}
				718
				719	void vp9_lpf_horizontal_16_c(uint8_t s, int p, const uint8_t blimit,
				720	const uint8_t limit, const uint8_t thresh,
				721	int count) {
				722	int i;
				723
				724	// loop filter designed to work using chars so that we can make maximum use
				725	// of 8 bit simd instructions.
				726	for (i = 0; i < 8 * count; ++i) {
				727	const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
				728	const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
				729	const int8_t mask = filter_mask(limit, blimit,
				730	p3, p2, p1, p0, q0, q1, q2, q3);
				731	const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
				732	const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
				733	q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
				734
				735	filter16(mask, *thresh, flat, flat2,
				736	s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
				737	s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
				738	s, s + 1 * p, s + 2 * p, s + 3 * p,
				739	s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
				740	++s;
				741	}
				742	}
				743
				744	static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
				745	const uint8_t *blimit,
				746	const uint8_t *limit,
				747	const uint8_t *thresh,
				748	int count) {
				749	int i;
				750
				751	for (i = 0; i < count; ++i) {
				752	const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
				753	const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
				754	const int8_t mask = filter_mask(limit, blimit,
				755	p3, p2, p1, p0, q0, q1, q2, q3);
				756	const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
				757	const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
				758	q0, s[4], s[5], s[6], s[7]);
				759
				760	filter16(mask, *thresh, flat, flat2,
				761	s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
				762	s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
				763	s += p;
				764	}
				765	}
				766
				767	void vp9_lpf_vertical_16_c(uint8_t s, int p, const uint8_t blimit,
				768	const uint8_t limit, const uint8_t thresh) {
				769	mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
				770	}
				771
				772	void vp9_lpf_vertical_16_dual_c(uint8_t s, int p, const uint8_t blimit,
				773	const uint8_t limit, const uint8_t thresh) {
				774	mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
				775	}
				776
				777
				778	static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
				779	uint8_t *s, int pitch,
				780	unsigned int mask_16x16_l,
				781	unsigned int mask_8x8_l,
				782	unsigned int mask_4x4_l,
				783	unsigned int mask_4x4_int_l,
				784	const LoopFilterInfoN *lfi_n,
				785	const uint8_t *lfl) {
				786	const int mask_shift = plane_type ? 4 : 8;
				787	const int mask_cutoff = plane_type ? 0xf : 0xff;
				788	const int lfl_forward = plane_type ? 4 : 8;
				789
				790	unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
				791	unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
				792	unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
				793	unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
				794	unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
				795	unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
				796	unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
				797	unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
				798	unsigned int mask;
				799
				800	for (mask = mask_16x16_0 \| mask_8x8_0 \| mask_4x4_0 \| mask_4x4_int_0 \|
				801	mask_16x16_1 \| mask_8x8_1 \| mask_4x4_1 \| mask_4x4_int_1;
				802	mask; mask >>= 1) {
				803	const LoopFilterThresh lfi0 = lfi_n->lfthr + lfl;
				804	const LoopFilterThresh lfi1 = lfi_n->lfthr + (lfl + lfl_forward);
				805
				806	// TODO(yunqingwang): count in loopfilter functions should be removed.
				807	if (mask & 1) {
				808	if ((mask_16x16_0 \| mask_16x16_1) & 1) {
				809	if ((mask_16x16_0 & mask_16x16_1) & 1) {
				810	vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
				811	lfi0->hev_thr);
				812	} else if (mask_16x16_0 & 1) {
				813	vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
				814	lfi0->hev_thr);
				815	} else {
				816	vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
				817	lfi1->lim, lfi1->hev_thr);
				818	}
				819	}
				820
				821	if ((mask_8x8_0 \| mask_8x8_1) & 1) {
				822	if ((mask_8x8_0 & mask_8x8_1) & 1) {
				823	vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
				824	lfi0->hev_thr, lfi1->mblim, lfi1->lim,
				825	lfi1->hev_thr);
				826	} else if (mask_8x8_0 & 1) {
				827	vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
				828	lfi0->hev_thr, 1);
				829	} else {
				830	vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
				831	lfi1->hev_thr, 1);
				832	}
				833	}
				834
				835	if ((mask_4x4_0 \| mask_4x4_1) & 1) {
				836	if ((mask_4x4_0 & mask_4x4_1) & 1) {
				837	vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
				838	lfi0->hev_thr, lfi1->mblim, lfi1->lim,
				839	lfi1->hev_thr);
				840	} else if (mask_4x4_0 & 1) {
				841	vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
				842	lfi0->hev_thr, 1);
				843	} else {
				844	vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
				845	lfi1->hev_thr, 1);
				846	}
				847	}
				848
				849	if ((mask_4x4_int_0 \| mask_4x4_int_1) & 1) {
				850	if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
				851	vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
				852	lfi0->hev_thr, lfi1->mblim, lfi1->lim,
				853	lfi1->hev_thr);
				854	} else if (mask_4x4_int_0 & 1) {
				855	vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
				856	lfi0->hev_thr, 1);
				857	} else {
				858	vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
				859	lfi1->lim, lfi1->hev_thr, 1);
				860	}
				861	}
				862	}
				863
				864	s += 8;
				865	lfl += 1;
				866	mask_16x16_0 >>= 1;
				867	mask_8x8_0 >>= 1;
				868	mask_4x4_0 >>= 1;
				869	mask_4x4_int_0 >>= 1;
				870	mask_16x16_1 >>= 1;
				871	mask_8x8_1 >>= 1;
				872	mask_4x4_1 >>= 1;
				873	mask_4x4_int_1 >>= 1;
				874	}
				875	}
				876
				877	static void filter_selectively_horiz(uint8_t *s, int pitch,
				878	unsigned int mask_16x16,
				879	unsigned int mask_8x8,
				880	unsigned int mask_4x4,
				881	unsigned int mask_4x4_int,
				882	const LoopFilterInfoN *lfi_n,
				883	const uint8_t *lfl) {
				884	unsigned int mask;
				885	int count;
				886
				887	for (mask = mask_16x16 \| mask_8x8 \| mask_4x4 \| mask_4x4_int;
				888	mask; mask >>= count) {
				889	const LoopFilterThresh lfi = lfi_n->lfthr + lfl;
				890
				891	count = 1;
				892	if (mask & 1) {
				893	if (mask_16x16 & 1) {
				894	if ((mask_16x16 & 3) == 3) {
				895	vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
				896	lfi->hev_thr, 2);
				897	count = 2;
				898	} else {
				899	vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
				900	lfi->hev_thr, 1);
				901	}
				902	} else if (mask_8x8 & 1) {
				903	if ((mask_8x8 & 3) == 3) {
				904	// Next block's thresholds
				905	const LoopFilterThresh lfin = lfi_n->lfthr + (lfl + 1);
				906
				907	vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
				908	lfi->hev_thr, lfin->mblim, lfin->lim,
				909	lfin->hev_thr);
				910
				911	if ((mask_4x4_int & 3) == 3) {
				912	vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
				913	lfi->lim, lfi->hev_thr, lfin->mblim,
				914	lfin->lim, lfin->hev_thr);
				915	} else {
				916	if (mask_4x4_int & 1)
				917	vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
				918	lfi->lim, lfi->hev_thr, 1);
				919	else if (mask_4x4_int & 2)
				920	vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
				921	lfin->lim, lfin->hev_thr, 1);
				922	}
				923	count = 2;
				924	} else {
				925	vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
				926
				927	if (mask_4x4_int & 1)
				928	vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
				929	lfi->lim, lfi->hev_thr, 1);
				930	}
				931	} else if (mask_4x4 & 1) {
				932	if ((mask_4x4 & 3) == 3) {
				933	// Next block's thresholds
				934	const LoopFilterThresh lfin = lfi_n->lfthr + (lfl + 1);
				935
				936	vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
				937	lfi->hev_thr, lfin->mblim, lfin->lim,
				938	lfin->hev_thr);
				939	if ((mask_4x4_int & 3) == 3) {
				940	vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
				941	lfi->lim, lfi->hev_thr, lfin->mblim,
				942	lfin->lim, lfin->hev_thr);
				943	} else {
				944	if (mask_4x4_int & 1)
				945	vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
				946	lfi->lim, lfi->hev_thr, 1);
				947	else if (mask_4x4_int & 2)
				948	vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
				949	lfin->lim, lfin->hev_thr, 1);
				950	}
				951	count = 2;
				952	} else {
				953	vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
				954
				955	if (mask_4x4_int & 1)
				956	vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
				957	lfi->hev_thr, 1);
				958	}
				959	} else if (mask_4x4_int & 1) {
				960	vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
				961	lfi->hev_thr, 1);
				962	}
				963	}
				964	s += 8 * count;
				965	lfl += count;
				966	mask_16x16 >>= count;
				967	mask_8x8 >>= count;
				968	mask_4x4 >>= count;
				969	mask_4x4_int >>= count;
				970	}
				971	}
				972
				973	static void filter_block_plane_y(LoopFilterInfoN *lf_info,
				974	LoopFilterMask *lfm,
				975	int stride,
				976	uint8_t *buf,
				977	int mi_rows,
				978	int mi_row) {
				979	uint8_t* dst0 = buf;
				980	int r; //, c;
				981
				982	uint64_t mask_16x16 = lfm->left_y[TX_16X16];
				983	uint64_t mask_8x8 = lfm->left_y[TX_8X8];
				984	uint64_t mask_4x4 = lfm->left_y[TX_4X4];
				985	uint64_t mask_4x4_int = lfm->int_4x4_y;
				986
				987	// Vertical pass: do 2 rows at one time
				988	for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r += 2) {
				989	unsigned int mask_16x16_l = mask_16x16 & 0xffff;
				990	unsigned int mask_8x8_l = mask_8x8 & 0xffff;
				991	unsigned int mask_4x4_l = mask_4x4 & 0xffff;
				992	unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
				993
				994	// Disable filtering on the leftmost column
				995	filter_selectively_vert_row2(PLANE_TYPE_Y_WITH_DC, buf, stride,
				996	mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, lf_info,
				997	&lfm->lfl_y[r << 3]);
				998
				999	buf += 16 * stride;
				1000	mask_16x16 >>= 16;
				1001	mask_8x8 >>= 16;
				1002	mask_4x4 >>= 16;
				1003	mask_4x4_int >>= 16;
				1004	}
				1005
				1006	// Horizontal pass
				1007	buf = dst0;
				1008	mask_16x16 = lfm->above_y[TX_16X16];
				1009	mask_8x8 = lfm->above_y[TX_8X8];
				1010	mask_4x4 = lfm->above_y[TX_4X4];
				1011	mask_4x4_int = lfm->int_4x4_y;
				1012
				1013	for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r++) {
				1014	unsigned int mask_16x16_r;
				1015	unsigned int mask_8x8_r;
				1016	unsigned int mask_4x4_r;
				1017
				1018	if (mi_row + r == 0) {
				1019	mask_16x16_r = 0;
				1020	mask_8x8_r = 0;
				1021	mask_4x4_r = 0;
				1022	} else {
				1023	mask_16x16_r = mask_16x16 & 0xff;
				1024	mask_8x8_r = mask_8x8 & 0xff;
				1025	mask_4x4_r = mask_4x4 & 0xff;
				1026	}
				1027
				1028	filter_selectively_horiz(buf, stride, mask_16x16_r, mask_8x8_r,
				1029	mask_4x4_r, mask_4x4_int & 0xff, lf_info, &lfm->lfl_y[r << 3]);
				1030
				1031	buf += 8 * stride;
				1032	mask_16x16 >>= 8;
				1033	mask_8x8 >>= 8;
				1034	mask_4x4 >>= 8;
				1035	mask_4x4_int >>= 8;
				1036	}
				1037	}
				1038
				1039	static void filter_block_plane_uv(LoopFilterInfoN *lf_info,
				1040	LoopFilterMask *lfm,
				1041	int stride,
				1042	uint8_t *buf,
				1043	int mi_rows,
				1044	int mi_row) {
				1045	uint8_t* dst0 = buf;
				1046	int r, c;
				1047
				1048	uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
				1049	uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
				1050	uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
				1051	uint16_t mask_4x4_int = lfm->int_4x4_uv;
				1052
				1053	// Vertical pass: do 2 rows at one time
				1054	for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r += 4) {
				1055
				1056	for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
				1057	lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
				1058	lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
				1059	}
				1060
				1061	{
				1062	unsigned int mask_16x16_l = mask_16x16 & 0xff;
				1063	unsigned int mask_8x8_l = mask_8x8 & 0xff;
				1064	unsigned int mask_4x4_l = mask_4x4 & 0xff;
				1065	unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
				1066
				1067	// Disable filtering on the leftmost column
				1068	filter_selectively_vert_row2(PLANE_TYPE_UV, buf, stride,
				1069	mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l,
				1070	lf_info, &lfm->lfl_uv[r << 1]);
				1071
				1072	buf += 16 * stride;
				1073	mask_16x16 >>= 8;
				1074	mask_8x8 >>= 8;
				1075	mask_4x4 >>= 8;
				1076	mask_4x4_int >>= 8;
				1077	}
				1078	}
				1079
				1080	// Horizontal pass
				1081	buf = dst0;
				1082	mask_16x16 = lfm->above_uv[TX_16X16];
				1083	mask_8x8 = lfm->above_uv[TX_8X8];
				1084	mask_4x4 = lfm->above_uv[TX_4X4];
				1085	mask_4x4_int = lfm->int_4x4_uv;
				1086
				1087	for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < mi_rows; r += 2) {
				1088	int skip_border_4x4_r = mi_row + r == mi_rows - 1;
				1089	unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
				1090	unsigned int mask_16x16_r;
				1091	unsigned int mask_8x8_r;
				1092	unsigned int mask_4x4_r;
				1093
				1094	if (mi_row + r == 0) {
				1095	mask_16x16_r = 0;
				1096	mask_8x8_r = 0;
				1097	mask_4x4_r = 0;
				1098	} else {
				1099	mask_16x16_r = mask_16x16 & 0xf;
				1100	mask_8x8_r = mask_8x8 & 0xf;
				1101	mask_4x4_r = mask_4x4 & 0xf;
				1102	}
				1103
				1104	filter_selectively_horiz(buf, stride, mask_16x16_r, mask_8x8_r,
				1105	mask_4x4_r, mask_4x4_int_r, lf_info, &lfm->lfl_uv[r << 1]);
				1106
				1107	buf += 8 * stride;
				1108	mask_16x16 >>= 4;
				1109	mask_8x8 >>= 4;
				1110	mask_4x4 >>= 4;
				1111	mask_4x4_int >>= 4;
				1112	}
				1113	}
				1114
				1115	static void vp9_loop_filter_rows_work_proc(void data) {
				1116	LoopFilterProgressChart param = (LoopFilterProgressChart )data;
				1117	int wid = android_atomic_inc(&param->wid);
				1118	int sb_row;
				1119	int mi_row, mi_col;
				1120	int lfm_idx;
				1121	uint8_t *buf_start[MAX_MB_PLANE];
				1122	uint8_t *buf[MAX_MB_PLANE];
				1123	BufferInfo *buf_info = &param->buf_info;
				1124
				1125	while (!android_atomic_release_load(&param->quit)) {
				1126	pthread_mutex_lock(&param->mutex[wid]);
				1127	pthread_cond_wait(&param->start_cond[wid], &param->mutex[wid]);
				1128	pthread_mutex_unlock(&param->mutex[wid]);
				1129
				1130	if (android_atomic_release_load(&param->quit)) return NULL;
				1131
				1132	buf_start[0] = param->buffer_alloc + buf_info->y_offset;
				1133	buf_start[1] = param->buffer_alloc + buf_info->u_offset;
				1134	buf_start[2] = param->buffer_alloc + buf_info->v_offset;
				1135	sb_row = android_atomic_inc(&param->sb_row_pro);
				1136	mi_row = (sb_row * MI_BLOCK_SIZE) + param->start;
				1137
				1138	while (mi_row < param->stop) {
				1139	buf[0] = buf_start[0] + (mi_row * buf_info->y_stride << 3);
				1140	buf[1] = buf_start[1] + (mi_row * buf_info->uv_stride << 2);
				1141	buf[2] = buf_start[2] + (mi_row * buf_info->uv_stride << 2);
				1142	lfm_idx = sb_row * ((param->mi_cols + 7) >> 3);
				1143	for (mi_col = 0; mi_col < param->mi_cols; mi_col += MI_BLOCK_SIZE) {
				1144
				1145	while (param->chart[sb_row+1] + 2 > android_atomic_release_load(&param->chart[sb_row])) {
				1146	usleep(1);
				1147	}
				1148
				1149	filter_block_plane_y(param->lf_info, param->lfms + lfm_idx,
				1150	buf_info->y_stride, buf[0], param->mi_rows,
				1151	mi_row);
				1152	mi_col += MI_BLOCK_SIZE;
				1153	if (mi_col < param->mi_cols) {
				1154	lfm_idx++;
				1155	buf[0] += MI_BLOCK_SIZE * MI_BLOCK_SIZE;
				1156	filter_block_plane_y(param->lf_info, param->lfms + lfm_idx,
				1157	buf_info->y_stride, buf[0],
				1158	param->mi_rows, mi_row);
				1159	}
				1160	buf[0] += MI_BLOCK_SIZE * MI_BLOCK_SIZE;
				1161	if (param->num_planes > 1) {
				1162	lfm_idx--;
				1163	filter_block_plane_uv(param->lf_info, param->lfms + lfm_idx,
				1164	buf_info->uv_stride, buf[1],
				1165	param->mi_rows, mi_row);
				1166	filter_block_plane_uv(param->lf_info, param->lfms + lfm_idx,
				1167	buf_info->uv_stride, buf[2],
				1168	param->mi_rows, mi_row);
				1169	if (mi_col < param->mi_cols) {
				1170	lfm_idx++;
				1171	buf[1] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
				1172	buf[2] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
				1173	filter_block_plane_uv(param->lf_info,
				1174	param->lfms + lfm_idx,
				1175	buf_info->uv_stride, buf[1],
				1176	param->mi_rows, mi_row);
				1177	filter_block_plane_uv(param->lf_info,
				1178	param->lfms + lfm_idx,
				1179	buf_info->uv_stride, buf[2],
				1180	param->mi_rows, mi_row);
				1181	}
				1182	buf[1] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
				1183	buf[2] += MI_BLOCK_SIZE * MI_BLOCK_SIZE >> 1;
				1184	}
				1185	lfm_idx++;
				1186	android_atomic_inc(&param->chart[sb_row+1]);
				1187	}
				1188	android_atomic_inc(&param->chart[sb_row+1]);
				1189	sb_row = android_atomic_inc(&param->sb_row_pro);
				1190	mi_row = (sb_row << 3) + param->start;
				1191	}
				1192
				1193	pthread_mutex_lock(param->hmutex);
				1194	if ((--param->doing) == 0)
				1195	pthread_cond_signal(param->finish);
				1196	pthread_mutex_unlock(param->hmutex);
				1197	}
				1198
				1199	return NULL;
				1200	}
				1201
				1202	RsdCpuScriptIntrinsicLoopFilter::RsdCpuScriptIntrinsicLoopFilter(
				1203	RsdCpuReferenceImpl ctx, const Script s, const Element *e)
				1204	: RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
				1205	mRootPtr = &kernel;
				1206	mWorkerCount = sysconf(_SC_NPROCESSORS_ONLN);
				1207	mPrch.quit = 0;
				1208	mPrch.wid = 0;
				1209	mPrch.sb_row_pro = 0;
				1210	mPrch.doing = mWorkerCount;
				1211	int size = mWorkerCount * sizeof(pthread_t) +
				1212	mWorkerCount * sizeof(pthread_mutex_t) +
				1213	mWorkerCount * sizeof(pthread_cond_t) +
				1214	sizeof(pthread_mutex_t) + sizeof(pthread_cond_t);
				1215	uint8_t ptr = (uint8_t )malloc(size);
				1216	rsAssert(ptr);
				1217	mPrch.tid = (pthread_t *)ptr;
				1218	mPrch.mutex = (pthread_mutex_t *) (mPrch.tid + mWorkerCount);
				1219	mPrch.start_cond = (pthread_cond_t *) (mPrch.mutex + mWorkerCount);
				1220	mPrch.hmutex = (pthread_mutex_t *) (mPrch.start_cond + mWorkerCount);
				1221	mPrch.finish = (pthread_cond_t *) (mPrch.hmutex + 1);
				1222	int i = 0;
				1223	int rv = 0;
				1224	pthread_mutex_init(mPrch.hmutex, NULL);
				1225	pthread_cond_init(mPrch.finish, NULL);
				1226	for (i = 0; i < mWorkerCount; ++i) {
				1227	pthread_mutex_init(&mPrch.mutex[i], NULL);
				1228	pthread_cond_init(&mPrch.start_cond[i], NULL);
				1229	}
				1230	for (i = 0; i < mWorkerCount; ++i) {
				1231	rv = pthread_create(&mPrch.tid[i], NULL, &vp9_loop_filter_rows_work_proc, &mPrch);
				1232	rsAssert(rv == 0);
				1233	}
				1234	}