Blame - src/dec/dsp.c - fp2-dev/platform/external/webp

blob: efde49d96027deee781a05d43f4dacef556b5d1b [file] [log] [blame]

Eric Hassold	9aea642	2011-01-04 17:22:46 -0800	[diff] [blame]	1	// Copyright 2010 Google Inc.
				2	//
				3	// This code is licensed under the same terms as WebM:
				4	// Software License Agreement: http://www.webmproject.org/license/software/
				5	// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
				6	// -----------------------------------------------------------------------------
				7	//
				8	// speed-critical functions.
				9	//
				10	// Author: Skal (pascal.massimino@gmail.com)
				11
				12	#include "vp8i.h"
				13
				14	#if defined(__SSE2__)
				15	#include <emmintrin.h>
				16	#endif
				17
				18	#if defined(__cplusplus) || defined(c_plusplus)
				19	extern "C" {
				20	#endif
				21
				22	//-----------------------------------------------------------------------------
				23	// run-time tables (~4k)
				24
				25	static uint8_t abs0[255 + 255 + 1]; // abs(i)
				26	static uint8_t abs1[255 + 255 + 1]; // abs(i)>>1
				27	static int8_t sclip1[1020 + 1020 + 1]; // clips [-1020, 1020] to [-128, 127]
				28	static int8_t sclip2[112 + 112 + 1]; // clips [-112, 112] to [-16, 15]
				29	static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
				30
Vikas Arora	03d5e34	2011-06-02 23:59:44 +0530	[diff] [blame^]	31	// We declare this variable 'volatile' to prevent instruction reordering
				32	// and make sure it's set to true _last_ (so as to be thread-safe)
				33	static volatile int tables_ok = 0;
Eric Hassold	9aea642	2011-01-04 17:22:46 -0800	[diff] [blame]	34
Vikas Arora	03d5e34	2011-06-02 23:59:44 +0530	[diff] [blame^]	35	void VP8DspInitTables(void) {
Eric Hassold	9aea642	2011-01-04 17:22:46 -0800	[diff] [blame]	36	if (!tables_ok) {
				37	int i;
				38	for (i = -255; i <= 255; ++i) {
				39	abs0[255 + i] = (i < 0) ? -i : i;
				40	abs1[255 + i] = abs0[255 + i] >> 1;
				41	}
				42	for (i = -1020; i <= 1020; ++i) {
				43	sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
				44	}
				45	for (i = -112; i <= 112; ++i) {
				46	sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
				47	}
				48	for (i = -255; i <= 255 + 255; ++i) {
				49	clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
				50	}
				51	tables_ok = 1;
				52	}
				53	}
				54
				55	static inline uint8_t clip_8b(int v) {
				56	return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
				57	}
				58
				59	//-----------------------------------------------------------------------------
				60	// Transforms (Paragraph 14.4)
				61
				62	#define STORE(x, y, v) \
				63	dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
				64
				65	static const int kC1 = 20091 + (1 << 16);
				66	static const int kC2 = 35468;
				67	#define MUL(a, b) (((a) * (b)) >> 16)
				68
				69	static void Transform(const int16_t* in, uint8_t* dst) {
				70	int C[4 * 4], *tmp;
				71	int i;
				72	tmp = C;
				73	for (i = 0; i < 4; ++i) { // vertical pass
				74	const int a = in[0] + in[8]; // [-4096, 4094]
				75	const int b = in[0] - in[8]; // [-4095, 4095]
				76	const int c = MUL(in[4], kC2) - MUL(in[12], kC1); // [-3783, 3783]
				77	const int d = MUL(in[4], kC1) + MUL(in[12], kC2); // [-3785, 3781]
				78	tmp[0] = a + d; // [-7881, 7875]
				79	tmp[1] = b + c; // [-7878, 7878]
				80	tmp[2] = b - c; // [-7878, 7878]
				81	tmp[3] = a - d; // [-7877, 7879]
				82	tmp += 4;
				83	in++;
				84	}
				85	// Each pass is expanding the dynamic range by ~3.85 (upper bound).
				86	// The exact value is (2. + (kC1 + kC2) / 65536).
				87	// After the second pass, maximum interval is [-3794, 3794], assuming
				88	// an input in [-2048, 2047] interval. We then need to add a dst value
				89	// in the [0, 255] range.
				90	// In the worst case scenario, the input to clip_8b() can be as large as
				91	// [-60713, 60968].
				92	tmp = C;
				93	for (i = 0; i < 4; ++i) { // horizontal pass
				94	const int dc = tmp[0] + 4;
				95	const int a = dc + tmp[8];
				96	const int b = dc - tmp[8];
				97	const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
				98	const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
				99	STORE(0, 0, a + d);
				100	STORE(1, 0, b + c);
				101	STORE(2, 0, b - c);
				102	STORE(3, 0, a - d);
				103	tmp++;
				104	dst += BPS;
				105	}
				106	}
				107	#undef MUL
				108
				109	static void TransformUV(const int16_t* in, uint8_t* dst) {
				110	Transform(in + 0 * 16, dst);
				111	Transform(in + 1 * 16, dst + 4);
				112	Transform(in + 2 * 16, dst + 4 * BPS);
				113	Transform(in + 3 * 16, dst + 4 * BPS + 4);
				114	}
				115
				116	static void TransformDC(const int16_t in, uint8_t dst) {
				117	const int DC = in[0] + 4;
				118	int i, j;
				119	for (j = 0; j < 4; ++j) {
				120	for (i = 0; i < 4; ++i) {
				121	STORE(i, j, DC);
				122	}
				123	}
				124	}
				125
				126	static void TransformDCUV(const int16_t* in, uint8_t* dst) {
				127	if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
				128	if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
				129	if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
				130	if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
				131	}
				132
				133	#undef STORE
				134
				135	// default C implementations:
				136	VP8Idct VP8Transform = Transform;
				137	VP8Idct VP8TransformUV = TransformUV;
				138	VP8Idct VP8TransformDC = TransformDC;
				139	VP8Idct VP8TransformDCUV = TransformDCUV;
				140
				141	//-----------------------------------------------------------------------------
				142	// Paragraph 14.3
				143
				144	static void TransformWHT(const int16_t* in, int16_t* out) {
				145	int tmp[16];
				146	int i;
				147	for (i = 0; i < 4; ++i) {
				148	const int a0 = in[0 + i] + in[12 + i];
				149	const int a1 = in[4 + i] + in[ 8 + i];
				150	const int a2 = in[4 + i] - in[ 8 + i];
				151	const int a3 = in[0 + i] - in[12 + i];
				152	tmp[0 + i] = a0 + a1;
				153	tmp[8 + i] = a0 - a1;
				154	tmp[4 + i] = a3 + a2;
				155	tmp[12 + i] = a3 - a2;
				156	}
				157	for (i = 0; i < 4; ++i) {
				158	const int dc = tmp[0 + i * 4] + 3; // w/ rounder
				159	const int a0 = dc + tmp[3 + i * 4];
				160	const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
				161	const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
				162	const int a3 = dc - tmp[3 + i * 4];
				163	out[ 0] = (a0 + a1) >> 3;
				164	out[16] = (a3 + a2) >> 3;
				165	out[32] = (a0 - a1) >> 3;
				166	out[48] = (a3 - a2) >> 3;
				167	out += 64;
				168	}
				169	}
				170
				171	void (VP8TransformWHT)(const int16_t in, int16_t* out) = TransformWHT;
				172
				173	//-----------------------------------------------------------------------------
				174	// Intra predictions
				175
				176	#define OUT(x, y) dst[(x) + (y) * BPS]
				177
				178	static inline void TrueMotion(uint8_t *dst, int size) {
				179	const uint8_t* top = dst - BPS;
				180	const uint8_t* const clip0 = clip1 + 255 - top[-1];
				181	int y;
				182	for (y = 0; y < size; ++y) {
				183	const uint8_t* const clip = clip0 + dst[-1];
				184	int x;
				185	for (x = 0; x < size; ++x) {
				186	dst[x] = clip[top[x]];
				187	}
				188	dst += BPS;
				189	}
				190	}
				191	static void TM4(uint8_t *dst) { TrueMotion(dst, 4); }
				192	static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
				193	static void TM16(uint8_t *dst) { TrueMotion(dst, 16); }
				194
				195	//-----------------------------------------------------------------------------
				196	// 16x16
				197
				198	static void VE16(uint8_t *dst) { // vertical
				199	int j;
				200	for (j = 0; j < 16; ++j) {
				201	memcpy(dst + j * BPS, dst - BPS, 16);
				202	}
				203	}
				204
				205	static void HE16(uint8_t *dst) { // horizontal
				206	int j;
				207	for (j = 16; j > 0; --j) {
				208	memset(dst, dst[-1], 16);
				209	dst += BPS;
				210	}
				211	}
				212
				213	static inline void Put16(int v, uint8_t* dst) {
				214	int j;
				215	for (j = 0; j < 16; ++j) {
				216	memset(dst + j * BPS, v, 16);
				217	}
				218	}
				219
				220	static void DC16(uint8_t *dst) { // DC
				221	int DC = 16;
				222	int j;
				223	for (j = 0; j < 16; ++j) {
				224	DC += dst[-1 + j * BPS] + dst[j - BPS];
				225	}
				226	Put16(DC >> 5, dst);
				227	}
				228
				229	static void DC16NoTop(uint8_t *dst) { // DC with top samples not available
				230	int DC = 8;
				231	int j;
				232	for (j = 0; j < 16; ++j) {
				233	DC += dst[-1 + j * BPS];
				234	}
				235	Put16(DC >> 4, dst);
				236	}
				237
				238	static void DC16NoLeft(uint8_t *dst) { // DC with left samples not available
				239	int DC = 8;
				240	int i;
				241	for (i = 0; i < 16; ++i) {
				242	DC += dst[i - BPS];
				243	}
				244	Put16(DC >> 4, dst);
				245	}
				246
				247	static void DC16NoTopLeft(uint8_t *dst) { // DC with no top and left samples
				248	Put16(0x80, dst);
				249	}
				250
				251	//-----------------------------------------------------------------------------
				252	// 4x4
				253
				254	#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
				255	#define AVG2(a, b) (((a) + (b) + 1) >> 1)
				256
				257	static void VE4(uint8_t *dst) { // vertical
				258	const uint8_t* top = dst - BPS;
				259	const uint8_t vals[4] = {
				260	AVG3(top[-1], top[0], top[1]),
				261	AVG3(top[ 0], top[1], top[2]),
				262	AVG3(top[ 1], top[2], top[3]),
				263	AVG3(top[ 2], top[3], top[4])
				264	};
				265	int i;
				266	for (i = 0; i < 4; ++i) {
				267	memcpy(dst + i * BPS, vals, sizeof(vals));
				268	}
				269	}
				270
				271	static void HE4(uint8_t *dst) { // horizontal
				272	const int A = dst[-1 - BPS];
				273	const int B = dst[-1];
				274	const int C = dst[-1 + BPS];
				275	const int D = dst[-1 + 2 * BPS];
				276	const int E = dst[-1 + 3 * BPS];
				277	(uint32_t)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
				278	(uint32_t)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
				279	(uint32_t)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
				280	(uint32_t)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
				281	}
				282
				283	static void DC4(uint8_t *dst) { // DC
				284	uint32_t dc = 4;
				285	int i;
				286	for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
				287	dc >>= 3;
				288	for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
				289	}
				290
				291	static void RD4(uint8_t *dst) { // Down-right
				292	const int I = dst[-1 + 0 * BPS];
				293	const int J = dst[-1 + 1 * BPS];
				294	const int K = dst[-1 + 2 * BPS];
				295	const int L = dst[-1 + 3 * BPS];
				296	const int X = dst[-1 - BPS];
				297	const int A = dst[0 - BPS];
				298	const int B = dst[1 - BPS];
				299	const int C = dst[2 - BPS];
				300	const int D = dst[3 - BPS];
				301	OUT(0, 3) = AVG3(J, K, L);
				302	OUT(0, 2) = OUT(1, 3) = AVG3(I, J, K);
				303	OUT(0, 1) = OUT(1, 2) = OUT(2, 3) = AVG3(X, I, J);
				304	OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
				305	OUT(1, 0) = OUT(2, 1) = OUT(3, 2) = AVG3(B, A, X);
				306	OUT(2, 0) = OUT(3, 1) = AVG3(C, B, A);
				307	OUT(3, 0) = AVG3(D, C, B);
				308	}
				309
				310	static void LD4(uint8_t *dst) { // Down-Left
				311	const int A = dst[0 - BPS];
				312	const int B = dst[1 - BPS];
				313	const int C = dst[2 - BPS];
				314	const int D = dst[3 - BPS];
				315	const int E = dst[4 - BPS];
				316	const int F = dst[5 - BPS];
				317	const int G = dst[6 - BPS];
				318	const int H = dst[7 - BPS];
				319	OUT(0, 0) = AVG3(A, B, C);
				320	OUT(1, 0) = OUT(0, 1) = AVG3(B, C, D);
				321	OUT(2, 0) = OUT(1, 1) = OUT(0, 2) = AVG3(C, D, E);
				322	OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
				323	OUT(3, 1) = OUT(2, 2) = OUT(1, 3) = AVG3(E, F, G);
				324	OUT(3, 2) = OUT(2, 3) = AVG3(F, G, H);
				325	OUT(3, 3) = AVG3(G, H, H);
				326	}
				327
				328	static void VR4(uint8_t *dst) { // Vertical-Right
				329	const int I = dst[-1 + 0 * BPS];
				330	const int J = dst[-1 + 1 * BPS];
				331	const int K = dst[-1 + 2 * BPS];
				332	const int X = dst[-1 - BPS];
				333	const int A = dst[0 - BPS];
				334	const int B = dst[1 - BPS];
				335	const int C = dst[2 - BPS];
				336	const int D = dst[3 - BPS];
				337	OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
				338	OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
				339	OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
				340	OUT(3, 0) = AVG2(C, D);
				341
				342	OUT(0, 3) = AVG3(K, J, I);
				343	OUT(0, 2) = AVG3(J, I, X);
				344	OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
				345	OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
				346	OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
				347	OUT(3, 1) = AVG3(B, C, D);
				348	}
				349
				350	static void VL4(uint8_t *dst) { // Vertical-Left
				351	const int A = dst[0 - BPS];
				352	const int B = dst[1 - BPS];
				353	const int C = dst[2 - BPS];
				354	const int D = dst[3 - BPS];
				355	const int E = dst[4 - BPS];
				356	const int F = dst[5 - BPS];
				357	const int G = dst[6 - BPS];
				358	const int H = dst[7 - BPS];
				359	OUT(0, 0) = AVG2(A, B);
				360	OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
				361	OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
				362	OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
				363
				364	OUT(0, 1) = AVG3(A, B, C);
				365	OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
				366	OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
				367	OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
				368	OUT(3, 2) = AVG3(E, F, G);
				369	OUT(3, 3) = AVG3(F, G, H);
				370	}
				371
				372	static void HU4(uint8_t *dst) { // Horizontal-Up
				373	const int I = dst[-1 + 0 * BPS];
				374	const int J = dst[-1 + 1 * BPS];
				375	const int K = dst[-1 + 2 * BPS];
				376	const int L = dst[-1 + 3 * BPS];
				377	OUT(0, 0) = AVG2(I, J);
				378	OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
				379	OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
				380	OUT(1, 0) = AVG3(I, J, K);
				381	OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
				382	OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
				383	OUT(3, 2) = OUT(2, 2) =
				384	OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
				385	}
				386
				387	static void HD4(uint8_t *dst) { // Horizontal-Down
				388	const int I = dst[-1 + 0 * BPS];
				389	const int J = dst[-1 + 1 * BPS];
				390	const int K = dst[-1 + 2 * BPS];
				391	const int L = dst[-1 + 3 * BPS];
				392	const int X = dst[-1 - BPS];
				393	const int A = dst[0 - BPS];
				394	const int B = dst[1 - BPS];
				395	const int C = dst[2 - BPS];
				396
				397	OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
				398	OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
				399	OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
				400	OUT(0, 3) = AVG2(L, K);
				401
				402	OUT(3, 0) = AVG3(A, B, C);
				403	OUT(2, 0) = AVG3(X, A, B);
				404	OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
				405	OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
				406	OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
				407	OUT(1, 3) = AVG3(L, K, J);
				408	}
				409
				410	#undef AVG3
				411	#undef AVG2
				412
				413	//-----------------------------------------------------------------------------
				414	// Chroma
				415
				416	static void VE8uv(uint8_t *dst) { // vertical
				417	int j;
				418	for (j = 0; j < 8; ++j) {
				419	memcpy(dst + j * BPS, dst - BPS, 8);
				420	}
				421	}
				422
				423	static void HE8uv(uint8_t *dst) { // horizontal
				424	int j;
				425	for (j = 0; j < 8; ++j) {
				426	memset(dst, dst[-1], 8);
				427	dst += BPS;
				428	}
				429	}
				430
				431	// helper for chroma-DC predictions
				432	static inline void Put8x8uv(uint64_t v, uint8_t* dst) {
				433	int j;
				434	for (j = 0; j < 8; ++j) {
				435	(uint64_t)(dst + j * BPS) = v;
				436	}
				437	}
				438
				439	static void DC8uv(uint8_t *dst) { // DC
				440	int dc0 = 8;
				441	int i;
				442	for (i = 0; i < 8; ++i) {
				443	dc0 += dst[i - BPS] + dst[-1 + i * BPS];
				444	}
				445	Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
				446	}
				447
				448	static void DC8uvNoLeft(uint8_t *dst) { // DC with no left samples
				449	int dc0 = 4;
				450	int i;
				451	for (i = 0; i < 8; ++i) {
				452	dc0 += dst[i - BPS];
				453	}
				454	Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
				455	}
				456
				457	static void DC8uvNoTop(uint8_t *dst) { // DC with no top samples
				458	int dc0 = 4;
				459	int i;
				460	for (i = 0; i < 8; ++i) {
				461	dc0 += dst[-1 + i * BPS];
				462	}
				463	Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
				464	}
				465
				466	static void DC8uvNoTopLeft(uint8_t *dst) { // DC with nothing
				467	Put8x8uv(0x8080808080808080ULL, dst);
				468	}
				469
				470	//-----------------------------------------------------------------------------
				471	// default C implementations
				472
Vikas Arora	03d5e34	2011-06-02 23:59:44 +0530	[diff] [blame^]	473	VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
Eric Hassold	9aea642	2011-01-04 17:22:46 -0800	[diff] [blame]	474	DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
				475	};
				476
Vikas Arora	03d5e34	2011-06-02 23:59:44 +0530	[diff] [blame^]	477	VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
Eric Hassold	9aea642	2011-01-04 17:22:46 -0800	[diff] [blame]	478	DC16, TM16, VE16, HE16,
				479	DC16NoTop, DC16NoLeft, DC16NoTopLeft
				480	};
				481
Vikas Arora	03d5e34	2011-06-02 23:59:44 +0530	[diff] [blame^]	482	VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
Eric Hassold	9aea642	2011-01-04 17:22:46 -0800	[diff] [blame]	483	DC8uv, TM8uv, VE8uv, HE8uv,
				484	DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
				485	};
				486
				487	//-----------------------------------------------------------------------------
				488	// Edge filtering functions
				489
				490	// 4 pixels in, 2 pixels out
				491	static inline void do_filter2(uint8_t* p, int step) {
				492	const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
				493	const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
				494	const int a1 = sclip2[112 + ((a + 4) >> 3)];
				495	const int a2 = sclip2[112 + ((a + 3) >> 3)];
				496	p[-step] = clip1[255 + p0 + a2];
				497	p[ 0] = clip1[255 + q0 - a1];
				498	}
				499
				500	// 4 pixels in, 4 pixels out
				501	static inline void do_filter4(uint8_t* p, int step) {
				502	const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
				503	const int a = 3 * (q0 - p0);
				504	const int a1 = sclip2[112 + ((a + 4) >> 3)];
				505	const int a2 = sclip2[112 + ((a + 3) >> 3)];
				506	const int a3 = (a1 + 1) >> 1;
				507	p[-2*step] = clip1[255 + p1 + a3];
				508	p[- step] = clip1[255 + p0 + a2];
				509	p[ 0] = clip1[255 + q0 - a1];
				510	p[ step] = clip1[255 + q1 - a3];
				511	}
				512
				513	// 6 pixels in, 6 pixels out
				514	static inline void do_filter6(uint8_t* p, int step) {
				515	const int p2 = p[-3step], p1 = p[-2step], p0 = p[-step];
				516	const int q0 = p[0], q1 = p[step], q2 = p[2*step];
				517	const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
				518	const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
				519	const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
				520	const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
				521	p[-3*step] = clip1[255 + p2 + a3];
				522	p[-2*step] = clip1[255 + p1 + a2];
				523	p[- step] = clip1[255 + p0 + a1];
				524	p[ 0] = clip1[255 + q0 - a1];
				525	p[ step] = clip1[255 + q1 - a2];
				526	p[ 2*step] = clip1[255 + q2 - a3];
				527	}
				528
				529	static inline int hev(const uint8_t* p, int step, int thresh) {
				530	const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
				531	return (abs0[255 + p1 - p0] > thresh) \|\| (abs0[255 + q1 - q0] > thresh);
				532	}
				533
				534	static inline int needs_filter(const uint8_t* p, int step, int thresh) {
				535	const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
				536	return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
				537	}
				538
				539	static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
				540	const int p3 = p[-4step], p2 = p[-3step], p1 = p[-2*step], p0 = p[-step];
				541	const int q0 = p[0], q1 = p[step], q2 = p[2step], q3 = p[3step];
				542	if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
				543	return 0;
				544	return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
				545	abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
				546	abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
				547	}
				548
				549	//-----------------------------------------------------------------------------
				550	// Simple In-loop filtering (Paragraph 15.2)
				551
				552	static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
				553	int i;
				554	for (i = 0; i < 16; ++i) {
				555	if (needs_filter(p + i, stride, thresh)) {
				556	do_filter2(p + i, stride);
				557	}
				558	}
				559	}
				560
				561	static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
				562	int i;
				563	for (i = 0; i < 16; ++i) {
				564	if (needs_filter(p + i * stride, 1, thresh)) {
				565	do_filter2(p + i * stride, 1);
				566	}
				567	}
				568	}
				569
				570	static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
				571	int k;
				572	for (k = 3; k > 0; --k) {
				573	p += 4 * stride;
				574	SimpleVFilter16(p, stride, thresh);
				575	}
				576	}
				577
				578	static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
				579	int k;
				580	for (k = 3; k > 0; --k) {
				581	p += 4;
				582	SimpleHFilter16(p, stride, thresh);
				583	}
				584	}
				585
				586	//-----------------------------------------------------------------------------
				587	// Complex In-loop filtering (Paragraph 15.3)
				588
				589	static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
				590	int thresh, int ithresh, int hev_thresh) {
				591	while (size-- > 0) {
				592	if (needs_filter2(p, hstride, thresh, ithresh)) {
				593	if (hev(p, hstride, hev_thresh)) {
				594	do_filter2(p, hstride);
				595	} else {
				596	do_filter6(p, hstride);
				597	}
				598	}
				599	p += vstride;
				600	}
				601	}
				602
				603	static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
				604	int thresh, int ithresh, int hev_thresh) {
				605	while (size-- > 0) {
				606	if (needs_filter2(p, hstride, thresh, ithresh)) {
				607	if (hev(p, hstride, hev_thresh)) {
				608	do_filter2(p, hstride);
				609	} else {
				610	do_filter4(p, hstride);
				611	}
				612	}
				613	p += vstride;
				614	}
				615	}
				616
				617	// on macroblock edges
				618	static void VFilter16(uint8_t* p, int stride,
				619	int thresh, int ithresh, int hev_thresh) {
				620	FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
				621	}
				622
				623	static void HFilter16(uint8_t* p, int stride,
				624	int thresh, int ithresh, int hev_thresh) {
				625	FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
				626	}
				627
				628	// on three inner edges
				629	static void VFilter16i(uint8_t* p, int stride,
				630	int thresh, int ithresh, int hev_thresh) {
				631	int k;
				632	for (k = 3; k > 0; --k) {
				633	p += 4 * stride;
				634	FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
				635	}
				636	}
				637
				638	static void HFilter16i(uint8_t* p, int stride,
				639	int thresh, int ithresh, int hev_thresh) {
				640	int k;
				641	for (k = 3; k > 0; --k) {
				642	p += 4;
				643	FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
				644	}
				645	}
				646
				647	// 8-pixels wide variant, for chroma filtering
				648	static void VFilter8(uint8_t* u, uint8_t* v, int stride,
				649	int thresh, int ithresh, int hev_thresh) {
				650	FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
				651	FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
				652	}
				653
				654	static void HFilter8(uint8_t* u, uint8_t* v, int stride,
				655	int thresh, int ithresh, int hev_thresh) {
				656	FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
				657	FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
				658	}
				659
				660	static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
				661	int thresh, int ithresh, int hev_thresh) {
				662	FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
				663	FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
				664	}
				665
				666	static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
				667	int thresh, int ithresh, int hev_thresh) {
				668	FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
				669	FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
				670	}
				671
				672	//-----------------------------------------------------------------------------
				673
				674	void (VP8VFilter16)(uint8_t, int, int, int, int) = VFilter16;
				675	void (VP8HFilter16)(uint8_t, int, int, int, int) = HFilter16;
				676	void (VP8VFilter8)(uint8_t, uint8_t*, int, int, int, int) = VFilter8;
				677	void (VP8HFilter8)(uint8_t, uint8_t*, int, int, int, int) = HFilter8;
				678	void (VP8VFilter16i)(uint8_t, int, int, int, int) = VFilter16i;
				679	void (VP8HFilter16i)(uint8_t, int, int, int, int) = HFilter16i;
				680	void (VP8VFilter8i)(uint8_t, uint8_t*, int, int, int, int) = VFilter8i;
				681	void (VP8HFilter8i)(uint8_t, uint8_t*, int, int, int, int) = HFilter8i;
				682
				683	void (VP8SimpleVFilter16)(uint8_t, int, int) = SimpleVFilter16;
				684	void (VP8SimpleHFilter16)(uint8_t, int, int) = SimpleHFilter16;
				685	void (VP8SimpleVFilter16i)(uint8_t, int, int) = SimpleVFilter16i;
				686	void (VP8SimpleHFilter16i)(uint8_t, int, int) = SimpleHFilter16i;
				687
				688	//-----------------------------------------------------------------------------
				689
// DSP initialization hook. Currently a no-op: the exported function pointers
// are already bound to the C implementations at definition time.
void VP8DspInit(void) {
  // later we'll plug some SSE2 variant here
}
				693
				694	#if defined(__cplusplus) || defined(c_plusplus)
				695	} // extern "C"
				696	#endif