/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

namespace libyuv {

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)

void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile(
    // compute where to start writing the destination
    "add %1, %2\n"

    // work on segments that are multiples of 16
    "lsrs r3, %2, #4\n"

    // the output is written in two blocks: 8 bytes followed
    // by another 8. reading is done sequentially, from left to
    // right. writing is done from right to left in 8-byte blocks.
    // %1, the destination pointer, is incremented after writing
    // the first of the two blocks, so subtract that 8 off
    // along with 16 to get the next location.
    "mov r3, #-24\n"

    "beq 2f\n"

    // back off the destination by the size of the register that
    // is going to be reversed
    "sub %1, #16\n"

    // the loop needs to run on blocks of 16. what will be left
    // over is either a negative number, the residuals that need
    // to be done, or 0. if this isn't subtracted off here the
    // loop will run one extra time.
    "sub %2, #16\n"

    "1:\n"
    "vld1.8 {q0}, [%0]!\n"  // src += 16

    // reverse the bytes in the 64 bit segments. unable to reverse
    // the bytes in the entire 128 bits in one go.
    "vrev64.8 q0, q0\n"

    // because the entire 128 bits cannot be reversed in one go,
    // reverse the order in which the two 64 bit segments are written.
    "vst1.8 {d1}, [%1]!\n"
    "vst1.8 {d0}, [%1], r3\n"  // dst -= 16

    "subs %2, #16\n"
    "bge 1b\n"

    // add 16 back to the counter. if the result is 0 there are no
    // residuals, so jump past them.
    "adds %2, #16\n"
    "beq 5f\n"

    "add %1, #16\n"

    "2:\n"

    "mov r3, #-3\n"

    "sub %1, #2\n"
    "subs %2, #2\n"
    // check for 16*n+1 scenarios where segments_of_2 should not
    // be run, but there is something left over.
    "blt 4f\n"

    // do this in neon registers as per
    // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
    "3:\n"
    "vld2.8 {d0[0], d1[0]}, [%0]!\n"  // src += 2

    "vst1.8 {d1[0]}, [%1]!\n"
    "vst1.8 {d0[0]}, [%1], r3\n"  // dst -= 2

    "subs %2, #2\n"
    "bge 3b\n"

    "adds %2, #2\n"
    "beq 5f\n"

    "4:\n"
    "add %1, #1\n"
    "vld1.8 {d0[0]}, [%0]\n"
    "vst1.8 {d0[0]}, [%1]\n"

    "5:\n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    :
    : "memory", "cc", "r3", "q0"
  );
}

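// A scalar sketch of what ReverseLine_NEON computes, for reference only; the
// name below is illustrative and not part of libyuv's API. Each output byte
// dst[width - 1 - i] is the input byte src[i], i.e. the line is mirrored.
static void ReverseLine_C_Sketch(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[width - 1 - i] = src[i];
  }
}
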
static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
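// When used with vtbl, the table above treats the 16 bytes of a d-register
// pair as a row-major 4x4 block (four 32-bit rows) and gathers its columns,
// producing a 4x4 byte transpose.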

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile(
    // loops are on blocks of 8. the loop will stop when the
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this.
    "sub %4, #8\n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:\n"
    "mov r9, %0\n"

    "vld1.8 {d0}, [r9], %1\n"
    "vld1.8 {d1}, [r9], %1\n"
    "vld1.8 {d2}, [r9], %1\n"
    "vld1.8 {d3}, [r9], %1\n"
    "vld1.8 {d4}, [r9], %1\n"
    "vld1.8 {d5}, [r9], %1\n"
    "vld1.8 {d6}, [r9], %1\n"
    "vld1.8 {d7}, [r9]\n"

    "vtrn.8 d1, d0\n"
    "vtrn.8 d3, d2\n"
    "vtrn.8 d5, d4\n"
    "vtrn.8 d7, d6\n"

    "vtrn.16 d1, d3\n"
    "vtrn.16 d0, d2\n"
    "vtrn.16 d5, d7\n"
    "vtrn.16 d4, d6\n"

    "vtrn.32 d1, d5\n"
    "vtrn.32 d0, d4\n"
    "vtrn.32 d3, d7\n"
    "vtrn.32 d2, d6\n"

    "vrev16.8 q0, q0\n"
    "vrev16.8 q1, q1\n"
    "vrev16.8 q2, q2\n"
    "vrev16.8 q3, q3\n"

    "mov r9, %2\n"

    "vst1.8 {d1}, [r9], %3\n"
    "vst1.8 {d0}, [r9], %3\n"
    "vst1.8 {d3}, [r9], %3\n"
    "vst1.8 {d2}, [r9], %3\n"
    "vst1.8 {d5}, [r9], %3\n"
    "vst1.8 {d4}, [r9], %3\n"
    "vst1.8 {d7}, [r9], %3\n"
    "vst1.8 {d6}, [r9]\n"

    "add %0, #8\n"          // src += 8
    "add %2, %3, lsl #3\n"  // dst += 8 * dst_stride
    "subs %4, #8\n"         // w -= 8
    "bge 1b\n"

    // add 8 back to the counter. if the result is 0 there are
    // no residuals.
    "adds %4, #8\n"
    "beq 4f\n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %4, #2\n"
    "blt 3f\n"

    "cmp %4, #4\n"
    "blt 2f\n"

    // 4x8 block
    "mov r9, %0\n"
    "vld1.32 {d0[0]}, [r9], %1\n"
    "vld1.32 {d0[1]}, [r9], %1\n"
    "vld1.32 {d1[0]}, [r9], %1\n"
    "vld1.32 {d1[1]}, [r9], %1\n"
    "vld1.32 {d2[0]}, [r9], %1\n"
    "vld1.32 {d2[1]}, [r9], %1\n"
    "vld1.32 {d3[0]}, [r9], %1\n"
    "vld1.32 {d3[1]}, [r9]\n"

    "mov r9, %2\n"

    "vld1.8 {q3}, [%5]\n"

    "vtbl.8 d4, {d0, d1}, d6\n"
    "vtbl.8 d5, {d0, d1}, d7\n"
    "vtbl.8 d0, {d2, d3}, d6\n"
    "vtbl.8 d1, {d2, d3}, d7\n"

    // TODO: rework the shuffle above to write
    // out with 4 instead of 8 writes.
    "vst1.32 {d4[0]}, [r9], %3\n"
    "vst1.32 {d4[1]}, [r9], %3\n"
    "vst1.32 {d5[0]}, [r9], %3\n"
    "vst1.32 {d5[1]}, [r9]\n"

    "add r9, %2, #4\n"
    "vst1.32 {d0[0]}, [r9], %3\n"
    "vst1.32 {d0[1]}, [r9], %3\n"
    "vst1.32 {d1[0]}, [r9], %3\n"
    "vst1.32 {d1[1]}, [r9]\n"

    "add %0, #4\n"          // src += 4
    "add %2, %3, lsl #2\n"  // dst += 4 * dst_stride
    "subs %4, #4\n"         // w -= 4
    "beq 4f\n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %4, #2\n"
    "blt 3f\n"

    // 2x8 block
    "2:\n"
    "mov r9, %0\n"
    "vld1.16 {d0[0]}, [r9], %1\n"
    "vld1.16 {d1[0]}, [r9], %1\n"
    "vld1.16 {d0[1]}, [r9], %1\n"
    "vld1.16 {d1[1]}, [r9], %1\n"
    "vld1.16 {d0[2]}, [r9], %1\n"
    "vld1.16 {d1[2]}, [r9], %1\n"
    "vld1.16 {d0[3]}, [r9], %1\n"
    "vld1.16 {d1[3]}, [r9]\n"

    "vtrn.8 d0, d1\n"

    "mov r9, %2\n"

    "vst1.64 {d0}, [r9], %3\n"
    "vst1.64 {d1}, [r9]\n"

    "add %0, #2\n"          // src += 2
    "add %2, %3, lsl #1\n"  // dst += 2 * dst_stride
    "subs %4, #2\n"         // w -= 2
    "beq 4f\n"

    // 1x8 block
    "3:\n"
    "vld1.8 {d0[0]}, [%0], %1\n"
    "vld1.8 {d0[1]}, [%0], %1\n"
    "vld1.8 {d0[2]}, [%0], %1\n"
    "vld1.8 {d0[3]}, [%0], %1\n"
    "vld1.8 {d0[4]}, [%0], %1\n"
    "vld1.8 {d0[5]}, [%0], %1\n"
    "vld1.8 {d0[6]}, [%0], %1\n"
    "vld1.8 {d0[7]}, [%0]\n"

    "vst1.64 {d0}, [%2]\n"

    "4:\n"

    : "+r"(src),         // %0
      "+r"(src_stride),  // %1
      "+r"(dst),         // %2
      "+r"(dst_stride),  // %3
      "+r"(width)        // %4
    : "r"(vtbl_4x4_transpose)  // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}

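// A scalar sketch of the transpose performed above, for reference only; the
// name is illustrative and not part of libyuv's API. The kernel reads an
// 8-row by width-column block and writes its transpose, so that
// dst[x * dst_stride + y] == src[y * src_stride + x].
static void TransposeWx8_C_Sketch(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
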
void ReverseLineUV_NEON(const uint8* src,
                        uint8* dst_a, uint8* dst_b,
                        int width) {
  asm volatile(
    // compute where to start writing the destinations
    "add %1, %3\n"  // dst_a + width
    "add %2, %3\n"  // dst_b + width

    // work on input segments that are multiples of 16, but the
    // width that has been passed counts output segments, which
    // are half the size of the input.
    "lsrs r12, %3, #3\n"

    "beq 2f\n"

    // the output is written into two blocks.
    "mov r12, #-8\n"

    // back off the destinations by the size of the register that
    // is going to be reversed
    "sub %1, #8\n"
    "sub %2, #8\n"

    // the loop needs to run on blocks of 8. what will be left
    // over is either a negative number, the residuals that need
    // to be done, or 0. if this isn't subtracted off here the
    // loop will run one extra time.
    "sub %3, #8\n"

    "1:\n"
    "vld2.8 {d0, d1}, [%0]!\n"  // src += 16

    // reverse the bytes in the 64 bit segments
    "vrev64.8 q0, q0\n"

    "vst1.8 {d0}, [%1], r12\n"  // dst_a -= 8
    "vst1.8 {d1}, [%2], r12\n"  // dst_b -= 8

    "subs %3, #8\n"
    "bge 1b\n"

    // add 8 back to the counter. if the result is 0 there are no
    // residuals, so return.
    "adds %3, #8\n"
    "beq 4f\n"

    "add %1, #8\n"
    "add %2, #8\n"

    "2:\n"

    "mov r12, #-1\n"

    "sub %1, #1\n"
    "sub %2, #1\n"

    "3:\n"
    "vld2.8 {d0[0], d1[0]}, [%0]!\n"  // src += 2

    "vst1.8 {d0[0]}, [%1], r12\n"  // dst_a -= 1
    "vst1.8 {d1[0]}, [%2], r12\n"  // dst_b -= 1

    "subs %3, %3, #1\n"
    "bgt 3b\n"
    "4:\n"
    : "+r"(src),    // %0
      "+r"(dst_a),  // %1
      "+r"(dst_b),  // %2
      "+r"(width)   // %3
    :
    : "memory", "cc", "r12", "q0"
  );
}

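// A scalar sketch of what ReverseLineUV_NEON computes, for reference only;
// the name is illustrative and not part of libyuv's API. src holds width
// interleaved UV pairs; the U and V bytes are split into dst_a and dst_b and
// each output line is written mirrored.
static void ReverseLineUV_C_Sketch(const uint8* src,
                                   uint8* dst_a, uint8* dst_b, int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[width - 1 - i] = src[2 * i + 0];
    dst_b[width - 1 - i] = src[2 * i + 1];
  }
}
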
static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
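// When used with vtbl, the table above zips a d-register pair byte by byte:
// its first half (indices 0, 8, 1, 9, ...) interleaves the low halves of the
// pair and its second half interleaves the high halves.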

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile(
    // loops are on blocks of 8. the loop will stop when the
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this.
    "sub %6, #8\n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:\n"
    "mov r9, %0\n"

    "vld2.8 {d0, d1}, [r9], %1\n"
    "vld2.8 {d2, d3}, [r9], %1\n"
    "vld2.8 {d4, d5}, [r9], %1\n"
    "vld2.8 {d6, d7}, [r9], %1\n"
    "vld2.8 {d16, d17}, [r9], %1\n"
    "vld2.8 {d18, d19}, [r9], %1\n"
    "vld2.8 {d20, d21}, [r9], %1\n"
    "vld2.8 {d22, d23}, [r9]\n"

    "vtrn.8 q1, q0\n"
    "vtrn.8 q3, q2\n"
    "vtrn.8 q9, q8\n"
    "vtrn.8 q11, q10\n"

    "vtrn.16 q1, q3\n"
    "vtrn.16 q0, q2\n"
    "vtrn.16 q9, q11\n"
    "vtrn.16 q8, q10\n"

    "vtrn.32 q1, q9\n"
    "vtrn.32 q0, q8\n"
    "vtrn.32 q3, q11\n"
    "vtrn.32 q2, q10\n"

    "vrev16.8 q0, q0\n"
    "vrev16.8 q1, q1\n"
    "vrev16.8 q2, q2\n"
    "vrev16.8 q3, q3\n"
    "vrev16.8 q8, q8\n"
    "vrev16.8 q9, q9\n"
    "vrev16.8 q10, q10\n"
    "vrev16.8 q11, q11\n"

    "mov r9, %2\n"

    "vst1.8 {d2}, [r9], %3\n"
    "vst1.8 {d0}, [r9], %3\n"
    "vst1.8 {d6}, [r9], %3\n"
    "vst1.8 {d4}, [r9], %3\n"
    "vst1.8 {d18}, [r9], %3\n"
    "vst1.8 {d16}, [r9], %3\n"
    "vst1.8 {d22}, [r9], %3\n"
    "vst1.8 {d20}, [r9]\n"

    "mov r9, %4\n"

    "vst1.8 {d3}, [r9], %5\n"
    "vst1.8 {d1}, [r9], %5\n"
    "vst1.8 {d7}, [r9], %5\n"
    "vst1.8 {d5}, [r9], %5\n"
    "vst1.8 {d19}, [r9], %5\n"
    "vst1.8 {d17}, [r9], %5\n"
    "vst1.8 {d23}, [r9], %5\n"
    "vst1.8 {d21}, [r9]\n"

    "add %0, #8*2\n"        // src += 8*2
    "add %2, %3, lsl #3\n"  // dst_a += 8 * dst_stride_a
    "add %4, %5, lsl #3\n"  // dst_b += 8 * dst_stride_b
    "subs %6, #8\n"         // w -= 8
    "bge 1b\n"

    // add 8 back to the counter. if the result is 0 there are
    // no residuals.
    "adds %6, #8\n"
    "beq 4f\n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %6, #2\n"
    "blt 3f\n"

    "cmp %6, #4\n"
    "blt 2f\n"

    // TODO(frkoenig): clean this up
    // 4x8 block
    "mov r9, %0\n"
    "vld1.64 {d0}, [r9], %1\n"
    "vld1.64 {d1}, [r9], %1\n"
    "vld1.64 {d2}, [r9], %1\n"
    "vld1.64 {d3}, [r9], %1\n"
    "vld1.64 {d4}, [r9], %1\n"
    "vld1.64 {d5}, [r9], %1\n"
    "vld1.64 {d6}, [r9], %1\n"
    "vld1.64 {d7}, [r9]\n"

    "vld1.8 {q15}, [%7]\n"

    "vtrn.8 q0, q1\n"
    "vtrn.8 q2, q3\n"

    "vtbl.8 d16, {d0, d1}, d30\n"
    "vtbl.8 d17, {d0, d1}, d31\n"
    "vtbl.8 d18, {d2, d3}, d30\n"
    "vtbl.8 d19, {d2, d3}, d31\n"
    "vtbl.8 d20, {d4, d5}, d30\n"
    "vtbl.8 d21, {d4, d5}, d31\n"
    "vtbl.8 d22, {d6, d7}, d30\n"
    "vtbl.8 d23, {d6, d7}, d31\n"

    "mov r9, %2\n"

    "vst1.32 {d16[0]}, [r9], %3\n"
    "vst1.32 {d16[1]}, [r9], %3\n"
    "vst1.32 {d17[0]}, [r9], %3\n"
    "vst1.32 {d17[1]}, [r9], %3\n"

    "add r9, %2, #4\n"
    "vst1.32 {d20[0]}, [r9], %3\n"
    "vst1.32 {d20[1]}, [r9], %3\n"
    "vst1.32 {d21[0]}, [r9], %3\n"
    "vst1.32 {d21[1]}, [r9]\n"

    "mov r9, %4\n"

    "vst1.32 {d18[0]}, [r9], %5\n"
    "vst1.32 {d18[1]}, [r9], %5\n"
    "vst1.32 {d19[0]}, [r9], %5\n"
    "vst1.32 {d19[1]}, [r9], %5\n"

    "add r9, %4, #4\n"
    "vst1.32 {d22[0]}, [r9], %5\n"
    "vst1.32 {d22[1]}, [r9], %5\n"
    "vst1.32 {d23[0]}, [r9], %5\n"
    "vst1.32 {d23[1]}, [r9]\n"

    "add %0, #4*2\n"        // src += 4 * 2
    "add %2, %3, lsl #2\n"  // dst_a += 4 * dst_stride_a
    "add %4, %5, lsl #2\n"  // dst_b += 4 * dst_stride_b
    "subs %6, #4\n"         // w -= 4
    "beq 4f\n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %6, #2\n"
    "blt 3f\n"

    // 2x8 block
    "2:\n"
    "mov r9, %0\n"
    "vld2.16 {d0[0], d2[0]}, [r9], %1\n"
    "vld2.16 {d1[0], d3[0]}, [r9], %1\n"
    "vld2.16 {d0[1], d2[1]}, [r9], %1\n"
    "vld2.16 {d1[1], d3[1]}, [r9], %1\n"
    "vld2.16 {d0[2], d2[2]}, [r9], %1\n"
    "vld2.16 {d1[2], d3[2]}, [r9], %1\n"
    "vld2.16 {d0[3], d2[3]}, [r9], %1\n"
    "vld2.16 {d1[3], d3[3]}, [r9]\n"

    "vtrn.8 d0, d1\n"
    "vtrn.8 d2, d3\n"

    "mov r9, %2\n"

    "vst1.64 {d0}, [r9], %3\n"
    "vst1.64 {d2}, [r9]\n"

    "mov r9, %4\n"

    "vst1.64 {d1}, [r9], %5\n"
    "vst1.64 {d3}, [r9]\n"

    "add %0, #2*2\n"        // src += 2 * 2
    "add %2, %3, lsl #1\n"  // dst_a += 2 * dst_stride_a
    "add %4, %5, lsl #1\n"  // dst_b += 2 * dst_stride_b
    "subs %6, #2\n"         // w -= 2
    "beq 4f\n"

    // 1x8 block
    "3:\n"
    "vld2.8 {d0[0], d1[0]}, [%0], %1\n"
    "vld2.8 {d0[1], d1[1]}, [%0], %1\n"
    "vld2.8 {d0[2], d1[2]}, [%0], %1\n"
    "vld2.8 {d0[3], d1[3]}, [%0], %1\n"
    "vld2.8 {d0[4], d1[4]}, [%0], %1\n"
    "vld2.8 {d0[5], d1[5]}, [%0], %1\n"
    "vld2.8 {d0[6], d1[6]}, [%0], %1\n"
    "vld2.8 {d0[7], d1[7]}, [%0]\n"

    "vst1.64 {d0}, [%2]\n"
    "vst1.64 {d1}, [%4]\n"

    "4:\n"

    : "+r"(src),            // %0
      "+r"(src_stride),     // %1
      "+r"(dst_a),          // %2
      "+r"(dst_stride_a),   // %3
      "+r"(dst_b),          // %4
      "+r"(dst_stride_b),   // %5
      "+r"(width)           // %6
    : "r"(vtbl_4x4_transpose_di)  // %7
    : "memory", "cc", "r9",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"
  );
}
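
// A scalar sketch of the UV transpose above, for reference only; the name is
// illustrative and not part of libyuv's API. src holds 8 rows of width
// interleaved UV pairs; the U bytes are transposed into dst_a and the V bytes
// into dst_b.
static void TransposeUVWx8_C_Sketch(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x + 0];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];
    }
  }
}
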
#endif  // defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
}  // namespace libyuv