Blame - src/opts/SkBlitRect_opts_SSE2.cpp - platform/external/skia

blob: d65a313dadf5b997c64172e272dff70ac2f16ba0 [file] [log] [blame]

tomhudson@google.com	8dd90a9	2012-03-19 13:49:50 +0000	[diff] [blame]	1	/*
				2	* Copyright 2011 Google Inc.
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
				6	*/
				7
commit-bot@chromium.org	8c4953c	2014-04-30 14:58:46 +0000	[diff] [blame]	8	#include <emmintrin.h>
tomhudson@google.com	8dd90a9	2012-03-19 13:49:50 +0000	[diff] [blame]	9	#include "SkBlitRect_opts_SSE2.h"
				10	#include "SkBlitRow.h"
				11	#include "SkColorPriv.h"
				12
commit-bot@chromium.org	8c4953c	2014-04-30 14:58:46 +0000	[diff] [blame]	13	/* Simple blitting of opaque rectangles less than 31 pixels wide:
				14	* inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
				15	*/
caryclark@google.com	83ecdc3	2012-06-06 12:10:26 +0000	[diff] [blame]	16	static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
tomhudson@google.com	8dd90a9	2012-03-19 13:49:50 +0000	[diff] [blame]	17	int width, int height,
				18	size_t rowBytes, uint32_t color) {
				19	SkASSERT(255 == SkGetPackedA32(color));
				20	SkASSERT(width > 0);
				21	SkASSERT(width < 31);
				22
				23	while (--height >= 0) {
				24	SkPMColor* dst = destination;
				25	int count = width;
				26
				27	while (count > 4) {
				28	*dst++ = color;
				29	*dst++ = color;
				30	*dst++ = color;
				31	*dst++ = color;
				32	count -= 4;
				33	}
				34
				35	while (count > 0) {
				36	*dst++ = color;
				37	--count;
				38	}
				39
				40	destination = (uint32_t)((char)destination + rowBytes);
				41	}
				42	}
				43
commit-bot@chromium.org	8c4953c	2014-04-30 14:58:46 +0000	[diff] [blame]	44	/*
				45	* Fast blitting of opaque rectangles at least 31 pixels wide:
				46	* inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
				47	* A 31 pixel rectangle is guaranteed to have at least one
				48	* 16-pixel aligned span that can take advantage of mm_store.
				49	*/
caryclark@google.com	83ecdc3	2012-06-06 12:10:26 +0000	[diff] [blame]	50	static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
tomhudson@google.com	8dd90a9	2012-03-19 13:49:50 +0000	[diff] [blame]	51	int width, int height,
				52	size_t rowBytes, uint32_t color) {
				53	SkASSERT(255 == SkGetPackedA32(color));
				54	SkASSERT(width >= 31);
				55
				56	__m128i color_wide = _mm_set1_epi32(color);
				57	while (--height >= 0) {
				58	// Prefetching one row ahead to L1 cache can equal hardware
				59	// performance for large/tall rects, but never beats
				60	// hardware performance.
				61	SkPMColor* dst = destination;
				62	int count = width;
				63
				64	while (((size_t)dst) & 0x0F) {
				65	*dst++ = color;
				66	--count;
				67	}
				68	__m128i d = reinterpret_cast<__m128i>(dst);
				69
				70	// Googling suggests _mm_stream is only going to beat _mm_store
				71	// for things that wouldn't fit in L2 cache anyway, typically
				72	// >500kB, and precisely fill cache lines. For us, with
				73	// arrays > 100k elements _mm_stream is still 100%+ slower than
				74	// mm_store.
				75
				76	// Unrolling to count >= 64 is a break-even for most
				77	// input patterns; we seem to be saturating the bus and having
				78	// low enough overhead at 32.
				79
				80	while (count >= 32) {
				81	_mm_store_si128(d++, color_wide);
				82	_mm_store_si128(d++, color_wide);
				83	_mm_store_si128(d++, color_wide);
				84	_mm_store_si128(d++, color_wide);
				85	_mm_store_si128(d++, color_wide);
				86	_mm_store_si128(d++, color_wide);
				87	_mm_store_si128(d++, color_wide);
				88	_mm_store_si128(d++, color_wide);
				89	count -= 32;
				90	}
				91	if (count >= 16) {
				92	_mm_store_si128(d++, color_wide);
				93	_mm_store_si128(d++, color_wide);
				94	_mm_store_si128(d++, color_wide);
				95	_mm_store_si128(d++, color_wide);
				96	count -= 16;
				97	}
				98	dst = reinterpret_cast<uint32_t*>(d);
				99
				100	// Unrolling the loop in the Narrow code is a significant performance
				101	// gain, but unrolling this loop appears to make no difference in
				102	// benchmarks with either mm_store_si128 or individual sets.
				103
				104	while (count > 0) {
				105	*dst++ = color;
				106	--count;
				107	}
				108
				109	destination = (uint32_t)((char)destination + rowBytes);
				110	}
				111	}
				112
				113	void ColorRect32_SSE2(SkPMColor* destination,
				114	int width, int height,
				115	size_t rowBytes, uint32_t color) {
				116	if (0 == height \|\| 0 == width \|\| 0 == color) {
				117	return;
				118	}
				119	unsigned colorA = SkGetPackedA32(color);
bsalomon@google.com	b58a639	2013-03-21 20:29:05 +0000	[diff] [blame]	120	colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
				121	if (255 == colorA) {
caryclark@google.com	83ecdc3	2012-06-06 12:10:26 +0000	[diff] [blame]	122	if (width < 31) {
				123	BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
				124	rowBytes, color);
				125	} else {
				126	BlitRect32_OpaqueWide_SSE2(destination, width, height,
				127	rowBytes, color);
				128	}
				129	} else {
tomhudson@google.com	8dd90a9	2012-03-19 13:49:50 +0000	[diff] [blame]	130	SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
caryclark@google.com	83ecdc3	2012-06-06 12:10:26 +0000	[diff] [blame]	131	}
tomhudson@google.com	8dd90a9	2012-03-19 13:49:50 +0000	[diff] [blame]	132	}