// blob: 9336951f184d2200c54ed543997ae1229d489867
/*
* Copyright 2011 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkBlitRect_opts_SSE2.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include <emmintrin.h>
/** Opaque rectangle fill for rectangles narrower than 31 pixels.
    Inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
    Every pixel of each of the `height` rows is overwritten with `color`
    using plain scalar stores; consecutive rows start `rowBytes` bytes apart.
    Callers must pass a color whose packed alpha is 255 and a width in
    the range (0, 31) -- both are asserted.
*/
void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
                                  int width, int height,
                                  size_t rowBytes, uint32_t color) {
    SkASSERT(255 == SkGetPackedA32(color));
    SkASSERT(width > 0);
    SkASSERT(width < 31);

    for (int row = 0; row < height; ++row) {
        SkPMColor* pixel = destination;
        int remaining = width;

        // Four pixels per iteration while more than four are left; a
        // remainder of exactly four is handled by the scalar tail below.
        for (; remaining > 4; remaining -= 4) {
            pixel[0] = color;
            pixel[1] = color;
            pixel[2] = color;
            pixel[3] = color;
            pixel += 4;
        }

        // Scalar tail for the last one to four pixels of the row.
        for (; remaining > 0; --remaining) {
            *pixel++ = color;
        }

        // Step to the next row; the stride is expressed in bytes.
        destination = (uint32_t*)((char*)destination + rowBytes);
    }
}
/**
    Opaque rectangle fill for rectangles at least 31 pixels wide.
    Inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
    Each row is written in three phases: scalar stores up to the first
    16-byte boundary, aligned 128-bit stores (four pixels each) for the
    bulk of the row, and scalar stores for whatever is left.
    A 31 pixel rectangle is guaranteed to contain at least one aligned
    16-pixel span that can take advantage of mm_store.
    Callers must pass a color whose packed alpha is 255 and a width of
    at least 31 -- both are asserted.
*/
void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
                                int width, int height,
                                size_t rowBytes, uint32_t color) {
    SkASSERT(255 == SkGetPackedA32(color));
    SkASSERT(width >= 31);

    // The color replicated into all four 32-bit lanes of an SSE2 register.
    const __m128i fill = _mm_set1_epi32(color);

    for (int row = 0; row < height; ++row) {
        // Prefetching one row ahead to L1 cache can equal hardware
        // performance for large/tall rects, but never *beats*
        // hardware performance, so no explicit prefetch is issued.
        SkPMColor* pixel = destination;
        int remaining = width;

        // Phase 1: scalar stores until the write pointer is 16-byte aligned.
        // NOTE(review): relies on destination being 4-byte aligned so that
        // the boundary is reached within a few pixels -- confirm with callers.
        for (; 0 != (((size_t)pixel) & 0x0F); --remaining) {
            *pixel++ = color;
        }

        // Phase 2: aligned vector stores.
        // Googling suggests _mm_stream is only going to beat _mm_store
        // for things that wouldn't fit in L2 cache anyway (typically
        // >500kB) and that precisely fill cache lines. For us, with
        // arrays > 100k elements, _mm_stream is still 100%+ slower than
        // mm_store.
        // Unrolling to 64 pixels per iteration is a break-even for most
        // input patterns; we seem to be saturating the bus and having
        // low enough overhead at 32.
        __m128i* vec = reinterpret_cast<__m128i*>(pixel);
        for (; remaining >= 32; remaining -= 32, vec += 8) {
            _mm_store_si128(vec + 0, fill);
            _mm_store_si128(vec + 1, fill);
            _mm_store_si128(vec + 2, fill);
            _mm_store_si128(vec + 3, fill);
            _mm_store_si128(vec + 4, fill);
            _mm_store_si128(vec + 5, fill);
            _mm_store_si128(vec + 6, fill);
            _mm_store_si128(vec + 7, fill);
        }
        // At most one further 16-pixel group can remain after the loop above.
        if (remaining >= 16) {
            _mm_store_si128(vec + 0, fill);
            _mm_store_si128(vec + 1, fill);
            _mm_store_si128(vec + 2, fill);
            _mm_store_si128(vec + 3, fill);
            vec += 4;
            remaining -= 16;
        }

        // Phase 3: scalar tail (fewer than 16 pixels).
        // Unrolling the loop in the Narrow code is a significant performance
        // gain, but unrolling this loop appears to make no difference in
        // benchmarks with either mm_store_si128 or individual sets.
        pixel = reinterpret_cast<uint32_t*>(vec);
        for (; remaining > 0; --remaining) {
            *pixel++ = color;
        }

        // Step to the next row; the stride is expressed in bytes.
        destination = (uint32_t*)((char*)destination + rowBytes);
    }
}
/**
    Entry point for rectangle color fills on SSE2 builds.
    Returns without touching memory when width or height is zero or when
    color is 0 (presumably a fully transparent premultiplied color, for
    which there is nothing to draw). Every other request is forwarded
    unchanged to SkBlitRow::ColorRect32.

    @param destination  first pixel of the top row of the rectangle
    @param width        rectangle width in pixels
    @param height       rectangle height in rows
    @param rowBytes     distance in bytes between the starts of consecutive rows
    @param color        32-bit packed color to apply
*/
void ColorRect32_SSE2(SkPMColor* destination,
                      int width, int height,
                      size_t rowBytes, uint32_t color) {
    if (0 == height || 0 == width || 0 == color) {
        return;
    }

    // The opaque SSE2 fast paths are currently disabled; the dispatch is kept
    // here for reference. The alpha of the color is deliberately not extracted
    // into a local while nothing reads it, to avoid an unused variable.
    //
    //     unsigned colorA = SkGetPackedA32(color);
    //     if (255 == colorA) {
    //         if (width < 31) {
    //             BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
    //                                          rowBytes, color);
    //         } else {
    //             BlitRect32_OpaqueWide_SSE2(destination, width, height,
    //                                        rowBytes, color);
    //         }
    //     } else {
    //         SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
    //     }
    SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
}