blob: d65a313dadf5b997c64172e272dff70ac2f16ba0 [file] [log] [blame]
/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +00008#include <emmintrin.h>
tomhudson@google.com8dd90a92012-03-19 13:49:50 +00009#include "SkBlitRect_opts_SSE2.h"
10#include "SkBlitRow.h"
11#include "SkColorPriv.h"
12
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +000013/* Simple blitting of opaque rectangles less than 31 pixels wide:
14 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
15 */
caryclark@google.com83ecdc32012-06-06 12:10:26 +000016static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
tomhudson@google.com8dd90a92012-03-19 13:49:50 +000017 int width, int height,
18 size_t rowBytes, uint32_t color) {
19 SkASSERT(255 == SkGetPackedA32(color));
20 SkASSERT(width > 0);
21 SkASSERT(width < 31);
22
23 while (--height >= 0) {
24 SkPMColor* dst = destination;
25 int count = width;
26
27 while (count > 4) {
28 *dst++ = color;
29 *dst++ = color;
30 *dst++ = color;
31 *dst++ = color;
32 count -= 4;
33 }
34
35 while (count > 0) {
36 *dst++ = color;
37 --count;
38 }
39
40 destination = (uint32_t*)((char*)destination + rowBytes);
41 }
42}
43
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +000044/*
45 * Fast blitting of opaque rectangles at least 31 pixels wide:
46 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
47 * A 31 pixel rectangle is guaranteed to have at least one
48 * 16-pixel aligned span that can take advantage of mm_store.
49 */
caryclark@google.com83ecdc32012-06-06 12:10:26 +000050static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
tomhudson@google.com8dd90a92012-03-19 13:49:50 +000051 int width, int height,
52 size_t rowBytes, uint32_t color) {
53 SkASSERT(255 == SkGetPackedA32(color));
54 SkASSERT(width >= 31);
55
56 __m128i color_wide = _mm_set1_epi32(color);
57 while (--height >= 0) {
58 // Prefetching one row ahead to L1 cache can equal hardware
59 // performance for large/tall rects, but never *beats*
60 // hardware performance.
61 SkPMColor* dst = destination;
62 int count = width;
63
64 while (((size_t)dst) & 0x0F) {
65 *dst++ = color;
66 --count;
67 }
68 __m128i *d = reinterpret_cast<__m128i*>(dst);
69
70 // Googling suggests _mm_stream is only going to beat _mm_store
71 // for things that wouldn't fit in L2 cache anyway, typically
72 // >500kB, and precisely fill cache lines. For us, with
73 // arrays > 100k elements _mm_stream is still 100%+ slower than
74 // mm_store.
75
76 // Unrolling to count >= 64 is a break-even for most
77 // input patterns; we seem to be saturating the bus and having
78 // low enough overhead at 32.
79
80 while (count >= 32) {
81 _mm_store_si128(d++, color_wide);
82 _mm_store_si128(d++, color_wide);
83 _mm_store_si128(d++, color_wide);
84 _mm_store_si128(d++, color_wide);
85 _mm_store_si128(d++, color_wide);
86 _mm_store_si128(d++, color_wide);
87 _mm_store_si128(d++, color_wide);
88 _mm_store_si128(d++, color_wide);
89 count -= 32;
90 }
91 if (count >= 16) {
92 _mm_store_si128(d++, color_wide);
93 _mm_store_si128(d++, color_wide);
94 _mm_store_si128(d++, color_wide);
95 _mm_store_si128(d++, color_wide);
96 count -= 16;
97 }
98 dst = reinterpret_cast<uint32_t*>(d);
99
100 // Unrolling the loop in the Narrow code is a significant performance
101 // gain, but unrolling this loop appears to make no difference in
102 // benchmarks with either mm_store_si128 or individual sets.
103
104 while (count > 0) {
105 *dst++ = color;
106 --count;
107 }
108
109 destination = (uint32_t*)((char*)destination + rowBytes);
110 }
111}
112
113void ColorRect32_SSE2(SkPMColor* destination,
114 int width, int height,
115 size_t rowBytes, uint32_t color) {
116 if (0 == height || 0 == width || 0 == color) {
117 return;
118 }
119 unsigned colorA = SkGetPackedA32(color);
bsalomon@google.comb58a6392013-03-21 20:29:05 +0000120 colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
121 if (255 == colorA) {
caryclark@google.com83ecdc32012-06-06 12:10:26 +0000122 if (width < 31) {
123 BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
124 rowBytes, color);
125 } else {
126 BlitRect32_OpaqueWide_SSE2(destination, width, height,
127 rowBytes, color);
128 }
129 } else {
tomhudson@google.com8dd90a92012-03-19 13:49:50 +0000130 SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
caryclark@google.com83ecdc32012-06-06 12:10:26 +0000131 }
tomhudson@google.com8dd90a92012-03-19 13:49:50 +0000132}