blob: 9a7837d11f7d8fd7a250a79b05e8c9fc379d9098 [file] [log] [blame]
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */
14
Chris Metcalf867e3592010-05-28 23:09:12 -040015#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
Chris Metcalfc53c70a2013-08-01 15:52:17 -040018#include <arch/chip.h>
Chris Metcalf867e3592010-05-28 23:09:12 -040019
20void *memset(void *s, int c, size_t n)
21{
22 uint32_t *out32;
23 int n32;
24 uint32_t v16, v32;
25 uint8_t *out8 = s;
26#if !CHIP_HAS_WH64()
27 int ahead32;
28#else
29 int to_align32;
30#endif
31
32 /* Experimentation shows that a trivial tight loop is a win up until
33 * around a size of 20, where writing a word at a time starts to win.
34 */
35#define BYTE_CUTOFF 20
36
37#if BYTE_CUTOFF < 3
38 /* This must be at least at least this big, or some code later
39 * on doesn't work.
40 */
41#error "BYTE_CUTOFF is too small"
42#endif
43
44 if (n < BYTE_CUTOFF) {
45 /* Strangely, this turns out to be the tightest way to
46 * write this loop.
47 */
48 if (n != 0) {
49 do {
50 /* Strangely, combining these into one line
51 * performs worse.
52 */
53 *out8 = c;
54 out8++;
55 } while (--n != 0);
56 }
57
58 return s;
59 }
60
61#if !CHIP_HAS_WH64()
62 /* Use a spare issue slot to start prefetching the first cache
63 * line early. This instruction is free as the store can be buried
64 * in otherwise idle issue slots doing ALU ops.
65 */
66 __insn_prefetch(out8);
67
68 /* We prefetch the end so that a short memset that spans two cache
69 * lines gets some prefetching benefit. Again we believe this is free
70 * to issue.
71 */
72 __insn_prefetch(&out8[n - 1]);
73#endif /* !CHIP_HAS_WH64() */
74
75
76 /* Align 'out8'. We know n >= 3 so this won't write past the end. */
77 while (((uintptr_t) out8 & 3) != 0) {
78 *out8++ = c;
79 --n;
80 }
81
82 /* Align 'n'. */
83 while (n & 3)
84 out8[--n] = c;
85
86 out32 = (uint32_t *) out8;
87 n32 = n >> 2;
88
89 /* Tile input byte out to 32 bits. */
90 v16 = __insn_intlb(c, c);
91 v32 = __insn_intlh(v16, v16);
92
93 /* This must be at least 8 or the following loop doesn't work. */
94#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
95
96#if !CHIP_HAS_WH64()
97
98 ahead32 = CACHE_LINE_SIZE_IN_WORDS;
99
100 /* We already prefetched the first and last cache lines, so
101 * we only need to do more prefetching if we are storing
102 * to more than two cache lines.
103 */
104 if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
105 int i;
106
107 /* Prefetch the next several cache lines.
108 * This is the setup code for the software-pipelined
109 * loop below.
110 */
111#define MAX_PREFETCH 5
112 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
113 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
114 ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
115
116 for (i = CACHE_LINE_SIZE_IN_WORDS;
117 i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
118 __insn_prefetch(&out32[i]);
119 }
120
121 if (n32 > ahead32) {
122 while (1) {
123 int j;
124
125 /* Prefetch by reading one word several cache lines
126 * ahead. Since loads are non-blocking this will
127 * cause the full cache line to be read while we are
128 * finishing earlier cache lines. Using a store
129 * here causes microarchitectural performance
130 * problems where a victimizing store miss goes to
131 * the head of the retry FIFO and locks the pipe for
132 * a few cycles. So a few subsequent stores in this
133 * loop go into the retry FIFO, and then later
134 * stores see other stores to the same cache line
135 * are already in the retry FIFO and themselves go
136 * into the retry FIFO, filling it up and grinding
137 * to a halt waiting for the original miss to be
138 * satisfied.
139 */
140 __insn_prefetch(&out32[ahead32]);
141
Chris Metcalf867e3592010-05-28 23:09:12 -0400142#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
143#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
144#endif
145
146 n32 -= CACHE_LINE_SIZE_IN_WORDS;
147
148 /* Save icache space by only partially unrolling
149 * this loop.
150 */
151 for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
152 *out32++ = v32;
153 *out32++ = v32;
154 *out32++ = v32;
155 *out32++ = v32;
156 }
Chris Metcalf867e3592010-05-28 23:09:12 -0400157
158 /* To save compiled code size, reuse this loop even
159 * when we run out of prefetching to do by dropping
160 * ahead32 down.
161 */
162 if (n32 <= ahead32) {
163 /* Not even a full cache line left,
164 * so stop now.
165 */
166 if (n32 < CACHE_LINE_SIZE_IN_WORDS)
167 break;
168
169 /* Choose a small enough value that we don't
170 * prefetch past the end. There's no sense
171 * in touching cache lines we don't have to.
172 */
173 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
174 }
175 }
176 }
177
178#else /* CHIP_HAS_WH64() */
179
180 /* Determine how many words we need to emit before the 'out32'
181 * pointer becomes aligned modulo the cache line size.
182 */
183 to_align32 =
184 (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
185
186 /* Only bother aligning and using wh64 if there is at least
187 * one full cache line to process. This check also prevents
188 * overrunning the end of the buffer with alignment words.
189 */
190 if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
191 int lines_left;
192
193 /* Align out32 mod the cache line size so we can use wh64. */
194 n32 -= to_align32;
195 for (; to_align32 != 0; to_align32--) {
196 *out32 = v32;
197 out32++;
198 }
199
200 /* Use unsigned divide to turn this into a right shift. */
201 lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
202
203 do {
204 /* Only wh64 a few lines at a time, so we don't
205 * exceed the maximum number of victim lines.
206 */
207 int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
208 ? lines_left
209 : CHIP_MAX_OUTSTANDING_VICTIMS());
210 uint32_t *wh = out32;
211 int i = x;
212 int j;
213
214 lines_left -= x;
215
216 do {
217 __insn_wh64(wh);
218 wh += CACHE_LINE_SIZE_IN_WORDS;
219 } while (--i);
220
Chris Metcalf0707ad32010-06-25 17:04:17 -0400221 for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4);
222 j != 0; j--) {
Chris Metcalf867e3592010-05-28 23:09:12 -0400223 *out32++ = v32;
224 *out32++ = v32;
225 *out32++ = v32;
226 *out32++ = v32;
227 }
228 } while (lines_left != 0);
229
230 /* We processed all full lines above, so only this many
231 * words remain to be processed.
232 */
233 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
234 }
235
236#endif /* CHIP_HAS_WH64() */
237
238 /* Now handle any leftover values. */
239 if (n32 != 0) {
240 do {
241 *out32 = v32;
242 out32++;
243 } while (--n32 != 0);
244 }
245
246 return s;
247}
248EXPORT_SYMBOL(memset);