TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
* use stfiwx in float->int

* Fold add and sub with constant into non-extern, non-weak addresses so this:
        lis r2, ha16(l2__ZTV4Cell)
        la r2, lo16(l2__ZTV4Cell)(r2)
        addi r2, r2, 8
becomes:
        lis r2, ha16(l2__ZTV4Cell+8)
        la r2, lo16(l2__ZTV4Cell+8)(r2)

* Teach LLVM how to codegen this:
unsigned short foo(float a) { return a; }
as:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

* Support 'update' load/store instructions. These are cracked on the G5, but
  are still a codesize win.
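
  For example (a sketch), a pointer-bump load like:

        lwz r3, 4(r2)
        addi r2, r2, 4

  could be a single update-form load that writes the incremented address back:

        lwzu r3, 4(r2)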

* should hint to the branch select pass that it doesn't need to print the
  second unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

===-------------------------------------------------------------------------===

* Codegen this:

   void test2(int X) {
     if (X == 0x12345678) bar();
   }

   as:

        xoris r0,r3,0x1234
        cmpwi cr0,r0,0x5678
        beq cr0,L6

   not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start. For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's. This is even more important in PIC
mode.
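
Something like this (a sketch, assuming the entries are laid out contiguously
starting at .CPI_X_0):

_X:
        lis r2, ha16(.CPI_X_0)
        la r2, lo16(.CPI_X_0)(r2)
        lfd f0, 0(r2)
        lfd f2, 8(r2)
        fmadd f0, f1, f0, f2
        lfd f1, 16(r2)
        lfd f2, 24(r2)
        fmadd f1, f0, f1, f2
        blr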

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for refining estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when the
reciprocal would have more than one use. Itanium will want this too.
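
A sketch of the refinement for reciprocal, where fres_estimate stands in for
the hardware reciprocal-estimate instruction (it is not a real intrinsic):

double recip(double d) {
    double x = fres_estimate(d);   /* hardware estimate, only a few good bits */
    x = x * (2.0 - d * x);         /* each Newton-Raphson step roughly        */
    x = x * (2.0 - d * x);         /* doubles the number of correct bits,     */
    x = x * (2.0 - d * x);         /* so a handful of steps reaches full      */
    x = x * (2.0 - d * x);         /* precision                               */
    return x;                      /* then a / d => a * recip(d)              */
}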

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
    struct {
#ifndef __ppc__
        unsigned int field0 : 6;
        unsigned int field1 : 6;
        unsigned int field2 : 6;
        unsigned int field3 : 6;
        unsigned int field4 : 3;
        unsigned int field5 : 4;
        unsigned int field6 : 1;
#else
        unsigned int field6 : 1;
        unsigned int field5 : 4;
        unsigned int field4 : 3;
        unsigned int field3 : 6;
        unsigned int field2 : 6;
        unsigned int field1 : 6;
        unsigned int field0 : 6;
#endif
    } bitfields, bits;
    unsigned int u32All;
    signed int i32All;
    float f32All;
};


typedef struct program_t {
    union bitfield array[ARRAY_LENGTH];
    int size;
    int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
    unsigned int shift = 0;
    unsigned int texCount = 0;
    unsigned int i;

    for (i = 0; i < 8; i++)
    {
        prog->array[i].bitfields.field0 = texCount;
        prog->array[i].bitfields.field1 = texCount + 1;
        prog->array[i].bitfields.field2 = texCount + 2;
        prog->array[i].bitfields.field3 = texCount + 3;

        texCount += (fmt1 >> shift) & 0x7;
        shift += 3;
    }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the shift
doesn't get folded into the rlwimi instruction. We should ideally see through
things like this, rather than forcing llvm to generate the equivalent

(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy. We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.
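
The copy-free version just commutes the sequence so the result is built
directly in r3 (a sketch):

_f1:
        rlwinm r3, r3, 0, 28, 31
        rlwimi r3, r4, 0, 24, 27
        blr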

===-------------------------------------------------------------------------===

176.gcc contains a bunch of code like this (this occurs dozens of times):

int %test(uint %mode.0.i.0) {
        %tmp.79 = cast uint %mode.0.i.0 to sbyte        ; <sbyte> [#uses=1]
        %tmp.80 = cast sbyte %tmp.79 to int             ; <int> [#uses=1]
        %tmp.81 = shl int %tmp.80, ubyte 16             ; <int> [#uses=1]
        %tmp.82 = and int %tmp.81, 16711680
        ret int %tmp.82
}

which we compile to:

_test:
        extsb r2, r3
        rlwinm r3, r2, 16, 8, 15
        blr

The extsb is obviously dead. This can be handled by a future thing like
MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
the sign bits are never used, so we can fold the sext_inreg to nothing).
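
With the demanded-bits information this should reduce to a single
rotate-and-mask (a sketch):

_test:
        rlwinm r3, r3, 16, 8, 15
        blr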

I'm seeing code like this:

        srwi r3, r3, 16
        extsb r3, r3
        rlwimi r4, r3, 16, 8, 15

in which the extsb is preventing the srwi from being nuked.

===-------------------------------------------------------------------------===

Another example that occurs is:

uint %test(int %specbits.6.1) {
        %tmp.2540 = shr int %specbits.6.1, ubyte 11     ; <int> [#uses=1]
        %tmp.2541 = cast int %tmp.2540 to uint          ; <uint> [#uses=1]
        %tmp.2542 = shl uint %tmp.2541, ubyte 13        ; <uint> [#uses=1]
        %tmp.2543 = and uint %tmp.2542, 8192            ; <uint> [#uses=1]
        ret uint %tmp.2543
}

which we codegen as:

l1_test:
        srawi r2, r3, 11
        rlwinm r3, r2, 13, 18, 18
        blr

the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
dead), which I think can then be folded into the rlwinm.
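
i.e. a single rotate-and-mask (a sketch):

l1_test:
        rlwinm r3, r3, 2, 18, 18
        blr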

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr
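
i.e. generate something like this instead (a sketch, folding the two offsets):

_test:
        addi r3, r1, -4
        blr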

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch free code. LLVM is turning it into < 1 because of the RHS.
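
e.g. something like (a sketch, using a sign mask):

_test3:
        srawi r2, r3, 31
        and r3, r3, r2
        blr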

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce. It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub). This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4 ; loopexit

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.
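
For instance (illustrative):

struct DD { double d; };
void callee(struct DD x);   /* on Darwin, x.d arrives in r3/r4, not in f1 */

so the callee has to store r3/r4 to memory and reload them as a double before
it can do any FP arithmetic on x.d.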

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).
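
For example (illustrative), instead of three separate globals:

int a, b, c;

emit the equivalent of:

static struct { int a, b, c; } MergedGlobals;

so a single address computation (or a single GOT load in PIC mode) can be
CSE'd, with the members accessed at small constant offsets from it.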

===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.
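
For example (a sketch), an explicit byte-swapping load like:

unsigned load_le32(unsigned *p) {
    unsigned x = *p;
    return (x >> 24) | ((x >> 8) & 0xFF00) |
           ((x << 8) & 0xFF0000) | (x << 24);
}

should become a single byte-reversed load:

_load_le32:
        lwbrx r3, 0, r3
        blr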

===-------------------------------------------------------------------------===

Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.
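
For example (illustrative), the v4i32 splat { 1, 1, 1, 1 } is computable with a
single splat-immediate:

        vspltisw v2, 1

and other constants fall out of a splat plus a shift or add or two.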

===-------------------------------------------------------------------------===

Compile this:

double %test(double %X) {
        %Y = cast double %X to long
        %Z = cast long %Y to double
        ret double %Z
}

to this:

_test:
        fctidz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        lwz r3, -8(r1)
        stw r2, -12(r1)
        stw r3, -16(r1)
        lfd f0, -16(r1)
        fcfid f1, f0
        blr

without the lwz/stw's.
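
Ideally (a sketch; fcfid can consume the fctidz result directly from the FPR,
so even the stack round trip disappears):

_test:
        fctidz f0, f1
        fcfid f1, f0
        blr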
393
Chris Lattner83e64ba2006-01-31 07:16:34 +0000394===-------------------------------------------------------------------------===
395
396Compile this:
397
398int foo(int a) {
399 int b = (a < 8);
400 if (b) {
401 return b * 3; // ignore the fact that this is always 3.
402 } else {
403 return 2;
404 }
405}
406
407into something not this:
408
409_foo:
4101) cmpwi cr7, r3, 8
411 mfcr r2, 1
412 rlwinm r2, r2, 29, 31, 31
4131) cmpwi cr0, r3, 7
414 bgt cr0, LBB1_2 ; UnifiedReturnBlock
415LBB1_1: ; then
416 rlwinm r2, r2, 0, 31, 31
417 mulli r3, r2, 3
418 blr
419LBB1_2: ; UnifiedReturnBlock
420 li r3, 2
421 blr
422
423In particular, the two compares (marked 1) could be shared by reversing one.
424This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
425same operands (but backwards) exists. In this case, this wouldn't save us
426anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
        %tmp = setlt ulong %x, 4294967296
        ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it. That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops. A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr
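
Once the crbit access is exposed, the rlwinm/slwi pair should fold into a
single rotate off the mfcr result (a sketch):

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r3, r2, 1, 27, 27
        blr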
472