blob: fb52dd3b962d7d14249fbd7707086b8f109d9e0b [file] [log] [blame]
TODO:
Nate Begemanef9531e2005-04-11 20:48:57 +00002* gpr0 allocation
Nate Begeman4a0de072004-10-26 04:10:53 +00003* implement do-loop -> bdnz transform
Nate Begemanca068e82004-08-14 22:16:36 +00004* implement powerpc-64 for darwin
Nate Begeman50fb3c42005-12-24 01:00:15 +00005
Nate Begemana63fee82006-02-03 05:17:06 +00006===-------------------------------------------------------------------------===
Nate Begeman50fb3c42005-12-24 01:00:15 +00007
Nate Begemana63fee82006-02-03 05:17:06 +00008Use the stfiwx instruction for:
Chris Lattnerb65975a2005-07-26 19:07:51 +00009
Nate Begemana63fee82006-02-03 05:17:06 +000010void foo(float a, int *b) { *b = a; }
11
12===-------------------------------------------------------------------------===
13
Nate Begeman5a014812005-08-14 01:17:16 +000014unsigned short foo(float a) { return a; }
Nate Begemana63fee82006-02-03 05:17:06 +000015should be:
Nate Begeman5a014812005-08-14 01:17:16 +000016_foo:
17 fctiwz f0,f1
18 stfd f0,-8(r1)
19 lhz r3,-2(r1)
20 blr
21not:
22_foo:
23 fctiwz f0, f1
24 stfd f0, -8(r1)
25 lwz r2, -4(r1)
26 rlwinm r3, r2, 0, 16, 31
27 blr
28
Nate Begemana63fee82006-02-03 05:17:06 +000029===-------------------------------------------------------------------------===
Chris Lattner6281ae42005-08-05 19:18:32 +000030
Nate Begemana63fee82006-02-03 05:17:06 +000031Support 'update' load/store instructions. These are cracked on the G5, but are
32still a codesize win.
33
34===-------------------------------------------------------------------------===
35
36Should hint to the branch select pass that it doesn't need to print the second
37unconditional branch, so we don't end up with things like:
Misha Brukman4ce5ce22004-07-27 18:43:04 +000038 b .LBBl42__2E_expand_function_8_674 ; loopentry.24
39 b .LBBl42__2E_expand_function_8_42 ; NewDefault
40 b .LBBl42__2E_expand_function_8_42 ; NewDefault
Chris Lattner424dcbd2005-08-23 06:27:59 +000041
Chris Lattnera3c44542005-08-24 18:15:24 +000042===-------------------------------------------------------------------------===
43
Chris Lattner424dcbd2005-08-23 06:27:59 +000044* Codegen this:
45
46 void test2(int X) {
47 if (X == 0x12345678) bar();
48 }
49
50 as:
51
52 xoris r0,r3,0x1234
53 cmpwi cr0,r0,0x5678
54 beq cr0,L6
55
56 not:
57
58 lis r2, 4660
59 ori r2, r2, 22136
60 cmpw cr0, r3, r2
61 bne .LBB_test2_2
62
Chris Lattnera3c44542005-08-24 18:15:24 +000063===-------------------------------------------------------------------------===
64
65Lump the constant pool for each function into ONE pic object, and reference
66pieces of it as offsets from the start. For functions like this (contrived
67to have lots of constants obviously):
68
69double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
70
71We generate:
72
73_X:
74 lis r2, ha16(.CPI_X_0)
75 lfd f0, lo16(.CPI_X_0)(r2)
76 lis r2, ha16(.CPI_X_1)
77 lfd f2, lo16(.CPI_X_1)(r2)
78 fmadd f0, f1, f0, f2
79 lis r2, ha16(.CPI_X_2)
80 lfd f1, lo16(.CPI_X_2)(r2)
81 lis r2, ha16(.CPI_X_3)
82 lfd f2, lo16(.CPI_X_3)(r2)
83 fmadd f1, f0, f1, f2
84 blr
85
86It would be better to materialize .CPI_X into a register, then use immediates
87off of the register to avoid the lis's. This is even more important in PIC
88mode.
89
Chris Lattner39b248b2006-02-02 23:50:22 +000090Note that this (and the static variable version) is discussed here for GCC:
91http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
92
Chris Lattnera3c44542005-08-24 18:15:24 +000093===-------------------------------------------------------------------------===
Nate Begeman92cce902005-09-06 15:30:48 +000094
Implement Newton-Raphson method for improving estimate instructions to the
96correct accuracy, and implementing divide as multiply by reciprocal when it has
97more than one use. Itanium will want this too.
Nate Begeman21e463b2005-10-16 05:39:50 +000098
99===-------------------------------------------------------------------------===
100
Nate Begeman5cd61ce2005-10-25 23:50:02 +0000101#define ARRAY_LENGTH 16
102
103union bitfield {
104 struct {
105#ifndef __ppc__
106 unsigned int field0 : 6;
107 unsigned int field1 : 6;
108 unsigned int field2 : 6;
109 unsigned int field3 : 6;
110 unsigned int field4 : 3;
111 unsigned int field5 : 4;
112 unsigned int field6 : 1;
113#else
114 unsigned int field6 : 1;
115 unsigned int field5 : 4;
116 unsigned int field4 : 3;
117 unsigned int field3 : 6;
118 unsigned int field2 : 6;
119 unsigned int field1 : 6;
120 unsigned int field0 : 6;
121#endif
122 } bitfields, bits;
123 unsigned int u32All;
124 signed int i32All;
125 float f32All;
126};
127
128
129typedef struct program_t {
130 union bitfield array[ARRAY_LENGTH];
131 int size;
132 int loaded;
133} program;
134
135
136void AdjustBitfields(program* prog, unsigned int fmt1)
137{
138 unsigned int shift = 0;
139 unsigned int texCount = 0;
140 unsigned int i;
141
142 for (i = 0; i < 8; i++)
143 {
144 prog->array[i].bitfields.field0 = texCount;
145 prog->array[i].bitfields.field1 = texCount + 1;
146 prog->array[i].bitfields.field2 = texCount + 2;
147 prog->array[i].bitfields.field3 = texCount + 3;
148
149 texCount += (fmt1 >> shift) & 0x7;
150 shift += 3;
151 }
152}
153
154In the loop above, the bitfield adds get generated as
155(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
156
157Since the input to the (or and, and) is an (add) rather than a (shl), the shift
158doesn't get folded into the rlwimi instruction. We should ideally see through
159things like this, rather than forcing llvm to generate the equivalent
160
161(shl (add bitfield, C2), C1) with some kind of mask.
Chris Lattner01959102005-10-28 00:20:45 +0000162
163===-------------------------------------------------------------------------===
164
Chris Lattnerae4664a2005-11-05 08:57:56 +0000165Compile this:
166
167int %f1(int %a, int %b) {
168 %tmp.1 = and int %a, 15 ; <int> [#uses=1]
169 %tmp.3 = and int %b, 240 ; <int> [#uses=1]
170 %tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
171 ret int %tmp.4
172}
173
174without a copy. We make this currently:
175
176_f1:
177 rlwinm r2, r4, 0, 24, 27
178 rlwimi r2, r3, 0, 28, 31
179 or r3, r2, r2
180 blr
181
182The two-addr pass or RA needs to learn when it is profitable to commute an
183instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
184currently only commutes to avoid inserting a copy BEFORE the two addr instr.
185
Chris Lattner62c08dd2005-12-08 07:13:28 +0000186===-------------------------------------------------------------------------===
187
Nate Begemaneb20ed62006-01-28 01:22:10 +0000188176.gcc contains a bunch of code like this (this occurs dozens of times):
189
190int %test(uint %mode.0.i.0) {
191 %tmp.79 = cast uint %mode.0.i.0 to sbyte ; <sbyte> [#uses=1]
192 %tmp.80 = cast sbyte %tmp.79 to int ; <int> [#uses=1]
193 %tmp.81 = shl int %tmp.80, ubyte 16 ; <int> [#uses=1]
194 %tmp.82 = and int %tmp.81, 16711680
195 ret int %tmp.82
196}
197
198which we compile to:
199
200_test:
201 extsb r2, r3
202 rlwinm r3, r2, 16, 8, 15
203 blr
204
205The extsb is obviously dead. This can be handled by a future thing like
206MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
207the sign bits are never used, so we can fold the sext_inreg to nothing).
208
209I'm seeing code like this:
210
211 srwi r3, r3, 16
212 extsb r3, r3
213 rlwimi r4, r3, 16, 8, 15
214
215in which the extsb is preventing the srwi from being nuked.
216
217===-------------------------------------------------------------------------===
218
219Another example that occurs is:
220
221uint %test(int %specbits.6.1) {
222 %tmp.2540 = shr int %specbits.6.1, ubyte 11 ; <int> [#uses=1]
223 %tmp.2541 = cast int %tmp.2540 to uint ; <uint> [#uses=1]
224 %tmp.2542 = shl uint %tmp.2541, ubyte 13 ; <uint> [#uses=1]
225 %tmp.2543 = and uint %tmp.2542, 8192 ; <uint> [#uses=1]
226 ret uint %tmp.2543
227}
228
229which we codegen as:
230
231l1_test:
232 srawi r2, r3, 11
233 rlwinm r3, r2, 13, 18, 18
234 blr
235
236the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
237dead), which I think can then be folded into the rlwinm.
238
239===-------------------------------------------------------------------------===
240
Chris Lattner62c08dd2005-12-08 07:13:28 +0000241Compile offsets from allocas:
242
243int *%test() {
244 %X = alloca { int, int }
245 %Y = getelementptr {int,int}* %X, int 0, uint 1
246 ret int* %Y
247}
248
249into a single add, not two:
250
251_test:
252 addi r2, r1, -8
253 addi r3, r2, 4
254 blr
255
256--> important for C++.
257
Chris Lattner39706e62005-12-22 17:19:28 +0000258===-------------------------------------------------------------------------===
259
260int test3(int a, int b) { return (a < 0) ? a : 0; }
261
262should be branch free code. LLVM is turning it into < 1 because of the RHS.
263
264===-------------------------------------------------------------------------===
265
Chris Lattner39706e62005-12-22 17:19:28 +0000266No loads or stores of the constants should be needed:
267
268struct foo { double X, Y; };
269void xxx(struct foo F);
270void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
271
Chris Lattner1db4b4f2006-01-16 17:53:00 +0000272===-------------------------------------------------------------------------===
273
Chris Lattner98fbc2f2006-01-16 17:58:54 +0000274Darwin Stub LICM optimization:
275
276Loops like this:
277
278 for (...) bar();
279
280Have to go through an indirect stub if bar is external or linkonce. It would
281be better to compile it as:
282
283 fp = &bar;
284 for (...) fp();
285
286which only computes the address of bar once (instead of each time through the
287stub). This is Darwin specific and would have to be done in the code generator.
288Probably not a win on x86.
289
290===-------------------------------------------------------------------------===
291
292PowerPC i1/setcc stuff (depends on subreg stuff):
293
294Check out the PPC code we get for 'compare' in this testcase:
295http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
296
297oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
298invert, invert, or), we then have to compare it against zero instead of
299using the value already in a CR!
300
301that should be something like
302 cmpw cr7, r8, r5
303 cmpw cr0, r7, r3
304 crnand cr0, cr0, cr7
305 bne cr0, LBB_compare_4
306
307instead of
308 cmpw cr7, r8, r5
309 cmpw cr0, r7, r3
310 mfcr r7, 1
311 mcrf cr7, cr0
312 mfcr r8, 1
313 rlwinm r7, r7, 30, 31, 31
314 rlwinm r8, r8, 30, 31, 31
315 xori r7, r7, 1
316 xori r8, r8, 1
317 addi r2, r2, 1
318 or r7, r8, r7
319 cmpwi cr0, r7, 0
320 bne cr0, LBB_compare_4 ; loopexit
321
322===-------------------------------------------------------------------------===
323
324Simple IPO for argument passing, change:
325 void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
326
327the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
328of arguments get assigned to r3 through r10. That is, if you have a function
329foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
330argument bytes for r4 and r5. The trick then would be to shuffle the argument
331order for functions we can internalize so that the maximum number of
332integers/pointers get passed in regs before you see any of the fp arguments.
333
334Instead of implementing this, it would actually probably be easier to just
335implement a PPC fastcc, where we could do whatever we wanted to the CC,
336including having this work sanely.
337
338===-------------------------------------------------------------------------===
339
340Fix Darwin FP-In-Integer Registers ABI
341
342Darwin passes doubles in structures in integer registers, which is very very
343bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
344that percolates these things out of functions.
345
346Check out how horrible this is:
347http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
348
349This is an extension of "interprocedural CC unmunging" that can't be done with
350just fastcc.
351
352===-------------------------------------------------------------------------===
353
354Code Gen IPO optimization:
355
356Squish small scalar globals together into a single global struct, allowing the
357address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
358of the GOT on targets with one).
359
Chris Lattner3cda14f2006-01-19 02:09:38 +0000360===-------------------------------------------------------------------------===
361
362Generate lwbrx and other byteswapping load/store instructions when reasonable.
363
Chris Lattner96909792006-01-28 05:40:47 +0000364===-------------------------------------------------------------------------===
365
366Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
367TargetConstantVec's if it's one of the many forms that are algorithmically
368computable using the spiffy altivec instructions.
369
Chris Lattner56b69642006-01-31 02:55:28 +0000370===-------------------------------------------------------------------------===
371
372Compile this:
373
374double %test(double %X) {
375 %Y = cast double %X to long
376 %Z = cast long %Y to double
377 ret double %Z
378}
379
380to this:
381
382_test:
383 fctidz f0, f1
384 stfd f0, -8(r1)
385 lwz r2, -4(r1)
386 lwz r3, -8(r1)
387 stw r2, -12(r1)
388 stw r3, -16(r1)
389 lfd f0, -16(r1)
390 fcfid f1, f0
391 blr
392
393without the lwz/stw's.
394
Chris Lattner83e64ba2006-01-31 07:16:34 +0000395===-------------------------------------------------------------------------===
396
397Compile this:
398
399int foo(int a) {
400 int b = (a < 8);
401 if (b) {
402 return b * 3; // ignore the fact that this is always 3.
403 } else {
404 return 2;
405 }
406}
407
408into something not this:
409
410_foo:
4111) cmpwi cr7, r3, 8
412 mfcr r2, 1
413 rlwinm r2, r2, 29, 31, 31
4141) cmpwi cr0, r3, 7
415 bgt cr0, LBB1_2 ; UnifiedReturnBlock
416LBB1_1: ; then
417 rlwinm r2, r2, 0, 31, 31
418 mulli r3, r2, 3
419 blr
420LBB1_2: ; UnifiedReturnBlock
421 li r3, 2
422 blr
423
424In particular, the two compares (marked 1) could be shared by reversing one.
425This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
426same operands (but backwards) exists. In this case, this wouldn't save us
427anything though, because the compares still wouldn't be shared.
Chris Lattner0ddc1802006-02-01 00:28:12 +0000428
Chris Lattner5a7efc92006-02-01 17:54:23 +0000429===-------------------------------------------------------------------------===
430
431The legalizer should lower this:
432
433bool %test(ulong %x) {
434 %tmp = setlt ulong %x, 4294967296
435 ret bool %tmp
436}
437
438into "if x.high == 0", not:
439
440_test:
441 addi r2, r3, -1
442 cntlzw r2, r2
443 cntlzw r3, r3
444 srwi r2, r2, 5
Nate Begeman93c740b2006-02-02 07:27:56 +0000445 srwi r4, r3, 5
446 li r3, 0
Chris Lattner5a7efc92006-02-01 17:54:23 +0000447 cmpwi cr0, r2, 0
448 bne cr0, LBB1_2 ;
449LBB1_1:
Nate Begeman93c740b2006-02-02 07:27:56 +0000450 or r3, r4, r4
Chris Lattner5a7efc92006-02-01 17:54:23 +0000451LBB1_2:
Chris Lattner5a7efc92006-02-01 17:54:23 +0000452 blr
453
454noticed in 2005-05-11-Popcount-ffs-fls.c.
Chris Lattner275b8842006-02-02 07:37:11 +0000455
456
457===-------------------------------------------------------------------------===
458
459We should custom expand setcc instead of pretending that we have it. That
460would allow us to expose the access of the crbit after the mfcr, allowing
461that access to be trivially folded into other ops. A simple example:
462
463int foo(int a, int b) { return (a < b) << 4; }
464
465compiles into:
466
467_foo:
468 cmpw cr7, r3, r4
469 mfcr r2, 1
470 rlwinm r2, r2, 29, 31, 31
471 slwi r3, r2, 4
472 blr
473
Chris Lattnerd463f7f2006-02-03 01:49:49 +0000474===-------------------------------------------------------------------------===
475
476Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
precision don't matter (-ffast-math). Misc/mandel will like this. :)
478
Nate Begemana63fee82006-02-03 05:17:06 +0000479===-------------------------------------------------------------------------===
480
481Fold add and sub with constant into non-extern, non-weak addresses so this:
482
483static int a;
484void bar(int b) { a = b; }
485void foo(unsigned char *c) {
486 *c = a;
487}
488
489So that
490
491_foo:
492 lis r2, ha16(_a)
493 la r2, lo16(_a)(r2)
494 lbz r2, 3(r2)
495 stb r2, 0(r3)
496 blr
497
498Becomes
499
500_foo:
501 lis r2, ha16(_a+3)
502 lbz r2, lo16(_a+3)(r2)
503 stb r2, 0(r3)
504 blr