TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
* use stfiwx in float->int

* Fold add and sub with constant into non-extern, non-weak addresses so this:
        lis r2, ha16(l2__ZTV4Cell)
        la r2, lo16(l2__ZTV4Cell)(r2)
        addi r2, r2, 8
becomes:
        lis r2, ha16(l2__ZTV4Cell+8)
        la r2, lo16(l2__ZTV4Cell+8)(r2)
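
A sketch of the kind of C source that produces this pattern (hypothetical names;
the constant offset typically comes from addressing a fixed slot of a global
defined in the same module, e.g. a vtable entry or array element):

static int G[4];
int *addr_of_third(void) { return &G[2]; }   /* address of G plus a constant 8 */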


* Teach LLVM how to codegen this:
unsigned short foo(float a) { return a; }
as:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

* Support 'update' load/store instructions.  These are cracked on the G5, but
  are still a codesize win.

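For the 'update' load/store item above, a minimal sketch (hypothetical function)
of a loop where the per-iteration pointer advances could fold into pre-increment
(lwzu/stwu-style) addressing, where the updated address is written back to the
base register:

void copy_words(int *dst, int *src, int n) {
  int i;
  for (i = 0; i < n; i++)
    dst[i] = src[i];   /* one load, one store, and two address increments per iteration */
}
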
* should hint to the branch select pass that it doesn't need to print the
  second unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

===-------------------------------------------------------------------------===

* Codegen this:

   void test2(int X) {
     if (X == 0x12345678) bar();
   }

   as:

       xoris r0,r3,0x1234
       cmpwi cr0,r0,0x5678
       beq cr0,L6

   not:

       lis r2, 4660
       ori r2, r2, 22136
       cmpw cr0, r3, r2
       bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for refining estimate instructions (fres,
frsqrte) to the required accuracy, and implement divide as multiply-by-reciprocal
when the divisor has more than one use.  Itanium will want this too.

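A minimal sketch (hypothetical function) of the reuse case: two divides share
one divisor, so a single refined reciprocal estimate could replace both divides
with multiplies:

void scale(float a, float b, float d, float *x, float *y) {
  *x = a / d;   /* both divides use the divisor d ...            */
  *y = b / d;   /* ... so compute 1/d once and multiply twice    */
}
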
===-------------------------------------------------------------------------===

int foo(int a, int b) { return a == b ? 16 : 0; }
_foo:
        cmpw cr7, r3, r4
        mfcr r2
        rlwinm r2, r2, 31, 31, 31
        slwi r3, r2, 4
        blr

If we exposed the srl & mask ops after the MFCR that we are doing to select
the correct CR bit, then we could fold the slwi into the rlwinm before it.

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
  struct {
#ifndef __ppc__
    unsigned int field0 : 6;
    unsigned int field1 : 6;
    unsigned int field2 : 6;
    unsigned int field3 : 6;
    unsigned int field4 : 3;
    unsigned int field5 : 4;
    unsigned int field6 : 1;
#else
    unsigned int field6 : 1;
    unsigned int field5 : 4;
    unsigned int field4 : 3;
    unsigned int field3 : 6;
    unsigned int field2 : 6;
    unsigned int field1 : 6;
    unsigned int field0 : 6;
#endif
  } bitfields, bits;
  unsigned int u32All;
  signed int i32All;
  float f32All;
};


typedef struct program_t {
  union bitfield array[ARRAY_LENGTH];
  int size;
  int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
  unsigned int shift = 0;
  unsigned int texCount = 0;
  unsigned int i;

  for (i = 0; i < 8; i++)
  {
    prog->array[i].bitfields.field0 = texCount;
    prog->array[i].bitfields.field1 = texCount + 1;
    prog->array[i].bitfields.field2 = texCount + 2;
    prog->array[i].bitfields.field3 = texCount + 3;

    texCount += (fmt1 >> shift) & 0x7;
    shift += 3;
  }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the shift
doesn't get folded into the rlwimi instruction.  We should ideally see through
things like this, rather than forcing llvm to generate the equivalent

(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15                 ; <int> [#uses=1]
        %tmp.3 = and int %b, 240                ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

176.gcc contains a bunch of code like this (this occurs dozens of times):

int %test(uint %mode.0.i.0) {
        %tmp.79 = cast uint %mode.0.i.0 to sbyte        ; <sbyte> [#uses=1]
        %tmp.80 = cast sbyte %tmp.79 to int             ; <int> [#uses=1]
        %tmp.81 = shl int %tmp.80, ubyte 16             ; <int> [#uses=1]
        %tmp.82 = and int %tmp.81, 16711680
        ret int %tmp.82
}

which we compile to:

_test:
        extsb r2, r3
        rlwinm r3, r2, 16, 8, 15
        blr

The extsb is obviously dead.  This can be handled by a future thing like
MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
the sign bits are never used, so we can fold the sext_inreg to nothing).

I'm seeing code like this:

        srwi r3, r3, 16
        extsb r3, r3
        rlwimi r4, r3, 16, 8, 15

in which the extsb is preventing the srwi from being nuked.

===-------------------------------------------------------------------------===

Another example that occurs is:

uint %test(int %specbits.6.1) {
        %tmp.2540 = shr int %specbits.6.1, ubyte 11     ; <int> [#uses=1]
        %tmp.2541 = cast int %tmp.2540 to uint          ; <uint> [#uses=1]
        %tmp.2542 = shl uint %tmp.2541, ubyte 13        ; <uint> [#uses=1]
        %tmp.2543 = and uint %tmp.2542, 8192            ; <uint> [#uses=1]
        ret uint %tmp.2543
}

which we codegen as:

l1_test:
        srawi r2, r3, 11
        rlwinm r3, r2, 13, 18, 18
        blr

the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
dead), which I think can then be folded into the rlwinm.

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch-free code.  LLVM currently turns the compare into '< 1'
because the RHS of the select is 0.

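A branch-free formulation of the same function, as a sketch of what the select
lowering could amount to (mask the value with its own sign bits):

int test3_branchfree(int a, int b) {
  return a & (a >> 31);   /* assumes 32-bit int and an arithmetic right shift */
}
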
===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

For this:

int h(int i, int j, int k) {
   return (i==0||j==0||k == 0);
}

We currently emit this:

_h:
        cntlzw r2, r3
        cntlzw r3, r4
        cntlzw r4, r5
        srwi r2, r2, 5
        srwi r3, r3, 5
        srwi r4, r4, 5
        or r2, r3, r2
        or r3, r2, r4
        blr

The ctlz/shift instructions are created by the isel, so the dag combiner doesn't
have a chance to pull the shifts through the or's (eliminating two
instructions).  SETCC nodes should be custom lowered in this case, not expanded
by the isel.

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin-specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4  ; loopexit

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

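A sketch of the before/after prototypes with the register assignments described
above (hypothetical function names, Darwin 32-bit ABI):

/* before: X -> r3, Y -> f1, Z -> r6; the double burns the bytes for r4/r5 */
void foo_before(int X, double Y, int Z);

/* after internalizing and reordering: X -> r3, Z -> r4, Y -> f1 */
void foo_after(int X, int Z, double Y);
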
===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

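A before/after sketch of the transformation (hypothetical globals; each access
would become a constant offset from the struct's one materialized address):

/* before: three separate globals, three separate address computations */
int a, b, c;

/* after: one struct whose base address can be CSE'd */
struct { int a, b, c; } _globals;
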
===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.

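A sketch of source that a byte-reversed load could implement directly
(hypothetical function; on big-endian PPC this amounts to a single lwbrx):

unsigned load_le32(const unsigned char *p) {
  return  (unsigned)p[0]        | ((unsigned)p[1] << 8) |
         ((unsigned)p[2] << 16) | ((unsigned)p[3] << 24);
}
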
===-------------------------------------------------------------------------===

Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.

===-------------------------------------------------------------------------===

Compile this:

double %test(double %X) {
        %Y = cast double %X to long
        %Z = cast long %Y to double
        ret double %Z
}

to this:

_test:
        fctidz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        lwz r3, -8(r1)
        stw r2, -12(r1)
        stw r3, -16(r1)
        lfd f0, -16(r1)
        fcfid f1, f0
        blr

without the lwz/stw's.
