blob: 58805562a87e94776eb3802de9c4953d8cbd094c [file] [log] [blame]
Nate Begemanb64af912004-08-10 20:42:36 +00001TODO:
Nate Begemanef9531e2005-04-11 20:48:57 +00002* gpr0 allocation
Nate Begeman4a0de072004-10-26 04:10:53 +00003* implement do-loop -> bdnz transform
Nate Begemanca068e82004-08-14 22:16:36 +00004* implement powerpc-64 for darwin
Nate Begemand332fd52004-08-29 22:02:43 +00005* use stfiwx in float->int
Nate Begeman4ad870d2005-07-26 18:59:06 +00006* be able to combine sequences like the following into 2 instructions:
7 lis r2, ha16(l2__ZTV4Cell)
8 la r2, lo16(l2__ZTV4Cell)(r2)
9 addi r2, r2, 8
Chris Lattnerb65975a2005-07-26 19:07:51 +000010
Nate Begeman5a014812005-08-14 01:17:16 +000011* Teach LLVM how to codegen this:
12unsigned short foo(float a) { return a; }
13as:
14_foo:
15 fctiwz f0,f1
16 stfd f0,-8(r1)
17 lhz r3,-2(r1)
18 blr
19not:
20_foo:
21 fctiwz f0, f1
22 stfd f0, -8(r1)
23 lwz r2, -4(r1)
24 rlwinm r3, r2, 0, 16, 31
25 blr
26
Chris Lattner3d8df552005-10-18 06:30:51 +000027and:
28 extern int X, Y; int* test(int C) { return C? &X : &Y; }
29as one load when using --enable-pic.
Nate Begeman5a014812005-08-14 01:17:16 +000030
Chris Lattner6281ae42005-08-05 19:18:32 +000031* Support 'update' load/store instructions. These are cracked on the G5, but
32 are still a codesize win.
33
Misha Brukman4ce5ce22004-07-27 18:43:04 +000034* should hint to the branch select pass that it doesn't need to print the
35 second unconditional branch, so we don't end up with things like:
Misha Brukman4ce5ce22004-07-27 18:43:04 +000036 b .LBBl42__2E_expand_function_8_674 ; loopentry.24
37 b .LBBl42__2E_expand_function_8_42 ; NewDefault
38 b .LBBl42__2E_expand_function_8_42 ; NewDefault
Chris Lattner424dcbd2005-08-23 06:27:59 +000039
Chris Lattnera3c44542005-08-24 18:15:24 +000040===-------------------------------------------------------------------------===
41
Chris Lattner424dcbd2005-08-23 06:27:59 +000042* Codegen this:
43
44 void test2(int X) {
45 if (X == 0x12345678) bar();
46 }
47
48 as:
49
50 xoris r0,r3,0x1234
51 cmpwi cr0,r0,0x5678
52 beq cr0,L6
53
54 not:
55
56 lis r2, 4660
57 ori r2, r2, 22136
58 cmpw cr0, r3, r2
59 bne .LBB_test2_2
60
Chris Lattnera3c44542005-08-24 18:15:24 +000061===-------------------------------------------------------------------------===
62
63Lump the constant pool for each function into ONE pic object, and reference
64pieces of it as offsets from the start. For functions like this (contrived
65to have lots of constants obviously):
66
67double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
68
69We generate:
70
71_X:
72 lis r2, ha16(.CPI_X_0)
73 lfd f0, lo16(.CPI_X_0)(r2)
74 lis r2, ha16(.CPI_X_1)
75 lfd f2, lo16(.CPI_X_1)(r2)
76 fmadd f0, f1, f0, f2
77 lis r2, ha16(.CPI_X_2)
78 lfd f1, lo16(.CPI_X_2)(r2)
79 lis r2, ha16(.CPI_X_3)
80 lfd f2, lo16(.CPI_X_3)(r2)
81 fmadd f1, f0, f1, f2
82 blr
83
84It would be better to materialize .CPI_X into a register, then use immediates
85off of the register to avoid the lis's. This is even more important in PIC
86mode.
87
88===-------------------------------------------------------------------------===
Nate Begeman92cce902005-09-06 15:30:48 +000089
90Implement the Newton-Raphson method for improving estimate instructions to the
91correct accuracy, and implement divide as multiply by reciprocal when it has
92more than one use. Itanium will want this too.
Nate Begeman21e463b2005-10-16 05:39:50 +000093
94===-------------------------------------------------------------------------===
95
96int foo(int a, int b) { return a == b ? 16 : 0; }
97_foo:
98 cmpw cr7, r3, r4
99 mfcr r2
100 rlwinm r2, r2, 31, 31, 31
101 slwi r3, r2, 4
102 blr
103
104If we exposed the srl & mask ops after the MFCR that we are doing to select
105the correct CR bit, then we could fold the slwi into the rlwinm before it.
Nate Begeman5cd61ce2005-10-25 23:50:02 +0000106
107===-------------------------------------------------------------------------===
108
109#define ARRAY_LENGTH 16
110
111union bitfield {
112 struct {
113#ifndef __ppc__
114 unsigned int field0 : 6;
115 unsigned int field1 : 6;
116 unsigned int field2 : 6;
117 unsigned int field3 : 6;
118 unsigned int field4 : 3;
119 unsigned int field5 : 4;
120 unsigned int field6 : 1;
121#else
122 unsigned int field6 : 1;
123 unsigned int field5 : 4;
124 unsigned int field4 : 3;
125 unsigned int field3 : 6;
126 unsigned int field2 : 6;
127 unsigned int field1 : 6;
128 unsigned int field0 : 6;
129#endif
130 } bitfields, bits;
131 unsigned int u32All;
132 signed int i32All;
133 float f32All;
134};
135
136
137typedef struct program_t {
138 union bitfield array[ARRAY_LENGTH];
139 int size;
140 int loaded;
141} program;
142
143
144void AdjustBitfields(program* prog, unsigned int fmt1)
145{
146 unsigned int shift = 0;
147 unsigned int texCount = 0;
148 unsigned int i;
149
150 for (i = 0; i < 8; i++)
151 {
152 prog->array[i].bitfields.field0 = texCount;
153 prog->array[i].bitfields.field1 = texCount + 1;
154 prog->array[i].bitfields.field2 = texCount + 2;
155 prog->array[i].bitfields.field3 = texCount + 3;
156
157 texCount += (fmt1 >> shift) & 0x7;
158 shift += 3;
159 }
160}
161
162In the loop above, the bitfield adds get generated as
163(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
164
165Since the input to the (or (and), (and)) is an (add) rather than a (shl), the
166shift doesn't get folded into the rlwimi instruction. We should ideally see through
167things like this, rather than forcing llvm to generate the equivalent
168
169(shl (add bitfield, C2), C1) with some kind of mask.
Chris Lattner01959102005-10-28 00:20:45 +0000170
171===-------------------------------------------------------------------------===
172
173Compile this (standard bitfield insert of a constant):
174void %test(uint* %tmp1) {
175 %tmp2 = load uint* %tmp1 ; <uint> [#uses=1]
176 %tmp5 = or uint %tmp2, 257949696 ; <uint> [#uses=1]
177 %tmp6 = and uint %tmp5, 4018143231 ; <uint> [#uses=1]
178 store uint %tmp6, uint* %tmp1
179 ret void
180}
181
182to:
183
184_test:
185 lwz r0,0(r3)
186 li r2,123
187 rlwimi r0,r2,21,3,10
188 stw r0,0(r3)
189 blr
190
191instead of:
192
193_test:
194 lis r2, -4225
195 lwz r4, 0(r3)
196 ori r2, r2, 65535
197 oris r4, r4, 3936
198 and r2, r4, r2
199 stw r2, 0(r3)
200 blr
201
Chris Lattnerae4664a2005-11-05 08:57:56 +0000202===-------------------------------------------------------------------------===
203
204Compile this:
205
206int %f1(int %a, int %b) {
207 %tmp.1 = and int %a, 15 ; <int> [#uses=1]
208 %tmp.3 = and int %b, 240 ; <int> [#uses=1]
209 %tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
210 ret int %tmp.4
211}
212
213without a copy. We make this currently:
214
215_f1:
216 rlwinm r2, r4, 0, 24, 27
217 rlwimi r2, r3, 0, 28, 31
218 or r3, r2, r2
219 blr
220
221The two-addr pass or RA needs to learn when it is profitable to commute an
222instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
223currently only commutes to avoid inserting a copy BEFORE the two addr instr.
224
Chris Lattner62c08dd2005-12-08 07:13:28 +0000225===-------------------------------------------------------------------------===
226
227Compile offsets from allocas:
228
229int *%test() {
230 %X = alloca { int, int }
231 %Y = getelementptr {int,int}* %X, int 0, uint 1
232 ret int* %Y
233}
234
235into a single add, not two:
236
237_test:
238 addi r2, r1, -8
239 addi r3, r2, 4
240 blr
241
242--> important for C++.
243
Chris Lattner39706e62005-12-22 17:19:28 +0000244===-------------------------------------------------------------------------===
245
246int test3(int a, int b) { return (a < 0) ? a : 0; }
247
248should be branch-free code. LLVM is turning the compare into (a < 1) because of the RHS.
249
250===-------------------------------------------------------------------------===
251
252For this testcase:
253int f1(int a, int b) { return (a&0xF)|(b&0xF0); }
254
255We currently emit:
256_f1:
257 rlwinm r2, r4, 0, 24, 27
258 rlwimi r2, r3, 0, 28, 31
259 or r3, r2, r2
260 blr
261
262We could emit:
263_f1:
264 rlwinm r4, r4, 0, 24, 27
265 rlwimi r3, r4, 0, 0, 27
266 blr
267
268===-------------------------------------------------------------------------===
269
270No loads or stores of the constants should be needed:
271
272struct foo { double X, Y; };
273void xxx(struct foo F);
274void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
275