blob: 88582966994acd5d2e540503f7f08141321de8f5 [file] [log] [blame]
Nate Begemanb64af912004-08-10 20:42:36 +00001TODO:
Nate Begemanef9531e2005-04-11 20:48:57 +00002* gpr0 allocation
Nate Begeman4a0de072004-10-26 04:10:53 +00003* implement do-loop -> bdnz transform
Nate Begemanca068e82004-08-14 22:16:36 +00004* implement powerpc-64 for darwin
Nate Begemand332fd52004-08-29 22:02:43 +00005* use stfiwx in float->int
Nate Begeman4ad870d2005-07-26 18:59:06 +00006* be able to combine sequences like the following into 2 instructions:
7 lis r2, ha16(l2__ZTV4Cell)
8 la r2, lo16(l2__ZTV4Cell)(r2)
9 addi r2, r2, 8
Chris Lattnerb65975a2005-07-26 19:07:51 +000010
Nate Begeman5a014812005-08-14 01:17:16 +000011* Teach LLVM how to codegen this:
12unsigned short foo(float a) { return a; }
13as:
14_foo:
15 fctiwz f0,f1
16 stfd f0,-8(r1)
17 lhz r3,-2(r1)
18 blr
19not:
20_foo:
21 fctiwz f0, f1
22 stfd f0, -8(r1)
23 lwz r2, -4(r1)
24 rlwinm r3, r2, 0, 16, 31
25 blr
26
Chris Lattner3d8df552005-10-18 06:30:51 +000027and:
28 extern int X, Y; int* test(int C) { return C? &X : &Y; }
29as one load when using --enable-pic.
Nate Begeman5a014812005-08-14 01:17:16 +000030
Chris Lattner6281ae42005-08-05 19:18:32 +000031* Support 'update' load/store instructions. These are cracked on the G5, but
32 are still a codesize win.
33
Chris Lattnerc7e18a12005-08-09 22:30:57 +000034* Add a custom legalizer for the GlobalAddress node, to move the funky darwin
35 stub stuff from the instruction selector to the legalizer (exposing low-level
36 operations to the dag for optzn). For example, we want to codegen this:
37
38 int A = 0;
39 void B() { A++; }
40 as:
41 lis r9,ha16(_A)
42 lwz r2,lo16(_A)(r9)
43 addi r2,r2,1
44 stw r2,lo16(_A)(r9)
45 not:
46 lis r2, ha16(_A)
47 lwz r2, lo16(_A)(r2)
48 addi r2, r2, 1
49 lis r3, ha16(_A)
50 stw r2, lo16(_A)(r3)
51
Misha Brukman4ce5ce22004-07-27 18:43:04 +000052* should hint to the branch select pass that it doesn't need to print the
53 second unconditional branch, so we don't end up with things like:
Misha Brukman4ce5ce22004-07-27 18:43:04 +000054 b .LBBl42__2E_expand_function_8_674 ; loopentry.24
55 b .LBBl42__2E_expand_function_8_42 ; NewDefault
56 b .LBBl42__2E_expand_function_8_42 ; NewDefault
Chris Lattner424dcbd2005-08-23 06:27:59 +000057
Chris Lattnera3c44542005-08-24 18:15:24 +000058===-------------------------------------------------------------------------===
59
Chris Lattner424dcbd2005-08-23 06:27:59 +000060* Codegen this:
61
62 void test2(int X) {
63 if (X == 0x12345678) bar();
64 }
65
66 as:
67
68 xoris r0,r3,0x1234
69 cmpwi cr0,r0,0x5678
70 beq cr0,L6
71
72 not:
73
74 lis r2, 4660
75 ori r2, r2, 22136
76 cmpw cr0, r3, r2
77 bne .LBB_test2_2
78
Chris Lattnera3c44542005-08-24 18:15:24 +000079===-------------------------------------------------------------------------===
80
81Lump the constant pool for each function into ONE pic object, and reference
82pieces of it as offsets from the start. For functions like this (contrived
83to have lots of constants obviously):
84
85double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
86
87We generate:
88
89_X:
90 lis r2, ha16(.CPI_X_0)
91 lfd f0, lo16(.CPI_X_0)(r2)
92 lis r2, ha16(.CPI_X_1)
93 lfd f2, lo16(.CPI_X_1)(r2)
94 fmadd f0, f1, f0, f2
95 lis r2, ha16(.CPI_X_2)
96 lfd f1, lo16(.CPI_X_2)(r2)
97 lis r2, ha16(.CPI_X_3)
98 lfd f2, lo16(.CPI_X_3)(r2)
99 fmadd f1, f0, f1, f2
100 blr
101
102It would be better to materialize .CPI_X into a register, then use immediates
103off of the register to avoid the lis's. This is even more important in PIC
104mode.
105
106===-------------------------------------------------------------------------===
Nate Begeman92cce902005-09-06 15:30:48 +0000107
108Implement the Newton-Raphson method for improving estimate instructions to the
109correct accuracy, and implementing divide as multiply by reciprocal when it has
110more than one use. Itanium will want this too.
Nate Begeman21e463b2005-10-16 05:39:50 +0000111
112===-------------------------------------------------------------------------===
113
114int foo(int a, int b) { return a == b ? 16 : 0; }
115_foo:
116 cmpw cr7, r3, r4
117 mfcr r2
118 rlwinm r2, r2, 31, 31, 31
119 slwi r3, r2, 4
120 blr
121
122If we exposed the srl & mask ops after the MFCR that we are doing to select
123the correct CR bit, then we could fold the slwi into the rlwinm before it.
Nate Begeman5cd61ce2005-10-25 23:50:02 +0000124
125===-------------------------------------------------------------------------===
126
127#define ARRAY_LENGTH 16
128
129union bitfield {
130 struct {
131#ifndef __ppc__
132 unsigned int field0 : 6;
133 unsigned int field1 : 6;
134 unsigned int field2 : 6;
135 unsigned int field3 : 6;
136 unsigned int field4 : 3;
137 unsigned int field5 : 4;
138 unsigned int field6 : 1;
139#else
140 unsigned int field6 : 1;
141 unsigned int field5 : 4;
142 unsigned int field4 : 3;
143 unsigned int field3 : 6;
144 unsigned int field2 : 6;
145 unsigned int field1 : 6;
146 unsigned int field0 : 6;
147#endif
148 } bitfields, bits;
149 unsigned int u32All;
150 signed int i32All;
151 float f32All;
152};
153
154
155typedef struct program_t {
156 union bitfield array[ARRAY_LENGTH];
157 int size;
158 int loaded;
159} program;
160
161
162void AdjustBitfields(program* prog, unsigned int fmt1)
163{
164 unsigned int shift = 0;
165 unsigned int texCount = 0;
166 unsigned int i;
167
168 for (i = 0; i < 8; i++)
169 {
170 prog->array[i].bitfields.field0 = texCount;
171 prog->array[i].bitfields.field1 = texCount + 1;
172 prog->array[i].bitfields.field2 = texCount + 2;
173 prog->array[i].bitfields.field3 = texCount + 3;
174
175 texCount += (fmt1 >> shift) & 0x7;
176 shift += 3;
177 }
178}
179
180In the loop above, the bitfield adds get generated as
181(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
182
183Since the input to the (or and, and) is an (add) rather than a (shl), the shift
184doesn't get folded into the rlwimi instruction. We should ideally see through
185things like this, rather than forcing llvm to generate the equivalent
186
187(shl (add bitfield, C2), C1) with some kind of mask.
Chris Lattner01959102005-10-28 00:20:45 +0000188
189===-------------------------------------------------------------------------===
190
191Compile this (standard bitfield insert of a constant):
192void %test(uint* %tmp1) {
193 %tmp2 = load uint* %tmp1 ; <uint> [#uses=1]
194 %tmp5 = or uint %tmp2, 257949696 ; <uint> [#uses=1]
195 %tmp6 = and uint %tmp5, 4018143231 ; <uint> [#uses=1]
196 store uint %tmp6, uint* %tmp1
197 ret void
198}
199
200to:
201
202_test:
203 lwz r0,0(r3)
204 li r2,123
205 rlwimi r0,r2,21,3,10
206 stw r0,0(r3)
207 blr
208
209instead of:
210
211_test:
212 lis r2, -4225
213 lwz r4, 0(r3)
214 ori r2, r2, 65535
215 oris r4, r4, 3936
216 and r2, r4, r2
217 stw r2, 0(r3)
218 blr
219
Chris Lattnerae4664a2005-11-05 08:57:56 +0000220===-------------------------------------------------------------------------===
221
222Compile this:
223
224int %f1(int %a, int %b) {
225 %tmp.1 = and int %a, 15 ; <int> [#uses=1]
226 %tmp.3 = and int %b, 240 ; <int> [#uses=1]
227 %tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
228 ret int %tmp.4
229}
230
231without a copy. We make this currently:
232
233_f1:
234 rlwinm r2, r4, 0, 24, 27
235 rlwimi r2, r3, 0, 28, 31
236 or r3, r2, r2
237 blr
238
239The two-addr pass or RA needs to learn when it is profitable to commute an
240instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
241currently only commutes to avoid inserting a copy BEFORE the two addr instr.
242