diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 071b399..5cf2ffd 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -49,6 +49,7 @@
 
 STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
 STATISTIC(NumCommuted        , "Number of instructions commuted to coalesce");
+STATISTIC(NumAggrCommuted    , "Number of instructions aggressively commuted");
 STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
 STATISTIC(Num3AddrSunk,        "Number of 3-address instructions sunk");
 STATISTIC(NumReMats,           "Number of instructions re-materialized");
@@ -70,6 +71,15 @@
                              MachineBasicBlock *MBB, unsigned Loc,
                              DenseMap<MachineInstr*, unsigned> &DistanceMap);
 
+    bool NoUseAfterLastDef(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist,
+                           DenseMap<MachineInstr*, unsigned> &DistanceMap,
+                           unsigned &LastDef);
+
+    bool isProfitableToCommute(unsigned regB, unsigned regC,
+                               MachineInstr *MI, MachineBasicBlock *MBB,
+                               unsigned Dist,
+                               DenseMap<MachineInstr*, unsigned> &DistanceMap);
+
     bool CommuteInstruction(MachineBasicBlock::iterator &mi,
                             MachineFunction::iterator &mbbi,
                             unsigned RegC, unsigned Dist,
@@ -230,8 +240,6 @@
   for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
          UE = MRI->use_end(); UI != UE; ++UI) {
     MachineOperand &UseMO = UI.getOperand();
-    if (!UseMO.isUse())
-      continue;
     MachineInstr *UseMI = UseMO.getParent();
     MachineBasicBlock *UseMBB = UseMI->getParent();
     if (UseMBB == MBB) {
@@ -255,6 +263,82 @@
   return MBB == DefMI->getParent();
 }
 
+/// NoUseAfterLastDef - Intended to return true if Reg has no use between its
+/// last def in MBB and the two-address instruction at distance Dist. LastDef
+/// returns that def's distance by reference (0 if none was recorded). Only
+/// instructions of MBB that already have a DistanceMap entry are considered.
+bool TwoAddressInstructionPass::NoUseAfterLastDef(unsigned Reg,
+                                 MachineBasicBlock *MBB, unsigned Dist,
+                                 DenseMap<MachineInstr*, unsigned> &DistanceMap,
+                                 unsigned &LastDef) {
+  LastDef = 0;
+  unsigned LastUse = Dist; // NOTE(review): ends as the smallest use distance.
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
+         E = MRI->reg_end(); I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    MachineInstr *MI = MO.getParent();
+    if (MI->getParent() != MBB)
+      continue; // Ignore references from other blocks.
+    DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+    if (DI == DistanceMap.end())
+      continue; // Ignore instructions without a recorded distance.
+    if (MO.isUse() && DI->second < LastUse)
+      LastUse = DI->second;
+    if (MO.isDef() && DI->second > LastDef)
+      LastDef = DI->second;
+  }
+  // False only if the smallest use distance lies strictly in (LastDef, Dist).
+  return !(LastUse > LastDef && LastUse < Dist);
+}
+
+/// isProfitableToCommute - Return true if it's potentially profitable to
+/// commute MI, the two-address instruction that's being processed.
+bool
+TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC,
+                MachineInstr *MI, MachineBasicBlock *MBB,
+                unsigned Dist, DenseMap<MachineInstr*, unsigned> &DistanceMap) {
+  // Determine if it's profitable to commute this two address instruction. In
+  // general, we want no uses between this instruction and the definition of
+  // the two-address register.
+  // e.g.
+  // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+  // %reg1029<def> = MOV8rr %reg1028
+  // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+  // insert => %reg1030<def> = MOV8rr %reg1028
+  // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+  // In this case, it might not be possible to coalesce the second MOV8rr
+  // instruction if the first one is coalesced. So it would be profitable to
+  // commute it:
+  // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+  // %reg1029<def> = MOV8rr %reg1028
+  // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+  // insert => %reg1030<def> = MOV8rr %reg1029
+  // %reg1030<def> = ADD8rr %reg1029<kill>, %reg1028<kill>, %EFLAGS<imp-def,dead>
+  // Bail out unless MI kills regC.
+  if (!MI->killsRegister(regC))
+    return false;
+
+  // Ok, we have something like:
+  // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+  // let's see if it's worth commuting it.
+
+  // If there is a use of regC between its last def (could be livein) and this
+  // instruction, then bail.
+  unsigned LastDefC = 0;
+  if (!NoUseAfterLastDef(regC, MBB, Dist, DistanceMap, LastDefC))
+    return false;
+
+  // If there is a use of regB between its last def (could be livein) and this
+  // instruction, then go ahead and make this transformation.
+  unsigned LastDefB = 0;
+  if (!NoUseAfterLastDef(regB, MBB, Dist, DistanceMap, LastDefB))
+    return true;
+
+  // Neither register has an intervening use. Commute only if both last defs
+  // were found (non-zero) and regC's is closer: its live interval is shorter.
+  return LastDefB && LastDefC && LastDefC > LastDefB;
+}
+
 /// CommuteInstruction - Commute a two-address instruction and update the basic
 /// block, distance map, and live variables if needed. Return true if it is
 /// successful.
@@ -419,6 +503,17 @@
             }
           }
 
+          // If it's profitable to commute the instruction, do so.
+          if (TID.isCommutable() && mi->getNumOperands() >= 3) {
+            unsigned regC = mi->getOperand(3-si).getReg();
+            if (isProfitableToCommute(regB, regC, mi, mbbi, Dist, DistanceMap))
+              if (CommuteInstruction(mi, mbbi, regC, Dist, DistanceMap)) {
+                ++NumAggrCommuted;
+                ++NumCommuted;
+                regB = regC;
+              }
+          }
+
         InstructionRearranged:
           const TargetRegisterClass* rc = MRI->getRegClass(regA);
           MachineInstr *DefMI = MRI->getVRegDef(regB);
@@ -436,7 +531,10 @@
             TII->copyRegToReg(*mbbi, mi, regA, regB, rc, rc);
           }
 
-          MachineBasicBlock::iterator prevMi = prior(mi);
+          MachineBasicBlock::iterator prevMI = prior(mi);
+          // Update DistanceMap.
+          DistanceMap.insert(std::make_pair(prevMI, Dist));
+          DistanceMap[mi] = ++Dist;
 
           // Update live variables for regB.
           if (LV) {
@@ -446,13 +544,13 @@
             varInfoB.UsedBlocks[mbbi->getNumber()] = true;
 
             if (LV->removeVirtualRegisterKilled(regB,  mi))
-              LV->addVirtualRegisterKilled(regB, prevMi);
+              LV->addVirtualRegisterKilled(regB, prevMI);
 
             if (LV->removeVirtualRegisterDead(regB, mi))
-              LV->addVirtualRegisterDead(regB, prevMi);
+              LV->addVirtualRegisterDead(regB, prevMI);
           }
 
-          DOUT << "\t\tprepend:\t"; DEBUG(prevMi->print(*cerr.stream(), &TM));
+          DOUT << "\t\tprepend:\t"; DEBUG(prevMI->print(*cerr.stream(), &TM));
           
           // Replace all occurences of regB with regA.
           for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
diff --git a/test/CodeGen/X86/2008-02-22-ReMatBug.ll b/test/CodeGen/X86/2008-02-22-ReMatBug.ll
index f78d526..539fc15 100644
--- a/test/CodeGen/X86/2008-02-22-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-02-22-ReMatBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -stats |& grep {Number of re-materialization} | grep 3
+; RUN: llvm-as < %s | llc -march=x86 -stats |& grep {Number of re-materialization} | grep 4
 ; RUN: llvm-as < %s | llc -march=x86 -stats |& grep {Number of dead spill slots removed}
 ; rdar://5761454
 
diff --git a/test/CodeGen/X86/2008-07-19-movups-spills.ll b/test/CodeGen/X86/2008-07-19-movups-spills.ll
index ef5c7c5..8800357 100644
--- a/test/CodeGen/X86/2008-07-19-movups-spills.ll
+++ b/test/CodeGen/X86/2008-07-19-movups-spills.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=i686-pc-linux -realign-stack=1 -mattr=sse2 | grep movaps | count 76
+; RUN: llvm-as < %s | llc -mtriple=i686-pc-linux -realign-stack=1 -mattr=sse2 | grep movaps | count 75
 ; RUN: llvm-as < %s | llc -mtriple=i686-pc-linux -realign-stack=0 -mattr=sse2 | grep movaps | count 1
 ; PR2539
 
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 5ee0932..b619411 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -stack-alignment=16 > %t
 ; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 19
+; RUN: grep mov %t | count 15
 
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
         %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
new file mode 100644
index 0000000..c369d91
--- /dev/null
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s | llc -march=x86 -join-cross-class-copies -stats |& \
+; RUN:   grep {twoaddrinstr} | grep {Number of instructions aggressively commuted}
+; rdar://6523745
+
+@"\01LC" = internal constant [4 x i8] c"%d\0A\00"		; <[4 x i8]*> [#uses=1]
+
+define i32 @main() nounwind {
+bb1.thread:
+	br label %bb1
+
+bb1:		; preds = %bb1, %bb1.thread
+	%i.0.reg2mem.0 = phi i32 [ 0, %bb1.thread ], [ %indvar.next, %bb1 ]		; <i32> [#uses=2]
+	%0 = trunc i32 %i.0.reg2mem.0 to i8		; <i8> [#uses=1]
+	%1 = sdiv i8 %0, 2		; <i8> [#uses=1] NOTE(review): presumably this sdiv lowers to the SHR8ri/ADD8rr pair that gets commuted -- confirm
+	%2 = sext i8 %1 to i32		; <i32> [#uses=1]
+	%3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @"\01LC", i32 0, i32 0), i32 %2) nounwind		; <i32> [#uses=0]
+	%indvar.next = add i32 %i.0.reg2mem.0, 1		; <i32> [#uses=2]
+	%exitcond = icmp eq i32 %indvar.next, 258		; <i1> [#uses=1]
+	br i1 %exitcond, label %bb2, label %bb1
+
+bb2:		; preds = %bb1
+	ret i32 0
+}
+
+declare i32 @printf(i8*, ...) nounwind
