front end:
 - implemented insns: mfvscr, mtvscr, vand, vor, vxor
 - fixed default VSCR: enable non-Java mode (see the sketch below)
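
 A quick sketch (not part of the patch; the macro names are illustrative) of the
 two architecturally defined VSCR bits, matching the 0x00010001 mask used by
 getReg_masked/putReg_masked in the diff below:

    /* The only two defined bits of the 32-bit VSCR value. */
    #define VSCR_NJ   0x00010000u  /* non-Java mode: vector FP denormals flushed to zero */
    #define VSCR_SAT  0x00000001u  /* sticky saturation bit */

    /* New guest default: non-Java mode on, SAT clear. */
    unsigned int default_vscr = VSCR_NJ;   /* == 0x00010000 */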

back end:
 - implemented enough to satisfy the front end: Iop_V128to32, Iop_32UtoV128, Iop_NotV128, Iop_AndV128, Iop_OrV128, Iop_XorV128
 - fixed the V128 <-> integer conversions to use quadword-aligned stack addressing for their vector loads/stores (see the sketch below)
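
 AltiVec lvx/stvx ignore the low four bits of the effective address, so spilling
 a vector through an unaligned stack slot silently accesses the wrong 16 bytes.
 A small sketch (illustrative only; the function name is made up) of the address
 arithmetic that get_sp_aligned16() in isel.c now performs inside the 32-byte
 scratch area reserved by sub_from_sp(env, 32):

    #include <stdint.h>

    static uint32_t align16_within_scratch ( uint32_t sp )
    {
       /* Align (sp + 16) down to a 16-byte boundary: the result is quadword
          aligned and, together with the 15 bytes that follow it, still lies
          inside the 32-byte scratch region starting at sp. */
       return (sp + 16) & 0xFFFFFFF0u;
    }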




git-svn-id: svn://svn.valgrind.org/vex/trunk@1386 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest-ppc32/gdefs.h b/priv/guest-ppc32/gdefs.h
index 22725e3..cf06b5a 100644
--- a/priv/guest-ppc32/gdefs.h
+++ b/priv/guest-ppc32/gdefs.h
@@ -122,7 +122,7 @@
 
 
 /*---------------------------------------------------------*/
-/*--- ppc32 guest helpers                                 ---*/
+/*--- ppc32 guest helpers                               ---*/
 /*---------------------------------------------------------*/
 
 /* --- CLEAN HELPERS --- */
diff --git a/priv/guest-ppc32/ghelpers.c b/priv/guest-ppc32/ghelpers.c
index cd29e35..7703bf4 100644
--- a/priv/guest-ppc32/ghelpers.c
+++ b/priv/guest-ppc32/ghelpers.c
@@ -318,7 +318,7 @@
 
    vex_state->guest_VRSAVE = 0;
 
-   vex_state->guest_VSCR = 0;
+   vex_state->guest_VSCR = 0x00010000;  // Non-Java mode
 
    vex_state->guest_EMWARN = EmWarn_NONE;
 
diff --git a/priv/guest-ppc32/toIR.c b/priv/guest-ppc32/toIR.c
index 8cc01e5..372df22 100644
--- a/priv/guest-ppc32/toIR.c
+++ b/priv/guest-ppc32/toIR.c
@@ -235,7 +235,7 @@
 //zz     PPC32_SPR_CR,     // Condition Register
     PPC32_SPR_FPSCR,  // Floating Point Status/Control Register
     PPC32_SPR_VRSAVE, // Vector Save/Restore Register
-//zz     PPC32_SPR_VSCR,   // Vector Status and Control Register
+    PPC32_SPR_VSCR,   // Vector Status and Control Register
     PPC32_SPR_MAX
 } PPC32SPR;
 
@@ -1188,12 +1188,12 @@
 //zz    case PPC32_SPR_VRSAVE:
 //zz       assign( val, IRExpr_Get(OFFB_VRSAVE, Ity_I32) );
 //zz       break;
-//zz 
-//zz    case PPC32_SPR_VSCR:
-//zz       // All other bits are 'Reserved'. Returning zero for these bits.
-//zz       mask = mask & 0x00010001;
-//zz       assign( val, IRExpr_Get(OFFB_VSCR, Ity_I32) );
-//zz       break;
+
+   case PPC32_SPR_VSCR:
+      // All other bits are zero.
+      mask = mask & 0x00010001;
+      assign( val, IRExpr_Get(OFFB_VSCR, Ity_I32) );
+      break;
 
    default:
       vpanic("getReg(ppc32)");
@@ -1300,17 +1300,17 @@
 //zz       vassert(mask == 0xFFFFFFFF);    // Only ever need whole reg
 //zz       stmt( IRStmt_Put( OFFB_VRSAVE, src ) );
 //zz       break;
-//zz 
-//zz    case PPC32_SPR_VSCR:
-//zz //CAB: There are only 2 valid bits in VSCR - maybe split into two vars...
-//zz 
-//zz       // All other bits are 'Reserved'. Ignoring writes to these bits.
-//zz       stmt( IRStmt_Put( OFFB_VSCR,
-//zz                binop(Iop_Or32,
-//zz                      binop(Iop_And32, src, mkU32(mask & 0x00010001)),
-//zz                      getReg_masked( PPC32_SPR_VSCR, (~mask & 0x00010001) ))));
-//zz       break;
-//zz    }
+
+   case PPC32_SPR_VSCR:
+      // CAB: There are only 2 valid bits in VSCR - maybe split into two vars...
+      // ... or perhaps only 1 bit... is non-java mode bit ever set to zero?
+
+      // All other bits are 'Reserved'. Ignoring writes to these bits.
+      stmt( IRStmt_Put( OFFB_VSCR,
+         binop(Iop_Or32,
+               binop(Iop_And32, src, mkU32(mask & 0x00010001)),
+               getReg_masked( PPC32_SPR_VSCR, (~mask & 0x00010001) ))));
+      break;
 
    default:
       vpanic("putReg(ppc32)");
@@ -1341,6 +1341,9 @@
       case PPC32_SPR_VRSAVE: 
          stmt( IRStmt_Put( OFFB_VRSAVE, src ) );
          break;
+      case PPC32_SPR_VSCR:
+         putReg_masked( reg, src, 0xFFFFFFFF );
+         break;
       default:
          vpanic("putSPR(ppc32)");
    }
@@ -1355,6 +1358,8 @@
          return IRExpr_Get( OFFB_CTR, Ity_I32 );
       case PPC32_SPR_VRSAVE: 
          return IRExpr_Get( OFFB_VRSAVE, Ity_I32 );
+      case PPC32_SPR_VSCR: 
+         return getReg_masked( reg, 0xFFFFFFFF );
       default:
          vpanic("getSPR(ppc32)");
    }
@@ -2894,7 +2899,7 @@
       case 0x010: // bclr (Branch Cond. to Link Register, PPC32 p365) 
 
          if ((BO & 0x14 /* 1z1zz */) == 0x14 && flag_LK == 0) {
-            DIP("blr");
+            DIP("blr\n");
          } else {
             DIP("bclr%s 0x%x, 0x%x\n", flag_LK ? "l" : "", BO, BI);
          }
@@ -3427,7 +3432,7 @@
       DIP("mcrxr crf%d\n", crfD);
 
       /* Compute XER[0-3] (the top 4 bits of XER) into the bottom
-	 4 bits of xer_0to3. */
+         4 bits of xer_0to3. */
       assign( 
          xer_0to3,
          unop(Iop_32to8,
@@ -4815,18 +4820,20 @@
          return False;
       }
       DIP("mfvscr v%d\n", vD_addr);
-      DIP(" => not implemented\n");
-      return False;
+      putVReg( vD_addr, unop(Iop_32UtoV128, getSPR( PPC32_SPR_VSCR )) ); 
+      break;
 
-   case 0x644: // mtvscr (Move to VSCR, AV p130)
+   case 0x644: { // mtvscr (Move to VSCR, AV p130)
       if (vD_addr != 0 || vA_addr != 0) {
          vex_printf("dis_av_procctl(PPC32)(opc2,dst)\n");
          return False;
       }
       DIP("mtvscr v%d\n", vB_addr);
-      DIP(" => not implemented\n");
-      return False;
-
+      IRTemp vB = newTemp(Ity_V128);
+      assign( vB, getVReg(vB_addr));
+      putSPR( PPC32_SPR_VSCR, unop(Iop_V128to32, mkexpr(vB)) ); 
+      break;
+   }
    default:
       vex_printf("dis_av_procctl(PPC32)(opc2)\n");
       return False;
@@ -4914,14 +4921,14 @@
    UInt  opc2     =         (theInstr >>  1) & 0x3FF; /* theInstr[1:10]  */
    UChar b0       = toUChar((theInstr >>  0) & 1);    /* theInstr[0]     */
 
-   IRTemp rA = newTemp(Ity_I32);
-   IRTemp rB = newTemp(Ity_I32);
+   //   IRTemp rA = newTemp(Ity_I32);
+   //   IRTemp rB = newTemp(Ity_I32);
    IRTemp vS = newTemp(Ity_V128);
    IRTemp EA = newTemp(Ity_I32);
    IRTemp EA_aligned = newTemp(Ity_I32);
 
-   assign( rA, getIReg(rA_addr));
-   assign( rB, getIReg(rB_addr));
+   //   assign( rA, getIReg(rA_addr));
+   //   assign( rB, getIReg(rB_addr));
    assign( vS, getVReg(vS_addr));
 
    assign( EA, ea_standard(rA_addr, rB_addr) );
@@ -5283,6 +5290,11 @@
    UChar vB_addr = toUChar((theInstr >> 11) & 0x1F);  /* theInstr[11:15] */
    UInt  opc2    =         (theInstr >>  0) & 0x7FF;  /* theInstr[0:10]  */
 
+   IRTemp vA = newTemp(Ity_V128);
+   IRTemp vB = newTemp(Ity_V128);
+   assign( vA, getVReg(vA_addr));
+   assign( vB, getVReg(vB_addr));
+
    if (opc1 != 0x4) {
       vex_printf("dis_av_logic(PPC32)(opc1 != 0x4)\n");
       return False;
@@ -5291,8 +5303,8 @@
    switch (opc2) {
    case 0x404: // vand (And, AV p147)
       DIP("vand v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
-      DIP(" => not implemented\n");
-      return False;
+      putVReg( vD_addr, binop(Iop_AndV128, mkexpr(vA), mkexpr(vB)) );
+      break;
 
    case 0x444: // vandc (And, AV p148)
       DIP("vandc v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
@@ -5301,13 +5313,13 @@
 
    case 0x484: // vor (Or, AV p217)
       DIP("vor v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
-      DIP(" => not implemented\n");
-      return False;
+      putVReg( vD_addr, binop(Iop_OrV128, mkexpr(vA), mkexpr(vB)) );
+      break;
 
    case 0x4C4: // vxor (Xor, AV p282)
       DIP("vxor v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
-      DIP(" => not implemented\n");
-      return False;
+      putVReg( vD_addr, binop(Iop_XorV128, mkexpr(vA), mkexpr(vB)) );
+      break;
 
    case 0x504: // vnor (Nor, AV p216)
       DIP("vnor v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
@@ -6095,7 +6107,7 @@
    opc1 = toUChar(ifieldOPC(theInstr));
    opc2 = ifieldOPClo10(theInstr);
 
-#if PPC32_TOIR_DEBUG
+#if 0 //PPC32_TOIR_DEBUG
    vex_printf("\ndisInstr(ppc32): instr:   0x%x\n", theInstr);
    vex_printf("disInstr(ppc32): instr:   ");
    vex_printf_binary( theInstr, 32, True );
@@ -6565,6 +6577,7 @@
    default:
    decode_failure:
    /* All decode failures end up here. */
+   opc2 = (theInstr) & 0x7FF;
    vex_printf("disInstr(ppc32): unhandled instruction: "
               "0x%x\n", theInstr);
    vex_printf("                 primary %d(0x%x), secondary %u(0x%x)\n", 
diff --git a/priv/host-ppc32/hdefs.c b/priv/host-ppc32/hdefs.c
index c1259ee..0cdcb89 100644
--- a/priv/host-ppc32/hdefs.c
+++ b/priv/host-ppc32/hdefs.c
@@ -1261,10 +1261,11 @@
          ppLoadImm(hregPPC32_GPR30(), i->Pin.AvLdSt.addr->Pam.RR.index);
          vex_printf(" ; ");
       }
+      char* str_size = sz==1 ? "eb" : sz==2 ? "eh" : sz==4 ? "ew" : "";
       if (i->Pin.AvLdSt.isLoad)
-         vex_printf("lv%sx ", sz==8 ? "eb" : sz==16 ? "eh" : sz==32 ? "ew" : "");
+         vex_printf("lv%sx ", str_size);
       else
-         vex_printf("stv%sx ", sz==8 ? "eb" : sz==16 ? "eh" : sz==32 ? "ew" : "");
+         vex_printf("stv%sx ", str_size);
       ppHRegPPC32(i->Pin.AvLdSt.reg);
       vex_printf(",");
       if (i->Pin.AvLdSt.addr->tag == Pam_IR)
@@ -2755,7 +2756,7 @@
       UInt opc2, v_reg, r_idx, r_base;
       UChar sz   = i->Pin.AvLdSt.sz;
       Bool  idxd = toBool(i->Pin.AvLdSt.addr->tag == Pam_RR);
-      vassert(sz == 8 || sz == 16 || sz == 32 || sz == 128);
+      vassert(sz == 1 || sz == 2 || sz == 4 || sz == 16);
 
       v_reg  = vregNo(i->Pin.AvLdSt.reg);
       r_base = iregNo(i->Pin.AvLdSt.addr->Pam.RR.base);
@@ -2768,11 +2769,11 @@
          r_idx  = iregNo(i->Pin.AvLdSt.addr->Pam.RR.index);
       }
 
-      if (i->Pin.FpLdSt.isLoad) {  // Load from memory (8,16,32,128)
-         opc2 = (sz == 8) ? 7 : (sz == 16) ? 39 : (sz == 32) ? 71 : 103;
+      if (i->Pin.FpLdSt.isLoad) {  // Load from memory (1,2,4,16)
+         opc2 = (sz == 1) ? 7 : (sz == 2) ? 39 : (sz == 4) ? 71 : 103;
          p = mkFormX(p, 31, v_reg, r_idx, r_base, opc2, 0);
-      } else {                      // Store to memory (8,16,32,128)
-         opc2 = (sz == 8) ? 135 : (sz == 16) ? 167 : (sz == 32) ? 199 : 231;
+      } else {                      // Store to memory (1,2,4,16)
+         opc2 = (sz == 1) ? 135 : (sz == 2) ? 167 : (sz == 4) ? 199 : 231;
          p = mkFormX(p, 31, v_reg, r_idx, r_base, opc2, 0);
       }
       goto done;
@@ -2813,9 +2814,9 @@
       UInt opc2;
       switch (i->Pin.AvBinary.op) {
       /* Bitwise */
-      case Pav_AND:       opc2 = 1026; break; // vand
+      case Pav_AND:       opc2 = 1028; break; // vand
       case Pav_OR:        opc2 = 1156; break; // vor
-      case Pav_XOR:       opc2 = 1120; break; // vxor
+      case Pav_XOR:       opc2 = 1220; break; // vxor
 
       /* Shift */
       case Pav_SHL:       opc2 =  452; break; // vsl
diff --git a/priv/host-ppc32/hdefs.h b/priv/host-ppc32/hdefs.h
index d8f1148..e82b045 100644
--- a/priv/host-ppc32/hdefs.h
+++ b/priv/host-ppc32/hdefs.h
@@ -54,7 +54,7 @@
 /* --------- Registers. --------- */
 
 /* The usual HReg abstraction.  There are 32 real int regs,
-   32 real float regs, and 0 real vector regs. 
+   32 real float regs, and 32 real vector regs. 
 */
 
 extern void ppHRegPPC32 ( HReg );
diff --git a/priv/host-ppc32/isel.c b/priv/host-ppc32/isel.c
index 42330b9..7968fbb 100644
--- a/priv/host-ppc32/isel.c
+++ b/priv/host-ppc32/isel.c
@@ -344,6 +344,27 @@
                     Palu_SUB, sp, sp, PPC32RH_Imm(True,toUShort(n))));
 }
 
+/*
+  returns a quadword aligned address on the stack
+   - copies SP, adds 16bytes, aligns to quadword.
+  use sub_from_sp(32) before calling this,
+  as expects to have 32 bytes to play with.
+*/
+static HReg get_sp_aligned16 ( ISelEnv* env )
+{
+   HReg       r = newVRegI(env);
+   HReg align16 = newVRegI(env);
+   addInstr(env, mk_iMOVds_RR(r, StackFramePtr));
+   // add 16
+   addInstr(env, PPC32Instr_Alu32(
+                    Palu_ADD, r, r, PPC32RH_Imm(True,toUShort(16))));
+   // mask to quadword
+   addInstr(env, PPC32Instr_LI32(align16, (UInt)0xFFFFFFF0));
+   addInstr(env, PPC32Instr_Alu32(Palu_AND, r,r, PPC32RH_Reg(align16)));
+   return r;
+}
+
+
 
 /* Load 2*I32 regs to fp reg */
 static HReg mk_LoadRRtoFPR ( ISelEnv* env, HReg r_srcHi, HReg r_srcLo )
@@ -1188,16 +1209,24 @@
          return r_dst;
       }
 
-//..          case Iop_V128to32: {
-//..             HReg      dst  = newVRegI(env);
-//..             HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
-//..             X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
-//..             sub_from_esp(env, 16);
-//..             addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
-//..             addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
-//..             add_to_esp(env, 16);
-//..             return dst;
-//..          }
+      case Iop_V128to32: {
+         HReg        dst  = newVRegI(env);
+         HReg        vec  = iselVecExpr(env, e->Iex.Unop.arg);
+         PPC32AMode *am_off0, *am_off12;
+         sub_from_sp( env, 32 );     // Move SP down 32 bytes
+
+         // get a quadword aligned address within our stack space
+         HReg r_aligned16 = get_sp_aligned16( env );
+         am_off0  = PPC32AMode_IR( 0, r_aligned16 );
+         am_off12 = PPC32AMode_IR( 12,r_aligned16 );
+
+         // store vec, load low word to dst
+         addInstr(env, PPC32Instr_AvLdSt( False/*store*/, 16, vec, am_off0 ));
+         addInstr(env, PPC32Instr_Load( 4, False, dst, am_off12 ));
+
+         add_to_sp( env, 32 );       // Reset SP
+         return dst;
+      }
 
       case Iop_16to8:
       case Iop_32to8:
@@ -2316,19 +2345,21 @@
             HReg tLo = newVRegI(env);
             HReg tHi = newVRegI(env);
             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
-            PPC32AMode *sp0, *spLO, *spHI;
-
+            PPC32AMode *am_off0, *am_offLO, *am_offHI;
             sub_from_sp( env, 32 );     // Move SP down 32 bytes
-            sp0  = PPC32AMode_IR(0,     StackFramePtr);
-            spHI = PPC32AMode_IR(off,   StackFramePtr);
-            spLO = PPC32AMode_IR(off+4, StackFramePtr);
+
+            // get a quadword aligned address within our stack space
+            HReg r_aligned16 = get_sp_aligned16( env );
+            am_off0  = PPC32AMode_IR( 0,     r_aligned16 );
+            am_offHI = PPC32AMode_IR( off,   r_aligned16 );
+            am_offLO = PPC32AMode_IR( off+4, r_aligned16 );
 
             // store as Vec128
-            addInstr(env, PPC32Instr_AvLdSt( False/*store*/, 16, vec, sp0 ));
+            addInstr(env, PPC32Instr_AvLdSt( False/*store*/, 16, vec, am_off0 ));
 
             // load hi,lo words (of hi/lo half of vec) as Ity_I32's
-            addInstr(env, PPC32Instr_Load( 4, False, tHi, spHI ));
-            addInstr(env, PPC32Instr_Load( 4, False, tLo, spLO ));
+            addInstr(env, PPC32Instr_Load( 4, False, tHi, am_offHI ));
+            addInstr(env, PPC32Instr_Load( 4, False, tLo, am_offLO ));
 
             add_to_sp( env, 32 );       // Reset SP
             *rHi = tHi;
@@ -2781,7 +2812,7 @@
 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
 {
 //..    Bool     arg1isEReg = False;
-  // unused:   PPC32AvOp op = Pav_INVALID;
+   PPC32AvOp op = Pav_INVALID;
    IRType   ty = typeOfIRExpr(env->type_env,e);
    vassert(e);
    vassert(ty == Ity_V128);
@@ -2817,11 +2848,13 @@
    if (e->tag == Iex_Unop) {
       switch (e->Iex.Unop.op) {
 
-//..       case Iop_Not128: {
-//..          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
-//..          return do_sse_Not128(env, arg);
-//..       }
-//.. 
+      case Iop_NotV128: {
+         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+         HReg dst = newVRegV(env);
+         addInstr(env, PPC32Instr_AvUnary(Pav_NOT, dst, arg));
+         return dst;
+      }
+
 //..       case Iop_CmpNEZ64x2: {
 //..          /* We can use SSE2 instructions for this. */
 //..          /* Ideally, we want to do a 64Ix2 comparison against zero of
@@ -2963,17 +2996,37 @@
 //..          addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
 //..          return dst;
 //..       }
-//.. 
-//..       case Iop_32UtoV128: {
-//..          HReg      dst  = newVRegV(env);
-//..          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
-//..          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
-//..          addInstr(env, X86Instr_Push(rmi));
-//..          addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
-//..          add_to_esp(env, 4);
-//..          return dst;
-//..       }
-//.. 
+
+      case Iop_32UtoV128: {
+         HReg r_src = iselIntExpr_R(env, e->Iex.Unop.arg);
+         HReg   dst = newVRegV(env);
+         PPC32AMode *am_off0, *am_off4, *am_off8, *am_off12;
+         sub_from_sp( env, 32 );     // Move SP down
+
+         /* Get a quadword aligned address within our stack space */
+         HReg r_aligned16 = get_sp_aligned16( env );
+         am_off0  = PPC32AMode_IR( 0,  r_aligned16);
+         am_off4  = PPC32AMode_IR( 4,  r_aligned16);
+         am_off8  = PPC32AMode_IR( 8,  r_aligned16);
+         am_off12 = PPC32AMode_IR( 12, r_aligned16);
+
+         /* Store zero's */
+         HReg r_zeros = newVRegI(env);
+         addInstr(env, PPC32Instr_LI32(r_zeros, 0x0));
+         addInstr(env, PPC32Instr_Store( 4, am_off0, r_zeros ));
+         addInstr(env, PPC32Instr_Store( 4, am_off4, r_zeros ));
+         addInstr(env, PPC32Instr_Store( 4, am_off8, r_zeros ));
+
+         /* Store r_src in low word of quadword-aligned mem */
+         addInstr(env, PPC32Instr_Store( 4, am_off12, r_src ));
+
+         /* Load word into low word of quadword vector reg */
+         addInstr(env, PPC32Instr_AvLdSt( True/*load*/, 4, dst, am_off12 ));
+
+         add_to_sp( env, 32 );       // Reset SP
+         return dst;
+      }
+
 //..       case Iop_64UtoV128: {
 //..          HReg      rHi, rLo;
 //..          HReg      dst  = newVRegV(env);
@@ -3025,24 +3078,31 @@
 //.. 
       case Iop_64HLtoV128: {
          HReg r3, r2, r1, r0;
-         PPC32AMode *sp0  = PPC32AMode_IR(0,  StackFramePtr);
-         PPC32AMode *sp4  = PPC32AMode_IR(4,  StackFramePtr);
-         PPC32AMode *sp8  = PPC32AMode_IR(8,  StackFramePtr);
-         PPC32AMode *sp12 = PPC32AMode_IR(12, StackFramePtr);
+         PPC32AMode *am_off0, *am_off4, *am_off8, *am_off12;
          HReg        dst = newVRegV(env);
          /* do this via the stack (easy, convenient, etc) */
-         sub_from_sp( env, 16 );        // Move SP down 16 bytes
+         sub_from_sp( env, 32 );        // Move SP down
+
+         // get a quadword aligned address within our stack space
+         HReg r_aligned16 = get_sp_aligned16( env );
+         am_off0  = PPC32AMode_IR( 0,  r_aligned16);
+         am_off4  = PPC32AMode_IR( 4,  r_aligned16);
+         am_off8  = PPC32AMode_IR( 8,  r_aligned16);
+         am_off12 = PPC32AMode_IR( 12, r_aligned16);
+
          /* Do the less significant 64 bits */
          iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
-         addInstr(env, PPC32Instr_Store( 4, sp12, r0 ));
-         addInstr(env, PPC32Instr_Store( 4, sp8,  r1 ));
+         addInstr(env, PPC32Instr_Store( 4, am_off12, r0 ));
+         addInstr(env, PPC32Instr_Store( 4, am_off8,  r1 ));
          /* Do the more significant 64 bits */
          iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
-         addInstr(env, PPC32Instr_Store( 4, sp4, r2 ));
-         addInstr(env, PPC32Instr_Store( 4, sp0, r3 ));
+         addInstr(env, PPC32Instr_Store( 4, am_off4, r2 ));
+         addInstr(env, PPC32Instr_Store( 4, am_off0, r3 ));
+
          /* Fetch result back from stack. */
-         addInstr(env, PPC32Instr_AvLdSt(True/*load*/, 16, dst, sp0));
-         add_to_sp( env, 16 );          // Reset SP
+         addInstr(env, PPC32Instr_AvLdSt(True/*load*/, 16, dst, am_off0));
+
+         add_to_sp( env, 32 );          // Reset SP
          return dst;
       }
 
@@ -3146,10 +3206,10 @@
 //..          op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
 //..       case Iop_InterleaveLO64x2: 
 //..          op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
-//.. 
-//..       case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
-//..       case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
-//..       case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
+
+      case Iop_AndV128:    op = Pav_AND;      goto do_AvBin;
+      case Iop_OrV128:     op = Pav_OR;       goto do_AvBin;
+      case Iop_XorV128:    op = Pav_XOR;      goto do_AvBin;
 //..       case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
 //..       case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
 //..       case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
@@ -3181,6 +3241,13 @@
 //..       case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
 //..       case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
 //..       case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
+      do_AvBin: {
+         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+         HReg dst = newVRegV(env);
+         addInstr(env, PPC32Instr_AvBinary(op, dst, arg1, arg2));
+         return dst;
+      }
 //..       do_SseReRg: {
 //..          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
 //..          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
diff --git a/pub/libvex_guest_ppc32.h b/pub/libvex_guest_ppc32.h
index a8200c6..1793774 100644
--- a/pub/libvex_guest_ppc32.h
+++ b/pub/libvex_guest_ppc32.h
@@ -208,7 +208,7 @@
       /* 952 */ UInt guest_RESVN;
 
       /* Padding to make it have an 8-aligned size */
-      /* UInt  padding; */
+      UInt  padding;
    }
    VexGuestPPC32State;