MIPS32: improvements in code generation (mostly 64-bit ALU ops)

Specifically:
- Use the delay slot in InvokeRuntime() for direct entry points
- Use kNoOutputOverlap wherever possible
- Improve and/or/xor/add/sub with 64-bit integer constants
- Improve 64-bit shifts by a constant amount on R2+
- More efficient load/store of 64-bit constants (especially, 0 & +0.0)

Change-Id: I86d2217c8b5b8e2a9371effc2ce38b9eec62782b
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index cd64a4f..ee7b21c 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -150,7 +150,9 @@
   { kSpecial2Mask | 0x3f, (28 << kOpcodeShift) | 0x3f, "sdbbp", "" },  // TODO: code
 
   // SPECIAL3
+  { kSpecial3Mask | 0x3f, (31 << kOpcodeShift), "ext", "TSAZ", },
   { kSpecial3Mask | 0x3f, (31 << kOpcodeShift) | 3, "dext", "TSAZ", },
+  { kSpecial3Mask | 0x3f, (31 << kOpcodeShift) | 4, "ins", "TSAz", },
   { kSpecial3Mask | (0x1f << 21) | (0x1f << 6) | 0x3f,
     (31 << kOpcodeShift) | (16 << 6) | 32,
     "seb",
@@ -421,7 +423,7 @@
       opcode = gMipsInstructions[i].name;
       for (const char* args_fmt = gMipsInstructions[i].args_fmt; *args_fmt; ++args_fmt) {
         switch (*args_fmt) {
-          case 'A':  // sa (shift amount or [d]ext position).
+          case 'A':  // sa (shift amount or [d]ins/[d]ext position).
             args << sa;
             break;
           case 'B':  // Branch offset.
@@ -519,7 +521,8 @@
           case 's': args << 'f' << rs; break;
           case 'T': args << 'r' << rt; break;
           case 't': args << 'f' << rt; break;
-          case 'Z': args << rd; break;   // sz ([d]ext size).
+          case 'Z': args << (rd + 1); break;  // sz ([d]ext size).
+          case 'z': args << (rd - sa + 1); break;  // sz ([d]ins size).
         }
         if (*(args_fmt + 1)) {
           args << ", ";