Add support for not-long on ARM64 in the optimizing compiler.
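
Merge the int and long cases of HNot code generation on ARM64: the
Mvn macro-assembler instruction handles both widths, since the
register and operand helpers pick W or X registers from the
instruction's type. Illustratively (the register assignment is up to
the allocator), a not-long now emits a single 64-bit MVN such as:

    mvn x2, x0    // x2 = ~x0

Also:
- Templatize the codegen test harness over the expected return type,
  add a TestCodeLong helper, and add not-long tests covering the
  INT32 and INT64 boundary values.
- Factor the *_REGISTERS_CODE_ITEM macros of optimizing_unit_test.h
  into a single N_REGISTERS_CODE_ITEM and share the new
  SIX_REGISTERS_CODE_ITEM with constant_folding_test.cc.
- Make X86_64Assembler::EmitInt64 write two 32-bit words, since the
  assembler buffer gives no 64-bit alignment guarantee and ARM hosts
  do not allow unaligned 64-bit accesses.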

Change-Id: I3e98ff411ba358d92774def18a12daccdc4f558f
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index c26b0ab..31051aa 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1067,11 +1067,10 @@
       break;
 
     case Primitive::kPrimInt:
-      __ Mvn(OutputRegister(instruction), InputOperandAt(instruction, 0));
-      break;
-
     case Primitive::kPrimLong:
-      LOG(FATAL) << "Not yet implemented type for not operation " << instruction->GetResultType();
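+      // Mvn covers both int and long: OutputRegister and InputOperandAt
+      // yield W or X registers/operands based on the instruction's type.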
+      __ Mvn(OutputRegister(instruction), InputOperandAt(instruction, 0));
       break;
 
     default:
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 47e9fa4..a7b64c9 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -31,6 +31,7 @@
 #include "prepare_for_register_allocation.h"
 #include "register_allocator.h"
 #include "ssa_liveness_analysis.h"
+#include "utils.h"
 
 #include "gtest/gtest.h"
 
@@ -56,24 +57,28 @@
   DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator);
 };
 
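+// Templated over the expected return type, so the same harness can check
+// both 32-bit (int) and 64-bit (long) results.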
+template <typename Expected>
 static void Run(const InternalCodeAllocator& allocator,
                 const CodeGenerator& codegen,
                 bool has_result,
-                int32_t expected) {
-  typedef int32_t (*fptr)();
+                Expected expected) {
+  typedef Expected (*fptr)();
   CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
   fptr f = reinterpret_cast<fptr>(allocator.GetMemory());
   if (codegen.GetInstructionSet() == kThumb2) {
     // For thumb we need the bottom bit set.
     f = reinterpret_cast<fptr>(reinterpret_cast<uintptr_t>(f) + 1);
   }
-  int32_t result = f();
+  Expected result = f();
   if (has_result) {
     ASSERT_EQ(result, expected);
   }
 }
 
-static void RunCodeBaseline(HGraph* graph, bool has_result, int32_t expected) {
+template <typename Expected>
+static void RunCodeBaseline(HGraph* graph, bool has_result, Expected expected) {
   InternalCodeAllocator allocator;
 
   x86::CodeGeneratorX86 codegenX86(graph);
@@ -103,11 +108,12 @@
   }
 }
 
+template <typename Expected>
 static void RunCodeOptimized(CodeGenerator* codegen,
                              HGraph* graph,
                              std::function<void(HGraph*)> hook_before_codegen,
                              bool has_result,
-                             int32_t expected) {
+                             Expected expected) {
   SsaLivenessAnalysis liveness(*graph, codegen);
   liveness.Analyze();
 
@@ -120,10 +126,11 @@
   Run(allocator, *codegen, has_result, expected);
 }
 
+template <typename Expected>
 static void RunCodeOptimized(HGraph* graph,
                              std::function<void(HGraph*)> hook_before_codegen,
                              bool has_result,
-                             int32_t expected) {
+                             Expected expected) {
   if (kRuntimeISA == kX86) {
     x86::CodeGeneratorX86 codegenX86(graph);
     RunCodeOptimized(&codegenX86, graph, hook_before_codegen, has_result, expected);
@@ -148,6 +155,20 @@
   RunCodeBaseline(graph, has_result, expected);
 }
 
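+// As TestCode above, but for a method returning a long: the graph is
+// built with a kPrimLong return type and a 64-bit result is checked.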
+static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) {
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
+  HGraphBuilder builder(&arena, Primitive::kPrimLong);
+  const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
+  HGraph* graph = builder.BuildGraph(*item);
+  ASSERT_NE(graph, nullptr);
+  // Remove suspend checks, as they cannot be executed in this context.
+  RemoveSuspendChecks(graph);
+  RunCodeBaseline(graph, has_result, expected);
+}
+
 TEST(CodegenTest, ReturnVoid) {
   const uint16_t data[] = ZERO_REGISTER_CODE_ITEM(Instruction::RETURN_VOID);
   TestCode(data);
@@ -272,8 +293,8 @@
 #define NOT_INT_TEST(TEST_NAME, INPUT, EXPECTED_OUTPUT) \
 TEST(CodegenTest, TEST_NAME) {                          \
   const int32_t input = INPUT;                          \
-  const uint16_t input_lo = input & 0x0000FFFF;         \
-  const uint16_t input_hi = input >> 16;                \
+  const uint16_t input_lo = Low16Bits(input);           \
+  const uint16_t input_hi = High16Bits(input);          \
   const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(      \
       Instruction::CONST | 0 << 8, input_lo, input_hi,  \
       Instruction::NOT_INT | 1 << 8 | 0 << 12 ,         \
@@ -286,13 +307,67 @@
 NOT_INT_TEST(ReturnNotIntMinus1, -1, 0)
 NOT_INT_TEST(ReturnNotInt0, 0, -1)
 NOT_INT_TEST(ReturnNotInt1, 1, -2)
-NOT_INT_TEST(ReturnNotIntINT_MIN, -2147483648, 2147483647)  // (2^31) - 1
-NOT_INT_TEST(ReturnNotIntINT_MINPlus1, -2147483647, 2147483646)  // (2^31) - 2
-NOT_INT_TEST(ReturnNotIntINT_MAXMinus1, 2147483646, -2147483647)  // -(2^31) - 1
-NOT_INT_TEST(ReturnNotIntINT_MAX, 2147483647, -2147483648)  // -(2^31)
+NOT_INT_TEST(ReturnNotIntINT32_MIN, -2147483648, 2147483647)  // (2^31) - 1
+NOT_INT_TEST(ReturnNotIntINT32_MINPlus1, -2147483647, 2147483646)  // (2^31) - 2
+NOT_INT_TEST(ReturnNotIntINT32_MAXMinus1, 2147483646, -2147483647)  // -(2^31) + 1
+NOT_INT_TEST(ReturnNotIntINT32_MAX, 2147483647, -2147483648)  // -(2^31)
 
 #undef NOT_INT_TEST
 
+// Exercise the bit-wise (one's complement) not-long instruction.
+#define NOT_LONG_TEST(TEST_NAME, INPUT, EXPECTED_OUTPUT)                 \
+TEST(CodegenTest, TEST_NAME) {                                           \
+  const int64_t input = INPUT;                                           \
+  const uint16_t word0 = Low16Bits(Low32Bits(input));   /* LSW. */       \
+  const uint16_t word1 = High16Bits(Low32Bits(input));                   \
+  const uint16_t word2 = Low16Bits(High32Bits(input));                   \
+  const uint16_t word3 = High16Bits(High32Bits(input)); /* MSW. */       \
+  const uint16_t data[] = FOUR_REGISTERS_CODE_ITEM(                      \
+      Instruction::CONST_WIDE | 0 << 8, word0, word1, word2, word3,      \
+      Instruction::NOT_LONG | 2 << 8 | 0 << 12,                          \
+      Instruction::RETURN_WIDE | 2 << 8);                                \
+                                                                         \
+  TestCodeLong(data, true, EXPECTED_OUTPUT);                             \
+}
+
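+// The expected values below follow from the two's complement identity
+// ~x == -x - 1, e.g. ~INT64_C(2147483646) == INT64_C(-2147483647).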
+NOT_LONG_TEST(ReturnNotLongMinus2, INT64_C(-2), INT64_C(1))
+NOT_LONG_TEST(ReturnNotLongMinus1, INT64_C(-1), INT64_C(0))
+NOT_LONG_TEST(ReturnNotLong0, INT64_C(0), INT64_C(-1))
+NOT_LONG_TEST(ReturnNotLong1, INT64_C(1), INT64_C(-2))
+
+NOT_LONG_TEST(ReturnNotLongINT32_MIN,
+              INT64_C(-2147483648),
+              INT64_C(2147483647))  // (2^31) - 1
+NOT_LONG_TEST(ReturnNotLongINT32_MINPlus1,
+              INT64_C(-2147483647),
+              INT64_C(2147483646))  // (2^31) - 2
+NOT_LONG_TEST(ReturnNotLongINT32_MAXMinus1,
+              INT64_C(2147483646),
+              INT64_C(-2147483647))  // -(2^31) + 1
+NOT_LONG_TEST(ReturnNotLongINT32_MAX,
+              INT64_C(2147483647),
+              INT64_C(-2147483648))  // -(2^31)
+
+// The C++ compiler rejects INT64_C(-9223372036854775808) (that is,
+// INT64_MIN): the literal 9223372036854775808 overflows int64_t before
+// unary minus is applied.  We use INT64_C(-9223372036854775807)-1 instead.
+NOT_LONG_TEST(ReturnNotLongINT64_MIN,
+              INT64_C(-9223372036854775807)-1,
+              INT64_C(9223372036854775807))  // (2^63) - 1
+NOT_LONG_TEST(ReturnNotLongINT64_MINPlus1,
+              INT64_C(-9223372036854775807),
+              INT64_C(9223372036854775806))  // (2^63) - 2
+NOT_LONG_TEST(ReturnNotLongINT64_MAXMinus1,
+              INT64_C(9223372036854775806),
+              INT64_C(-9223372036854775807))  // -(2^63) + 1
+NOT_LONG_TEST(ReturnNotLongINT64_MAX,
+              INT64_C(9223372036854775807),
+              INT64_C(-9223372036854775807)-1)  // -(2^63)
+
+#undef NOT_LONG_TEST
+
 TEST(CodegenTest, ReturnAdd1) {
   const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
     Instruction::CONST_4 | 3 << 12 | 0,
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index 09bf2c8..856c516 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -332,9 +332,6 @@
            check_after_cf);
 }
 
-#define SIX_REGISTERS_CODE_ITEM(...)                                     \
-    { 6, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
-
 /**
  * Tiny three-register-pair program exercising long constant folding
  * on addition.
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index aae7f9b..c4106b7 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -30,17 +30,20 @@
 #define NUM_INSTRUCTIONS(...)  \
   (sizeof((uint16_t[]) {__VA_ARGS__}) /sizeof(uint16_t))
 
-#define ZERO_REGISTER_CODE_ITEM(...)                                       \
-    { 0, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
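+// Builds a dex code_item as 16-bit half-words: registers_size, ins_size,
+// outs_size, tries_size, then debug_info_off and insns_size as pairs of
+// halves (assuming a little-endian host), followed by the instructions.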
+#define N_REGISTERS_CODE_ITEM(NUM_REGS, ...)                            \
+    { NUM_REGS, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
 
-#define ONE_REGISTER_CODE_ITEM(...)                                        \
-    { 1, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
+#define ZERO_REGISTER_CODE_ITEM(...)   N_REGISTERS_CODE_ITEM(0, __VA_ARGS__)
+#define ONE_REGISTER_CODE_ITEM(...)    N_REGISTERS_CODE_ITEM(1, __VA_ARGS__)
+#define TWO_REGISTERS_CODE_ITEM(...)   N_REGISTERS_CODE_ITEM(2, __VA_ARGS__)
+#define THREE_REGISTERS_CODE_ITEM(...) N_REGISTERS_CODE_ITEM(3, __VA_ARGS__)
+#define FOUR_REGISTERS_CODE_ITEM(...)  N_REGISTERS_CODE_ITEM(4, __VA_ARGS__)
+#define FIVE_REGISTERS_CODE_ITEM(...)  N_REGISTERS_CODE_ITEM(5, __VA_ARGS__)
+#define SIX_REGISTERS_CODE_ITEM(...)   N_REGISTERS_CODE_ITEM(6, __VA_ARGS__)
 
-#define TWO_REGISTERS_CODE_ITEM(...)                                       \
-    { 2, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
-
-#define THREE_REGISTERS_CODE_ITEM(...)                                     \
-    { 3, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
 
 LiveInterval* BuildInterval(const size_t ranges[][2],
                             size_t number_of_ranges,
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index ed80e44..42d774a 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -687,7 +687,11 @@
 }
 
 inline void X86_64Assembler::EmitInt64(int64_t value) {
-  buffer_.Emit<int64_t>(value);
+  // Write this 64-bit value as two 32-bit words for alignment reasons:
+  // ARM hosts do not allow unaligned 64-bit accesses.  We assume
+  // little-endianness here.
+  EmitInt32(Low32Bits(value));
+  EmitInt32(High32Bits(value));
 }
 
 inline void X86_64Assembler::EmitRegisterOperand(uint8_t rm, uint8_t reg) {