Remove forced alignment code.

The move from gcc 4.8 to gcc 4.9 for arm32 introduced a bug in this
code. The original code is trying to outsmart the compiler by arch, but
we got caught. Running benchmarks, the amount of time we save by doing this
is in the nanosecond range, so just let the compiler figure things out on
its own.

It turns out for aarch64, x86, x86_64, two of the functions produce exactly
the same code. For swapLongs, x86/x86_64 produces slightly different code
but is about the same performance.

For arm32, letting the compiler optimize also leads to about the same
performance.

Adding unit tests and benchmark code for these.

Bug: 19692084

Change-Id: I858eb3147ef1e9e2c1894ddb226cdddcc0baf933
diff --git a/NativeCode.mk b/NativeCode.mk
index 1449e30..910527c 100644
--- a/NativeCode.mk
+++ b/NativeCode.mk
@@ -106,6 +106,37 @@
 
 endif # LIBCORE_SKIP_TESTS
 
+# Set of gtest unit tests.
+include $(CLEAR_VARS)
+LOCAL_CFLAGS += $(core_cflags)
+LOCAL_CPPFLAGS += $(core_cppflags)
+LOCAL_SRC_FILES += \
+  luni/src/test/native/libcore_io_Memory_test.cpp \
+
+LOCAL_C_INCLUDES += libcore/include
+LOCAL_MODULE_TAGS := debug
+LOCAL_MODULE := libjavacore-unit-tests
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/NativeCode.mk
+LOCAL_CXX_STL := libc++
+include $(BUILD_NATIVE_TEST)
+
+# Set of benchmarks for libjavacore functions.
+include $(CLEAR_VARS)
+LOCAL_CFLAGS += $(core_cflags)
+LOCAL_CPPFLAGS += $(core_cppflags)
+LOCAL_SRC_FILES += \
+  luni/src/benchmark/native/libcore_io_Memory_bench.cpp \
+
+LOCAL_C_INCLUDES += libcore/include bionic/benchmarks
+LOCAL_MODULE_TAGS := debug
+LOCAL_MODULE := libjavacore-benchmarks
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/NativeCode.mk
+LOCAL_CXX_STL := libc++
+LOCAL_MULTILIB := both
+LOCAL_MODULE_STEM_32 := $(LOCAL_MODULE)32
+LOCAL_MODULE_STEM_64 := $(LOCAL_MODULE)64
+include $(BUILD_NATIVE_BENCHMARK)
+
 
 #
 # Build for the host.
diff --git a/luni/src/benchmark/native/libcore_io_Memory_bench.cpp b/luni/src/benchmark/native/libcore_io_Memory_bench.cpp
new file mode 100644
index 0000000..0819c27
--- /dev/null
+++ b/luni/src/benchmark/native/libcore_io_Memory_bench.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The functions we want to benchmark are static, so include the source code.
+#include "luni/src/main/native/libcore_io_Memory.cpp"
+
+#include <benchmark/Benchmark.h>
+
+template<typename T, size_t ALIGN>  // Benchmark helper: times iters calls of swap_func.
+void swap_bench(testing::Benchmark* bench, void (*swap_func)(T*, const T*, size_t),
+                int iters, size_t num_elements) {
+  T* src;
+  T* dst;
+  T* src_elems;
+  T* dst_elems;
+  // A non-zero ALIGN shifts both buffers by ALIGN bytes into an allocation one element larger.
+  if (ALIGN) {
+    src_elems = new T[num_elements + 1];
+    dst_elems = new T[num_elements + 1];
+
+    src = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(src_elems) + ALIGN);
+    dst = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(dst_elems) + ALIGN);
+  } else {
+    src_elems = new T[num_elements];
+    dst_elems = new T[num_elements];
+
+    src = src_elems;
+    dst = dst_elems;
+  }
+
+  memset(dst, 0, sizeof(T) * num_elements);
+  memset(src, 0x12, sizeof(T) * num_elements);
+
+  bench->StartBenchmarkTiming();
+  // swap_func takes (dst, src, count); only this loop sits between the timing calls.
+  for (int i = 0; i < iters; i++) {
+    swap_func(dst, src, num_elements);
+  }
+
+  bench->StopBenchmarkTiming();
+
+  delete[] src_elems;
+  delete[] dst_elems;
+}
+
+#define AT_COMMON_VALUES \
+    Arg(10)->Arg(100)->Arg(1000)->Arg(1024*10)->Arg(1024*100)
+
+BENCHMARK_WITH_ARG(BM_libcore_swapShorts_aligned, int)->AT_COMMON_VALUES;
+void BM_libcore_swapShorts_aligned::Run(int iters, int num_shorts) {
+  swap_bench<jshort, 0>(this, swapShorts, iters, num_shorts);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapInts_aligned, int)->AT_COMMON_VALUES;
+void BM_libcore_swapInts_aligned::Run(int iters, int num_ints) {
+  swap_bench<jint, 0>(this, swapInts, iters, num_ints);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapLongs_aligned, int)->AT_COMMON_VALUES;
+void BM_libcore_swapLongs_aligned::Run(int iters, int num_longs) {
+  swap_bench<jlong, 0>(this, swapLongs, iters, num_longs);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapShorts_unaligned1, int)->AT_COMMON_VALUES;
+void BM_libcore_swapShorts_unaligned1::Run(int iters, int num_shorts) {
+  swap_bench<jshort, 1>(this, swapShorts, iters, num_shorts);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapInts_unaligned1, int)->AT_COMMON_VALUES;
+void BM_libcore_swapInts_unaligned1::Run(int iters, int num_ints) {
+  swap_bench<jint, 1>(this, swapInts, iters, num_ints);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapLongs_unaligned1, int)->AT_COMMON_VALUES;
+void BM_libcore_swapLongs_unaligned1::Run(int iters, int num_longs) {
+  swap_bench<jlong, 1>(this, swapLongs, iters, num_longs);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapShorts_unaligned2, int)->AT_COMMON_VALUES;
+void BM_libcore_swapShorts_unaligned2::Run(int iters, int num_shorts) {
+  swap_bench<jshort, 2>(this, swapShorts, iters, num_shorts);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapInts_unaligned2, int)->AT_COMMON_VALUES;
+void BM_libcore_swapInts_unaligned2::Run(int iters, int num_ints) {
+  swap_bench<jint, 2>(this, swapInts, iters, num_ints);
+}
+
+BENCHMARK_WITH_ARG(BM_libcore_swapLongs_unaligned2, int)->AT_COMMON_VALUES;
+void BM_libcore_swapLongs_unaligned2::Run(int iters, int num_longs) {
+  swap_bench<jlong, 2>(this, swapLongs, iters, num_longs);
+}
diff --git a/luni/src/main/native/libcore_io_Memory.cpp b/luni/src/main/native/libcore_io_Memory.cpp
index 70bd9e4..5122a6c 100644
--- a/luni/src/main/native/libcore_io_Memory.cpp
+++ b/luni/src/main/native/libcore_io_Memory.cpp
@@ -27,25 +27,6 @@
 #include <string.h>
 #include <sys/mman.h>
 
-#if defined(__arm__)
-// 32-bit ARM has load/store alignment restrictions for longs.
-#define LONG_ALIGNMENT_MASK 0x3
-#define INT_ALIGNMENT_MASK 0x0
-#define SHORT_ALIGNMENT_MASK 0x0
-#elif defined(__mips__)
-// MIPS has load/store alignment restrictions for longs, ints and shorts.
-#define LONG_ALIGNMENT_MASK 0x7
-#define INT_ALIGNMENT_MASK 0x3
-#define SHORT_ALIGNMENT_MASK 0x1
-#elif defined(__aarch64__) || defined(__i386__) || defined(__x86_64__)
-// These architectures can load anything at any alignment.
-#define LONG_ALIGNMENT_MASK 0x0
-#define INT_ALIGNMENT_MASK 0x0
-#define SHORT_ALIGNMENT_MASK 0x0
-#else
-#error unknown load/store alignment restrictions for this architecture
-#endif
-
 // Use packed structures for access to unaligned data on targets with alignment restrictions.
 // The compiler will generate appropriate code to access these structures without
 // generating alignment exceptions.
@@ -81,63 +62,31 @@
     // Do 32-bit swaps as long as possible...
     jint* dst = reinterpret_cast<jint*>(dstShorts);
     const jint* src = reinterpret_cast<const jint*>(srcShorts);
-
-    if ((reinterpret_cast<uintptr_t>(dst) & INT_ALIGNMENT_MASK) == 0 &&
-        (reinterpret_cast<uintptr_t>(src) & INT_ALIGNMENT_MASK) == 0) {
-        for (size_t i = 0; i < count / 2; ++i) {
-            jint v = *src++;
-            *dst++ = bswap_2x16(v);
-        }
-        // ...with one last 16-bit swap if necessary.
-        if ((count % 2) != 0) {
-            jshort v = *reinterpret_cast<const jshort*>(src);
-            *reinterpret_cast<jshort*>(dst) = bswap_16(v);
-        }
-    } else {
-        for (size_t i = 0; i < count / 2; ++i) {
-            jint v = get_unaligned<jint>(src++);
-            put_unaligned<jint>(dst++, bswap_2x16(v));
-        }
-        if ((count % 2) != 0) {
-          jshort v = get_unaligned<jshort>(reinterpret_cast<const jshort*>(src));
-          put_unaligned<jshort>(reinterpret_cast<jshort*>(dst), bswap_16(v));
-        }
+    for (size_t i = 0; i < count / 2; ++i) {
+        jint v = get_unaligned<jint>(src++);
+        put_unaligned<jint>(dst++, bswap_2x16(v));
+    }
+    if ((count % 2) != 0) {
+      jshort v = get_unaligned<jshort>(reinterpret_cast<const jshort*>(src));
+      put_unaligned<jshort>(reinterpret_cast<jshort*>(dst), bswap_16(v));
     }
 }
 
 static inline void swapInts(jint* dstInts, const jint* srcInts, size_t count) {
-    if ((reinterpret_cast<uintptr_t>(dstInts) & INT_ALIGNMENT_MASK) == 0 &&
-        (reinterpret_cast<uintptr_t>(srcInts) & INT_ALIGNMENT_MASK) == 0) {
-        for (size_t i = 0; i < count; ++i) {
-            jint v = *srcInts++;
-            *dstInts++ = bswap_32(v);
-        }
-    } else {
-        for (size_t i = 0; i < count; ++i) {
-            jint v = get_unaligned<int>(srcInts++);
-            put_unaligned<jint>(dstInts++, bswap_32(v));
-        }
+    for (size_t i = 0; i < count; ++i) {
+        jint v = get_unaligned<jint>(srcInts++);
+        put_unaligned<jint>(dstInts++, bswap_32(v));
     }
 }
 
 static inline void swapLongs(jlong* dstLongs, const jlong* srcLongs, size_t count) {
     jint* dst = reinterpret_cast<jint*>(dstLongs);
     const jint* src = reinterpret_cast<const jint*>(srcLongs);
-    if ((reinterpret_cast<uintptr_t>(dstLongs) & INT_ALIGNMENT_MASK) == 0 &&
-        (reinterpret_cast<uintptr_t>(srcLongs) & INT_ALIGNMENT_MASK) == 0) {
-        for (size_t i = 0; i < count; ++i) {
-          jint v1 = *src++;
-          jint v2 = *src++;
-          *dst++ = bswap_32(v2);
-          *dst++ = bswap_32(v1);
-        }
-    } else {
-        for (size_t i = 0; i < count; ++i) {
-            jint v1 = get_unaligned<jint>(src++);
-            jint v2 = get_unaligned<jint>(src++);
-            put_unaligned<jint>(dst++, bswap_32(v2));
-            put_unaligned<jint>(dst++, bswap_32(v1));
-        }
+    for (size_t i = 0; i < count; ++i) {
+        jint v1 = get_unaligned<jint>(src++);
+        jint v2 = get_unaligned<jint>(src++);
+        put_unaligned<jint>(dst++, bswap_32(v2));
+        put_unaligned<jint>(dst++, bswap_32(v1));
     }
 }
 
@@ -259,39 +208,27 @@
 }
 
 static jshort Memory_peekShortNative(JNIEnv*, jclass, jlong srcAddress) {
-    return *cast<const jshort*>(srcAddress);
+    return get_unaligned<jshort>(cast<const jshort*>(srcAddress));
 }
 
 static void Memory_pokeShortNative(JNIEnv*, jclass, jlong dstAddress, jshort value) {
-    *cast<jshort*>(dstAddress) = value;
+    put_unaligned<jshort>(cast<jshort*>(dstAddress), value);
 }
 
 static jint Memory_peekIntNative(JNIEnv*, jclass, jlong srcAddress) {
-    return *cast<const jint*>(srcAddress);
+    return get_unaligned<jint>(cast<const jint*>(srcAddress));
 }
 
 static void Memory_pokeIntNative(JNIEnv*, jclass, jlong dstAddress, jint value) {
-    *cast<jint*>(dstAddress) = value;
+    put_unaligned<jint>(cast<jint*>(dstAddress), value);
 }
 
 static jlong Memory_peekLongNative(JNIEnv*, jclass, jlong srcAddress) {
-    jlong result;
-    const jlong* src = cast<const jlong*>(srcAddress);
-    if ((srcAddress & LONG_ALIGNMENT_MASK) == 0) {
-        result = *src;
-    } else {
-        result = get_unaligned<jlong>(src);
-    }
-    return result;
+    return get_unaligned<jlong>(cast<const jlong*>(srcAddress));
 }
 
 static void Memory_pokeLongNative(JNIEnv*, jclass, jlong dstAddress, jlong value) {
-    jlong* dst = cast<jlong*>(dstAddress);
-    if ((dstAddress & LONG_ALIGNMENT_MASK) == 0) {
-        *dst = value;
-    } else {
-        put_unaligned<jlong>(dst, value);
-    }
+    put_unaligned<jlong>(cast<jlong*>(dstAddress), value);
 }
 
 static void unsafeBulkCopy(jbyte* dst, const jbyte* src, jint byteCount,
diff --git a/luni/src/test/native/libcore_io_Memory_test.cpp b/luni/src/test/native/libcore_io_Memory_test.cpp
new file mode 100644
index 0000000..2d95155
--- /dev/null
+++ b/luni/src/test/native/libcore_io_Memory_test.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luni/src/main/native/libcore_io_Memory.cpp"
+
+#include <stdlib.h>
+
+#include <functional>
+
+#include <gtest/gtest.h>
+
+#define ALIGNMENT 8
+
+template<typename T, size_t NUM_ELEMENTS>
+void swap_align_test(void (*swap_func)(T*, const T*, size_t),
+                     std::function<void (T*, T*, uint64_t)> init_func) {
+  uint8_t* dst = nullptr;
+  uint8_t* src = nullptr;
+  ASSERT_EQ(0, posix_memalign(reinterpret_cast<void**>(&dst), ALIGNMENT,
+                              sizeof(T) * NUM_ELEMENTS + ALIGNMENT));
+  ASSERT_EQ(0, posix_memalign(reinterpret_cast<void**>(&src), ALIGNMENT,
+                              sizeof(T) * NUM_ELEMENTS + ALIGNMENT));
+
+  T src_buf[NUM_ELEMENTS];
+  T dst_buf[NUM_ELEMENTS];
+  for (uint64_t i = 0; i < NUM_ELEMENTS; i++) {
+    init_func(&src_buf[i], &dst_buf[i], i);
+  }
+
+  // Vary a few alignments.
+  for (size_t dst_align = 0; dst_align < ALIGNMENT; dst_align++) {
+    T* dst_aligned = reinterpret_cast<T*>(&dst[dst_align]);
+    for (size_t src_align = 0; src_align < ALIGNMENT; src_align++) {
+      T* src_aligned = reinterpret_cast<T*>(&src[src_align]);
+      memset(dst_aligned, 0, sizeof(T) * NUM_ELEMENTS);
+      memcpy(src_aligned, src_buf, sizeof(T) * NUM_ELEMENTS);
+      swap_func(dst_aligned, src_aligned, NUM_ELEMENTS);
+      ASSERT_EQ(0, memcmp(dst_buf, dst_aligned, sizeof(T) * NUM_ELEMENTS))
+          << "Failed at dst align " << dst_align << " src align " << src_align;
+    }
+  }
+  free(dst);
+  free(src);
+}
+
+TEST(libcore, swapShorts_align_test) {
+  // Use an odd number to guarantee that the last 16-bit swap code
+  // is executed.
+  swap_align_test<jshort, 9> (swapShorts, [] (jshort* src, jshort* dst, uint64_t i) {
+    *src = ((2*i) << 8) | (2*(i+1));
+    *dst = (2*i) | ((2*(i+1)) << 8);
+  });
+}
+
+TEST(libcore, swapInts_align_test) {
+  swap_align_test<jint, 10> (swapInts, [] (jint* src, jint* dst, uint64_t i) {
+    *src = ((4*i) << 24) | ((4*(i+1)) << 16) | ((4*(i+2)) << 8) | (4*(i+3));
+    *dst = (4*i) | ((4*(i+1)) << 8) | ((4*(i+2)) << 16) | ((4*(i+3)) << 24);
+  });
+}
+
+TEST(libcore, swapLongs_align_test) {
+  swap_align_test<jlong, 10> (swapLongs, [] (jlong* src, jlong* dst, uint64_t i) {
+    *src = ((8*i) << 56) | ((8*(i+1)) << 48) | ((8*(i+2)) << 40) | ((8*(i+3)) << 32) |
+        ((8*(i+4)) << 24) | ((8*(i+5)) << 16) | ((8*(i+6)) << 8) | (8*(i+7));
+    *dst = (8*i) | ((8*(i+1)) << 8) | ((8*(i+2)) << 16) | ((8*(i+3)) << 24) |
+        ((8*(i+4)) << 32) | ((8*(i+5)) << 40) | ((8*(i+6)) << 48) | ((8*(i+7)) << 56);
+  });
+}
+
+template<typename T>
+void memory_peek_test(T (*peek_func)(JNIEnv*, jclass, jlong), T value) {
+  T* src = nullptr;
+  ASSERT_EQ(0, posix_memalign(reinterpret_cast<void**>(&src), ALIGNMENT,
+                              sizeof(T) + ALIGNMENT));
+  for (size_t i = 0; i < ALIGNMENT; i++) {
+    jlong src_aligned = reinterpret_cast<jlong>(src) + i;
+    memcpy(reinterpret_cast<void*>(src_aligned), &value, sizeof(T));
+    T result = peek_func(nullptr, nullptr, src_aligned);
+    ASSERT_EQ(value, result);
+  }
+  free(src);
+}
+
+TEST(libcore, Memory_peekShortNative_align_check) {
+  memory_peek_test<jshort>(Memory_peekShortNative, 0x0102);
+}
+
+TEST(libcore, Memory_peekIntNative_align_check) {
+  memory_peek_test<jint>(Memory_peekIntNative, 0x01020304);
+}
+
+TEST(libcore, Memory_peekLongNative_align_check) {
+  memory_peek_test<jlong>(Memory_peekLongNative, 0x0102030405060708ULL);  // 8 distinct bytes.
+}
+
+template<typename T>
+void memory_poke_test(void (*poke_func)(JNIEnv*, jclass, jlong, T), T value) {
+  T* dst = nullptr;
+  ASSERT_EQ(0, posix_memalign(reinterpret_cast<void**>(&dst), ALIGNMENT,
+                              sizeof(T) + ALIGNMENT));
+  for(size_t i = 0; i < ALIGNMENT; i++) {
+    memset(dst, 0, sizeof(T) + ALIGNMENT);
+    jlong dst_aligned = reinterpret_cast<jlong>(dst) + i;
+    poke_func(nullptr, nullptr, dst_aligned, value);
+    ASSERT_EQ(0, memcmp(reinterpret_cast<void*>(dst_aligned), &value, sizeof(T)));
+  }
+  free(dst);
+}
+
+TEST(libcore, Memory_pokeShortNative_align_check) {
+  memory_poke_test<jshort>(Memory_pokeShortNative, 0x0102);
+}
+
+TEST(libcore, Memory_pokeIntNative_align_check) {
+  memory_poke_test<jint>(Memory_pokeIntNative, 0x01020304);
+}
+
+TEST(libcore, Memory_pokeLongNative_align_check) {
+  memory_poke_test<jlong>(Memory_pokeLongNative, 0x0102030405060708ULL);
+}