Remove forced alignment code.

The move from gcc 4.8 to gcc 4.9 for arm32 introduced a bug in this
code. The original code tried to outsmart the compiler with per-arch
tricks, but we got caught. Benchmarks show that the time saved by these
tricks is in the nanosecond range, so just let the compiler figure
things out on its own.
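
As a minimal sketch (assuming a function name like swapShorts; this is
not the exact libcore code), the straightforward form the compiler can
now optimize on its own looks like this:

    #include <stddef.h>
    #include <stdint.h>

    // Plain byte-swap loop with no alignment checks or per-arch
    // special cases; the compiler is free to vectorize or emit
    // unaligned loads where the target supports them.
    static void swapShorts(uint16_t* dst, const uint16_t* src, size_t count) {
      for (size_t i = 0; i < count; ++i) {
        dst[i] = __builtin_bswap16(src[i]);
      }
    }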

It turns out that for aarch64, x86, and x86_64, two of the functions
produce exactly the same code as before. For swapLongs, x86/x86_64
produce slightly different code, but with about the same performance.
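
For the 64-bit case, one plausible shape in the same style (again a
sketch, not necessarily what landed):

    // Illustrative 64-bit variant; the real swapLongs may be
    // structured differently, e.g. as two 32-bit swaps per value.
    static void swapLongs(uint64_t* dst, const uint64_t* src, size_t count) {
      for (size_t i = 0; i < count; ++i) {
        dst[i] = __builtin_bswap64(src[i]);
      }
    }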

For arm32, letting the compiler optimize also leads to about the same
performance.

Add unit tests and benchmark code for these functions.
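
A hypothetical example of the kind of round-trip check the gtest file
could contain (the actual tests live in
luni/src/test/native/libcore_io_Memory_test.cpp; the test name below is
made up):

    #include <gtest/gtest.h>
    #include <stdint.h>

    // Hypothetical round-trip test using the swapShorts sketch above:
    // swapping twice must reproduce the original values.
    TEST(libcore_io_Memory, swapShortsRoundTrip) {
      const uint16_t src[4] = {0x0102, 0x0304, 0x0506, 0x0708};
      uint16_t swapped[4];
      uint16_t back[4];
      swapShorts(swapped, src, 4);
      swapShorts(back, swapped, 4);
      for (int i = 0; i < 4; ++i) {
        EXPECT_EQ(src[i], back[i]);
      }
    }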

Bug: 19692084

Change-Id: I858eb3147ef1e9e2c1894ddb226cdddcc0baf933
diff --git a/NativeCode.mk b/NativeCode.mk
index 1449e30..910527c 100644
--- a/NativeCode.mk
+++ b/NativeCode.mk
@@ -106,6 +106,37 @@
 
 endif # LIBCORE_SKIP_TESTS
 
+# Set of gtest unit tests.
+include $(CLEAR_VARS)
+LOCAL_CFLAGS += $(core_cflags)
+LOCAL_CPPFLAGS += $(core_cppflags)
+LOCAL_SRC_FILES += \
+  luni/src/test/native/libcore_io_Memory_test.cpp \
+
+LOCAL_C_INCLUDES += libcore/include
+LOCAL_MODULE_TAGS := debug
+LOCAL_MODULE := libjavacore-unit-tests
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/NativeCode.mk
+LOCAL_CXX_STL := libc++
+include $(BUILD_NATIVE_TEST)
+
+# Set of benchmarks for libjavacore functions.
+include $(CLEAR_VARS)
+LOCAL_CFLAGS += $(core_cflags)
+LOCAL_CPPFLAGS += $(core_cppflags)
+LOCAL_SRC_FILES += \
+  luni/src/benchmark/native/libcore_io_Memory_bench.cpp \
+
+LOCAL_C_INCLUDES += libcore/include bionic/benchmarks
+LOCAL_MODULE_TAGS := debug
+LOCAL_MODULE := libjavacore-benchmarks
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/NativeCode.mk
+LOCAL_CXX_STL := libc++
+LOCAL_MULTILIB := both
+LOCAL_MODULE_STEM_32 := $(LOCAL_MODULE)32
+LOCAL_MODULE_STEM_64 := $(LOCAL_MODULE)64
+include $(BUILD_NATIVE_BENCHMARK)
+
 
 #
 # Build for the host.