Cleaned up debug variants of libclcore.bc

Bug: 38418449
Bug: 62028512
Bug: 37955136

Two major changes:
(See the tagged bugs for more motivation, explanation, and discussion.)

1) Added a fourth variant of libclcore.bc, libclcore_debug_g.bc,
which is both compiled with -g -O0 and provides runtime range
checking. This is needed for the debug-context-related CTS tests to
pass when they are compiled with the flags -g -O0.

2) Removed the Element setters and getters defined under RS_G_RUNTIME.
The removed code was neither necessary nor correct. After the
RS_G_RUNTIME-specific setter/getter implementation was removed,
a bug in the default setter/getter implementation showed up with -O0
compilation, as described in Bug 38418449: segfaults caused by
mismatched argument types between the callers and callees of
rsSetElementAtImpl_<T>() and __rsAllocationVStoreXImpl_<T>().
To fix it, I adjusted the argument types of rsSetElementAtImpl_<T>()
and __rsAllocationVStoreXImpl_<T>() defined in
frameworks/rs/driver/runtime/ll64/allocation.ll to match the
clang-generated code from frameworks/rs/driver/runtime/rs_allocation.c.
E.g., char4 was <4 x i8> in ll64/allocation.ll, but was i32 in the code
generated by clang. This caused segfaults on calls to the affected
functions in the final arm64-v8a code. short2 and half2 had the same
issue and received the same fix.

Test: CTS on Angler and X86_64:
Test: With tests compiled using -g -O0 and system property
      debug.rs.debug set to 1;
Test: With tests compiled using -g -O0;
Test: With tests compiled using the default flags;
Test: With tests compiled using the default flags and system property
      debug.rs.debug set to 1.

Test: LLDB tests on X86_64, with no failures other than those already
      known to fail.

Change-Id: I23bd9ab6c7648d2762a77977f08ad3f20e31941c
diff --git a/Android.bp b/Android.bp
index 5835246..78992d8 100644
--- a/Android.bp
+++ b/Android.bp
@@ -183,6 +183,7 @@
     required: [
         "libclcore.bc",
         "libclcore_debug.bc",
+        "libclcore_debug_g.bc",
         "libclcore_g.bc",
         "libcompiler_rt",
     ],
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 60d08be..cf1b869 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -447,6 +447,9 @@
 
     // If we're debugging, use the debug library.
     if (mCtx->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
+        if (ME.hasDebugInfo()) {
+            return SYSLIBPATH_BC"/libclcore_debug_g.bc";
+        }
         return SYSLIBPATH_BC"/libclcore_debug.bc";
     }
 
diff --git a/driver/runtime/Android.mk b/driver/runtime/Android.mk
index 8d7e5bf..71b5ab2 100755
--- a/driver/runtime/Android.mk
+++ b/driver/runtime/Android.mk
@@ -152,14 +152,38 @@
 LOCAL_CFLAGS += $(clcore_cflags)
 LOCAL_CFLAGS += -g -O0
 LOCAL_SRC_FILES := $(clcore_base_files) $(clcore_g_files)
+LOCAL_SRC_FILES_32 := $(clcore_base_files_32)
+LOCAL_SRC_FILES_64 := $(clcore_base_files_64)
 
 ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),arm64))
 LOCAL_CFLAGS_64 += -DARCH_ARM64_HAVE_NEON
 endif
 
 include $(LOCAL_PATH)/build_bc_lib.mk
-rs_g_runtime :=
 
+# Build a debug version of the library with debug info
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libclcore_debug_g.bc
+rs_debug_runtime := 1
+rs_g_runtime := 1
+LOCAL_CFLAGS += $(clcore_cflags)
+LOCAL_CFLAGS += -g -O0
+LOCAL_SRC_FILES := $(clcore_base_files)
+LOCAL_SRC_FILES += rs_abi_debuginfo.c
+LOCAL_SRC_FILES_32 := $(clcore_base_files_32)
+LOCAL_SRC_FILES_64 := $(clcore_base_files_64)
+
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),arm64))
+LOCAL_SRC_FILES_64 += arch/asimd.ll arch/clamp.c
+LOCAL_CFLAGS_64 += -DARCH_ARM64_HAVE_NEON
+else
+LOCAL_SRC_FILES_64 += arch/generic.c
+endif
+
+include $(LOCAL_PATH)/build_bc_lib.mk
+rs_debug_runtime :=
+rs_g_runtime :=
 
 ### Build new versions (librsrt_<ARCH>.bc) as host shared libraries.
 ### These will be used with bcc_compat and the support library.
diff --git a/driver/runtime/ll64/allocation.ll b/driver/runtime/ll64/allocation.ll
index 64b4a8f..94fa11b 100644
--- a/driver/runtime/ll64/allocation.ll
+++ b/driver/runtime/ll64/allocation.ll
@@ -82,10 +82,11 @@
 }
 
 !24 = !{!"char4", !15}
-define void @rsSetElementAtImpl_char4(%struct.rs_allocation* nocapture readonly %a, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @rsSetElementAtImpl_char4(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  store <4 x i8> %val, <4 x i8>* %2, align 4, !tbaa !24
+  %3 = bitcast i32 %val to <4 x i8>
+  store <4 x i8> %3, <4 x i8>* %2, align 4, !tbaa !24
   ret void
 }
 
@@ -144,10 +145,11 @@
 }
 
 !28 = !{!"uchar4", !15}
-define void @rsSetElementAtImpl_uchar4(%struct.rs_allocation* nocapture readonly %a, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @rsSetElementAtImpl_uchar4(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  store <4 x i8> %val, <4 x i8>* %2, align 4, !tbaa !28
+  %3 = bitcast i32 %val to <4 x i8>
+  store <4 x i8> %3, <4 x i8>* %2, align 4, !tbaa !28
   ret void
 }
 
@@ -174,10 +176,11 @@
 }
 
 !30 = !{!"short2", !15}
-define void @rsSetElementAtImpl_short2(%struct.rs_allocation* nocapture readonly %a, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @rsSetElementAtImpl_short2(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  store <2 x i16> %val, <2 x i16>* %2, align 4, !tbaa !30
+  %3 = bitcast i32 %val to <2 x i16>
+  store <2 x i16> %3, <2 x i16>* %2, align 4, !tbaa !30
   ret void
 }
 
@@ -237,10 +240,11 @@
 }
 
 !34 = !{!"ushort2", !15}
-define void @rsSetElementAtImpl_ushort2(%struct.rs_allocation* nocapture readonly %a, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @rsSetElementAtImpl_ushort2(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  store <2 x i16> %val, <2 x i16>* %2, align 4, !tbaa !34
+  %3 = bitcast i32 %val to <2 x i16>
+  store <2 x i16> %3, <2 x i16>* %2, align 4, !tbaa !34
   ret void
 }
 
@@ -685,10 +689,11 @@
 }
 
 !62 = !{!"half2", !15}
-define void @rsSetElementAtImpl_half2(%struct.rs_allocation* nocapture readonly %a.coerce, <2 x half> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @rsSetElementAtImpl_half2(%struct.rs_allocation* nocapture readonly %a.coerce, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x half>*
-  store <2 x half> %val, <2 x half>* %2, align 4, !tbaa !62
+  %3 = bitcast i32 %val to <2 x half>
+  store <2 x half> %3, <2 x half>* %2, align 4, !tbaa !62
   ret void
 }
 
@@ -1026,10 +1031,11 @@
   store <3 x i16> %4, <3 x i16>* %2, align 2
   ret void
 }
-define void @__rsAllocationVStoreXImpl_short2(%struct.rs_allocation* nocapture readonly %a, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @__rsAllocationVStoreXImpl_short2(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  store <2 x i16> %val, <2 x i16>* %2, align 2
+  %3 = bitcast i32 %val to <2 x i16>
+  store <2 x i16> %3, <2 x i16>* %2, align 2
   ret void
 }
 
@@ -1047,17 +1053,19 @@
   store <3 x i16> %4, <3 x i16>* %2, align 2
   ret void
 }
-define void @__rsAllocationVStoreXImpl_ushort2(%struct.rs_allocation* nocapture readonly %a, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @__rsAllocationVStoreXImpl_ushort2(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  store <2 x i16> %val, <2 x i16>* %2, align 2
+  %3 = bitcast i32 %val to <2 x i16>
+  store <2 x i16> %3, <2 x i16>* %2, align 2
   ret void
 }
 
-define void @__rsAllocationVStoreXImpl_char4(%struct.rs_allocation* nocapture readonly %a, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @__rsAllocationVStoreXImpl_char4(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  store <4 x i8> %val, <4 x i8>* %2, align 1
+  %3 = bitcast i32 %val to <4 x i8>
+  store <4 x i8> %3, <4 x i8>* %2, align 1
   ret void
 }
 define void @__rsAllocationVStoreXImpl_char3(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
@@ -1076,10 +1084,11 @@
   ret void
 }
 
-define void @__rsAllocationVStoreXImpl_uchar4(%struct.rs_allocation* nocapture readonly %a, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+define void @__rsAllocationVStoreXImpl_uchar4(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  store <4 x i8> %val, <4 x i8>* %2, align 1
+  %3 = bitcast i32 %val to <4 x i8>
+  store <4 x i8> %3, <4 x i8>* %2, align 1
   ret void
 }
 define void @__rsAllocationVStoreXImpl_uchar3(%struct.rs_allocation* nocapture readonly %a, i32 %val, i32 %x, i32 %y, i32 %z) #1 {
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index 2163e77..075b368 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -61,7 +61,6 @@
     }
 }
 
-#ifndef RS_DEBUG_RUNTIME
 uint8_t*
 rsOffset(rs_allocation a, uint32_t sizeOf, uint32_t x, uint32_t y,
          uint32_t z) {
@@ -73,7 +72,6 @@
                      (z * stride * dimY)];
     return dp;
 }
-#endif
 
 uint8_t*
 rsOffsetNs(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
@@ -134,64 +132,6 @@
     }
 #else  // NOT RS_DEBUG_RUNTIME
 
-// The functions rsSetElementAtImpl_T and rsGetElementAtImpl_T are implemented in bitcode
-// in ll32/allocation.ll and ll64/allocation.ll. To be able to provide debug info for
-// these functions define them here instead, if we are linking with the debug library.
-#ifdef RS_G_RUNTIME
-
-#define SET_ELEMENT_AT_IMPL_TYPE_SIZE(typename, size)                               \
-     void rsSetElementAtImpl_##typename                                             \
-            (rs_allocation a, typename val, uint32_t x, uint32_t y, uint32_t z) {   \
-        typename* val_ptr = (typename*)rsOffset(a, size, x, y, z);                  \
-        *val_ptr = val;                                                             \
-    }
-
-#define GET_ELEMENT_AT_IMPL_TYPE_SIZE(typename, size)                               \
-     typename rsGetElementAtImpl_##typename                                         \
-            (rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {                 \
-        typename *val_ptr = (typename*)rsOffset(a, size, x, y, z);                  \
-        return *val_ptr;                                                            \
-    }
-
-#define SET_ELEMENT_AT_IMPL_TYPE(typename)          \
-    SET_ELEMENT_AT_IMPL_TYPE_SIZE(typename, sizeof(typename))        \
-    SET_ELEMENT_AT_IMPL_TYPE_SIZE(typename##2, sizeof(typename)*2)   \
-    SET_ELEMENT_AT_IMPL_TYPE_SIZE(typename##3, sizeof(typename)*4)   \
-    SET_ELEMENT_AT_IMPL_TYPE_SIZE(typename##4, sizeof(typename)*4)
-
-#define GET_ELEMENT_AT_IMPL_TYPE(typename)          \
-    GET_ELEMENT_AT_IMPL_TYPE_SIZE(typename, sizeof(typename))        \
-    GET_ELEMENT_AT_IMPL_TYPE_SIZE(typename##2, sizeof(typename)*2)   \
-    GET_ELEMENT_AT_IMPL_TYPE_SIZE(typename##3, sizeof(typename)*4)   \
-    GET_ELEMENT_AT_IMPL_TYPE_SIZE(typename##4, sizeof(typename)*4)
-
-#define ELEMENT_AT_IMPL_TYPE(typename)  \
-    SET_ELEMENT_AT_IMPL_TYPE(typename)  \
-    GET_ELEMENT_AT_IMPL_TYPE(typename)
-
-ELEMENT_AT_IMPL_TYPE(char)
-ELEMENT_AT_IMPL_TYPE(uchar)
-ELEMENT_AT_IMPL_TYPE(short)
-ELEMENT_AT_IMPL_TYPE(ushort)
-ELEMENT_AT_IMPL_TYPE(int)
-ELEMENT_AT_IMPL_TYPE(uint)
-ELEMENT_AT_IMPL_TYPE(long)
-ELEMENT_AT_IMPL_TYPE(ulong)
-ELEMENT_AT_IMPL_TYPE(half)
-ELEMENT_AT_IMPL_TYPE(float)
-ELEMENT_AT_IMPL_TYPE(double)
-
-#undef ELEMENT_AT_IMPL_TYPE
-#undef GET_ELEMENT_AT_IMPL_TYPE
-#undef SET_ELEMENT_AT_IMPL_TYPE
-#undef GET_ELEMENT_AT_IMPL_TYPE_SIZE
-#undef SET_ELEMENT_AT_IMPL_TYPE_SIZE
-
-#define SET_ELEMENT_AT_TYPE_IMPL(T, typename) /* nothing */
-#define GET_ELEMENT_AT_TYPE_IMPL(T, typename) /* nothing */
-
-#else  //NOT RS_G_RUNTIME
-
 #define SET_ELEMENT_AT_TYPE_IMPL(T, typename)                                    \
     void                                                                \
     rsSetElementAtImpl_##typename(rs_allocation a, typename val, uint32_t x,   \
@@ -202,8 +142,6 @@
     rsGetElementAtImpl_##typename(rs_allocation a, uint32_t x, uint32_t y, \
                                   uint32_t z);
 
-#endif //RS_G_RUNTIME
-
 #define SET_ELEMENT_AT_TYPE_DEF(T, typename)                                    \
     extern void __attribute__((overloadable))                           \
     rsSetElementAt_##typename(rs_allocation a, T val, uint32_t x) {     \
@@ -402,33 +340,10 @@
     return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
 
-// The functions rsAllocationVLoadXImpl_T and rsAllocationVStoreXImpl_T are implemented in
-// bitcode in ll32/allocation.ll and ll64/allocation.ll. To be able to provide debug info
-// for these functions define them here instead, if we are linking with the debug library.
-#ifdef RS_G_RUNTIME
-
-#define VOP_IMPL(T)                                                             \
-    void __rsAllocationVStoreXImpl_##T                                          \
-            (rs_allocation a, const T val, uint32_t x, uint32_t y, uint32_t z) {\
-        T *val_ptr = (T*)rsOffsetNs(a, x, y, z);                                \
-        local_memcpy(val_ptr, &val, sizeof(T));                                       \
-    }                                                                           \
-    T __rsAllocationVLoadXImpl_##T                                              \
-            (rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {             \
-        T result = {};                                                          \
-        T* val_ptr = (T*)rsOffsetNs(a, x, y, z);                                \
-        local_memcpy(&result, val_ptr, sizeof(T));                                    \
-        return result;                                                          \
-    }
-
-#else
-
 #define VOP_IMPL(T)                                                          \
     extern void __rsAllocationVStoreXImpl_##T(rs_allocation a, const T val, uint32_t x, uint32_t y, uint32_t z); \
     extern T __rsAllocationVLoadXImpl_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
 
-#endif // RS_G_RUNTIME
-
 #define VOP_DEF(T)                                                      \
     extern void __attribute__((overloadable))                           \
     rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x) {       \