Implement vector load/store.

Support loading and storing vec(2,3,4) vectors from/to scalar
buffers of the same component type.

Change-Id: Ice9f96d595c62ffe5e58e3d28b278417cea08fee
diff --git a/driver/runtime/allocation.ll b/driver/runtime/allocation.ll
index e1d6c7e..2b04aef 100644
--- a/driver/runtime/allocation.ll
+++ b/driver/runtime/allocation.ll
@@ -2,6 +2,7 @@
 target triple = "armv7-none-linux-gnueabi"
 
 declare i8* @rsOffset([1 x i32] %a.coerce, i32 %sizeOf, i32 %x, i32 %y, i32 %z)
+declare i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z)
 
 ; The loads and stores in this file are annotated with RenderScript-specific
 ; information for the type based alias analysis, such that the TBAA analysis
@@ -648,6 +649,389 @@
   ret void
 }
 
+
+define <4 x i64> @__rsAllocationVLoadXImpl_long4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i64>*
+  %vec = load <4 x i64>* %vptr, align 8 ; alignment of a single i64 element
+  ret <4 x i64> %vec
+}
+define <3 x i64> @__rsAllocationVLoadXImpl_long3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i64>*
+  %vec = load <3 x i64>* %vptr, align 8
+  ret <3 x i64> %vec
+}
+define <2 x i64> @__rsAllocationVLoadXImpl_long2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i64>*
+  %vec = load <2 x i64>* %vptr, align 8
+  ret <2 x i64> %vec
+}
+
+define <4 x i64> @__rsAllocationVLoadXImpl_ulong4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i64>*
+  %vec = load <4 x i64>* %vptr, align 8 ; alignment of a single i64 element
+  ret <4 x i64> %vec
+}
+define <3 x i64> @__rsAllocationVLoadXImpl_ulong3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i64>*
+  %vec = load <3 x i64>* %vptr, align 8
+  ret <3 x i64> %vec
+}
+define <2 x i64> @__rsAllocationVLoadXImpl_ulong2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i64>*
+  %vec = load <2 x i64>* %vptr, align 8
+  ret <2 x i64> %vec
+}
+
+define <4 x i32> @__rsAllocationVLoadXImpl_int4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i32>*
+  %vec = load <4 x i32>* %vptr, align 4 ; alignment of a single i32 element
+  ret <4 x i32> %vec
+}
+define <3 x i32> @__rsAllocationVLoadXImpl_int3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i32>*
+  %vec = load <3 x i32>* %vptr, align 4
+  ret <3 x i32> %vec
+}
+define <2 x i32> @__rsAllocationVLoadXImpl_int2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i32>*
+  %vec = load <2 x i32>* %vptr, align 4
+  ret <2 x i32> %vec
+}
+
+define <4 x i32> @__rsAllocationVLoadXImpl_uint4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i32>*
+  %vec = load <4 x i32>* %vptr, align 4 ; alignment of a single i32 element
+  ret <4 x i32> %vec
+}
+define <3 x i32> @__rsAllocationVLoadXImpl_uint3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i32>*
+  %vec = load <3 x i32>* %vptr, align 4
+  ret <3 x i32> %vec
+}
+define <2 x i32> @__rsAllocationVLoadXImpl_uint2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i32>*
+  %vec = load <2 x i32>* %vptr, align 4
+  ret <2 x i32> %vec
+}
+
+define <4 x i16> @__rsAllocationVLoadXImpl_short4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i16>*
+  %vec = load <4 x i16>* %vptr, align 2 ; alignment of a single i16 element
+  ret <4 x i16> %vec
+}
+define <3 x i16> @__rsAllocationVLoadXImpl_short3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i16>*
+  %vec = load <3 x i16>* %vptr, align 2
+  ret <3 x i16> %vec
+}
+define <2 x i16> @__rsAllocationVLoadXImpl_short2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i16>*
+  %vec = load <2 x i16>* %vptr, align 2
+  ret <2 x i16> %vec
+}
+
+define <4 x i16> @__rsAllocationVLoadXImpl_ushort4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i16>*
+  %vec = load <4 x i16>* %vptr, align 2 ; alignment of a single i16 element
+  ret <4 x i16> %vec
+}
+define <3 x i16> @__rsAllocationVLoadXImpl_ushort3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i16>*
+  %vec = load <3 x i16>* %vptr, align 2
+  ret <3 x i16> %vec
+}
+define <2 x i16> @__rsAllocationVLoadXImpl_ushort2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i16>*
+  %vec = load <2 x i16>* %vptr, align 2
+  ret <2 x i16> %vec
+}
+
+define <4 x i8> @__rsAllocationVLoadXImpl_char4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i8>*
+  %vec = load <4 x i8>* %vptr, align 1 ; alignment of a single i8 element
+  ret <4 x i8> %vec
+}
+define <3 x i8> @__rsAllocationVLoadXImpl_char3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i8>*
+  %vec = load <3 x i8>* %vptr, align 1
+  ret <3 x i8> %vec
+}
+define <2 x i8> @__rsAllocationVLoadXImpl_char2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i8>*
+  %vec = load <2 x i8>* %vptr, align 1
+  ret <2 x i8> %vec
+}
+
+define <4 x i8> @__rsAllocationVLoadXImpl_uchar4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x i8>*
+  %vec = load <4 x i8>* %vptr, align 1 ; alignment of a single i8 element
+  ret <4 x i8> %vec
+}
+define <3 x i8> @__rsAllocationVLoadXImpl_uchar3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x i8>*
+  %vec = load <3 x i8>* %vptr, align 1
+  ret <3 x i8> %vec
+}
+define <2 x i8> @__rsAllocationVLoadXImpl_uchar2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x i8>*
+  %vec = load <2 x i8>* %vptr, align 1
+  ret <2 x i8> %vec
+}
+
+define <4 x float> @__rsAllocationVLoadXImpl_float4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x float>*
+  %vec = load <4 x float>* %vptr, align 4 ; alignment of a single float element
+  ret <4 x float> %vec
+}
+define <3 x float> @__rsAllocationVLoadXImpl_float3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x float>*
+  %vec = load <3 x float>* %vptr, align 4
+  ret <3 x float> %vec
+}
+define <2 x float> @__rsAllocationVLoadXImpl_float2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x float>*
+  %vec = load <2 x float>* %vptr, align 4
+  ret <2 x float> %vec
+}
+
+define <4 x double> @__rsAllocationVLoadXImpl_double4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %vptr = bitcast i8* %addr to <4 x double>*
+  %vec = load <4 x double>* %vptr, align 8 ; alignment of a single double element
+  ret <4 x double> %vec
+}
+define <3 x double> @__rsAllocationVLoadXImpl_double3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <3 x double>*
+  %vec = load <3 x double>* %vptr, align 8
+  ret <3 x double> %vec
+}
+define <2 x double> @__rsAllocationVLoadXImpl_double2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %addr = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %vptr = bitcast i8* %addr to <2 x double>*
+  %vec = load <2 x double>* %vptr, align 8
+  ret <2 x double> %vec
+}
+
+
+define void @__rsAllocationVStoreXImpl_long4([1 x i32] %a.coerce, <4 x i64> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i64>*
+  store <4 x i64> %val, <4 x i64>* %2, align 8 ; alignment of a single i64 element
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_long3([1 x i32] %a.coerce, <3 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i64>*
+  store <3 x i64> %val, <3 x i64>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_long2([1 x i32] %a.coerce, <2 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i64>*
+  store <2 x i64> %val, <2 x i64>* %2, align 8
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_ulong4([1 x i32] %a.coerce, <4 x i64> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i64>*
+  store <4 x i64> %val, <4 x i64>* %2, align 8 ; alignment of a single i64 element
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ulong3([1 x i32] %a.coerce, <3 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i64>*
+  store <3 x i64> %val, <3 x i64>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ulong2([1 x i32] %a.coerce, <2 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i64>*
+  store <2 x i64> %val, <2 x i64>* %2, align 8
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_int4([1 x i32] %a.coerce, <4 x i32> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i32>*
+  store <4 x i32> %val, <4 x i32>* %2, align 4 ; address is element-granular, so claim only i32 alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_int3([1 x i32] %a.coerce, <3 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i32>*
+  store <3 x i32> %val, <3 x i32>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_int2([1 x i32] %a.coerce, <2 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i32>*
+  store <2 x i32> %val, <2 x i32>* %2, align 4
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_uint4([1 x i32] %a.coerce, <4 x i32> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i32>*
+  store <4 x i32> %val, <4 x i32>* %2, align 4 ; address is element-granular, so claim only i32 alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uint3([1 x i32] %a.coerce, <3 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i32>*
+  store <3 x i32> %val, <3 x i32>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uint2([1 x i32] %a.coerce, <2 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i32>*
+  store <2 x i32> %val, <2 x i32>* %2, align 4
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_short4([1 x i32] %a.coerce, <4 x i16> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i16>*
+  store <4 x i16> %val, <4 x i16>* %2, align 2 ; address is element-granular, so claim only i16 alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_short3([1 x i32] %a.coerce, <3 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i16>*
+  store <3 x i16> %val, <3 x i16>* %2, align 2
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_short2([1 x i32] %a.coerce, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i16>*
+  store <2 x i16> %val, <2 x i16>* %2, align 2
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_ushort4([1 x i32] %a.coerce, <4 x i16> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i16>*
+  store <4 x i16> %val, <4 x i16>* %2, align 2 ; address is element-granular, so claim only i16 alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ushort3([1 x i32] %a.coerce, <3 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i16>*
+  store <3 x i16> %val, <3 x i16>* %2, align 2
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ushort2([1 x i32] %a.coerce, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i16>*
+  store <2 x i16> %val, <2 x i16>* %2, align 2
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_char4([1 x i32] %a.coerce, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i8>*
+  store <4 x i8> %val, <4 x i8>* %2, align 1 ; address is element-granular, so claim only i8 alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_char3([1 x i32] %a.coerce, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i8>*
+  store <3 x i8> %val, <3 x i8>* %2, align 1
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_char2([1 x i32] %a.coerce, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i8>*
+  store <2 x i8> %val, <2 x i8>* %2, align 1
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_uchar4([1 x i32] %a.coerce, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x i8>*
+  store <4 x i8> %val, <4 x i8>* %2, align 1 ; address is element-granular, so claim only i8 alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uchar3([1 x i32] %a.coerce, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i8>*
+  store <3 x i8> %val, <3 x i8>* %2, align 1
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uchar2([1 x i32] %a.coerce, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i8>*
+  store <2 x i8> %val, <2 x i8>* %2, align 1
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_float4([1 x i32] %a.coerce, <4 x float> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x float>*
+  store <4 x float> %val, <4 x float>* %2, align 4 ; address is element-granular, so claim only float alignment
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_float3([1 x i32] %a.coerce, <3 x float> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x float>*
+  store <3 x float> %val, <3 x float>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_float2([1 x i32] %a.coerce, <2 x float> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x float>*
+  store <2 x float> %val, <2 x float>* %2, align 4
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_double4([1 x i32] %a.coerce, <4 x double> %val, i32 %x, i32 %y, i32 %z) #1 { ; #1, not readonly #0: this function writes memory
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10 ; byte address of element (x, y, z)
+  %2 = bitcast i8* %1 to <4 x double>*
+  store <4 x double> %val, <4 x double>* %2, align 8 ; alignment of a single double element
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_double3([1 x i32] %a.coerce, <3 x double> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x double>*
+  store <3 x double> %val, <3 x double>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_double2([1 x i32] %a.coerce, <2 x double> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x double>*
+  store <2 x double> %val, <2 x double>* %2, align 8
+  ret void
+}
+
+
 attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index a307776..0722680 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -99,7 +99,6 @@
         rsGetElementAt_##T(a, &tmp, x, y, z);                           \
         return tmp;                                                     \
     }
-
 #else
 
 uint8_t*
@@ -114,6 +113,18 @@
     return dp;
 }
 
+/* Address of element (x, y, z) in LOD 0; unlike rsOffset(), the element size
+ * comes from the allocation (elementSizeBytes) instead of a sizeOf argument. */
+uint8_t*
+rsOffsetNs(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
+    const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
+    const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;
+    const uint32_t sizeOf = alloc->mHal.state.elementSizeBytes;
+    return &p[(sizeOf * x) + (y * stride) + (z * stride * dimY)];
+}
+
 #define ELEMENT_AT(T)                                                   \
                                                                         \
     void                                                                \
@@ -290,3 +301,66 @@
     return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
 
+
+#define VOP(T) /* Emit rsAllocationVLoadX_T / rsAllocationVStoreX_T wrappers for vector type T. */ \
+    extern void __rsAllocationVStoreXImpl_##T(rs_allocation a, const T val, uint32_t x, uint32_t y, uint32_t z); \
+    extern T __rsAllocationVLoadXImpl_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z); \
+    /* Each overload forwards to the *Impl function, passing 0 for an omitted y or z. */ \
+    extern void __attribute__((overloadable))                           \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x) {       \
+        __rsAllocationVStoreXImpl_##T(a, val, x, 0, 0);                 \
+    }                                                                   \
+    extern void __attribute__((overloadable))                           \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
+        __rsAllocationVStoreXImpl_##T(a, val, x, y, 0);                 \
+    }                                                                   \
+    extern void __attribute__((overloadable))                           \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z) { \
+        __rsAllocationVStoreXImpl_##T(a, val, x, y, z);                 \
+    }                                                                   \
+    extern T __attribute__((overloadable))                              \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x) {               \
+        return __rsAllocationVLoadXImpl_##T(a, x, 0, 0);                \
+    }                                                                   \
+    extern T __attribute__((overloadable))                              \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y) {   \
+        return __rsAllocationVLoadXImpl_##T(a, x, y, 0);                \
+    }                                                                   \
+    extern T __attribute__((overloadable))                              \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) { \
+        return __rsAllocationVLoadXImpl_##T(a, x, y, z);                \
+    }
+
+VOP(char2)   /* one instantiation per 2-, 3- and 4-lane vector type */
+VOP(char3)
+VOP(char4)
+VOP(uchar2)
+VOP(uchar3)
+VOP(uchar4)
+VOP(short2)
+VOP(short3)
+VOP(short4)
+VOP(ushort2)
+VOP(ushort3)
+VOP(ushort4)
+VOP(int2)
+VOP(int3)
+VOP(int4)
+VOP(uint2)
+VOP(uint3)
+VOP(uint4)
+VOP(long2)
+VOP(long3)
+VOP(long4)
+VOP(ulong2)
+VOP(ulong3)
+VOP(ulong4)
+VOP(float2)
+VOP(float3)
+VOP(float4)
+VOP(double2)
+VOP(double3)
+VOP(double4)
+
+#undef VOP
+