Merge "llvm matrix ops"
diff --git a/lib/ScriptCRT/Android.mk b/lib/ScriptCRT/Android.mk
index 892f7b4..88d6ce1 100644
--- a/lib/ScriptCRT/Android.mk
+++ b/lib/ScriptCRT/Android.mk
@@ -28,7 +28,9 @@
 
 # Hand-written bitcode for the library
 clcore_ll_files := \
-    clamp.ll
+    clamp.ll \
+    convert.ll \
+    matrix.ll
 
 include $(BUILD_SYSTEM)/base_rules.mk
 
diff --git a/lib/ScriptCRT/convert.ll b/lib/ScriptCRT/convert.ll
new file mode 100644
index 0000000..e590ad1
--- /dev/null
+++ b/lib/ScriptCRT/convert.ll
@@ -0,0 +1,256 @@
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+define <2 x float> @_Z14convert_float2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <2 x i8> %in to <2 x float>  ; convert_float2(uchar2): each lane is read as an unsigned 8-bit integer
+  ret <2 x float> %as_float
+}
+
+define <3 x float> @_Z14convert_float3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <3 x i8> %in to <3 x float>  ; convert_float3(uchar3): each lane is read as an unsigned 8-bit integer
+  ret <3 x float> %as_float
+}
+
+define <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <4 x i8> %in to <4 x float>  ; convert_float4(uchar4): each lane is read as an unsigned 8-bit integer
+  ret <4 x float> %as_float
+}
+
+define <2 x float> @_Z14convert_float2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <2 x i8> %in to <2 x float>  ; convert_float2(char2): each lane is read as a signed 8-bit integer
+  ret <2 x float> %as_float
+}
+
+define <3 x float> @_Z14convert_float3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <3 x i8> %in to <3 x float>  ; convert_float3(char3): each lane is read as a signed 8-bit integer
+  ret <3 x float> %as_float
+}
+
+define <4 x float> @_Z14convert_float4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <4 x i8> %in to <4 x float>  ; convert_float4(char4): each lane is read as a signed 8-bit integer
+  ret <4 x float> %as_float
+}
+
+define <2 x float> @_Z14convert_float2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <2 x i16> %in to <2 x float>  ; convert_float2(ushort2): each lane is read as an unsigned 16-bit integer
+  ret <2 x float> %as_float
+}
+
+define <3 x float> @_Z14convert_float3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <3 x i16> %in to <3 x float>  ; convert_float3(ushort3): each lane is read as an unsigned 16-bit integer
+  ret <3 x float> %as_float
+}
+
+define <4 x float> @_Z14convert_float4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <4 x i16> %in to <4 x float>  ; convert_float4(ushort4): each lane is read as an unsigned 16-bit integer
+  ret <4 x float> %as_float
+}
+
+define <2 x float> @_Z14convert_float2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <2 x i16> %in to <2 x float>  ; convert_float2(short2): each lane is read as a signed 16-bit integer
+  ret <2 x float> %as_float
+}
+
+define <3 x float> @_Z14convert_float3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <3 x i16> %in to <3 x float>  ; convert_float3(short3): each lane is read as a signed 16-bit integer
+  ret <3 x float> %as_float
+}
+
+define <4 x float> @_Z14convert_float4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <4 x i16> %in to <4 x float>  ; convert_float4(short4): each lane is read as a signed 16-bit integer
+  ret <4 x float> %as_float
+}
+
+define <2 x float> @_Z14convert_float2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <2 x i32> %in to <2 x float>  ; convert_float2(uint2): each lane is read as an unsigned 32-bit integer
+  ret <2 x float> %as_float
+}
+
+define <3 x float> @_Z14convert_float3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <3 x i32> %in to <3 x float>  ; convert_float3(uint3): each lane is read as an unsigned 32-bit integer
+  ret <3 x float> %as_float
+}
+
+define <4 x float> @_Z14convert_float4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  %as_float = uitofp <4 x i32> %in to <4 x float>  ; convert_float4(uint4): each lane is read as an unsigned 32-bit integer
+  ret <4 x float> %as_float
+}
+
+define <2 x float> @_Z14convert_float2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <2 x i32> %in to <2 x float>  ; convert_float2(int2): each lane is read as a signed 32-bit integer
+  ret <2 x float> %as_float
+}
+
+define <3 x float> @_Z14convert_float3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <3 x i32> %in to <3 x float>  ; convert_float3(int3): each lane is read as a signed 32-bit integer
+  ret <3 x float> %as_float
+}
+
+define <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  %as_float = sitofp <4 x i32> %in to <4 x float>  ; convert_float4(int4): each lane is read as a signed 32-bit integer
+  ret <4 x float> %as_float
+}
+
+define <2 x float> @_Z14convert_float2Dv2_f(<2 x float> %same) nounwind readnone alwaysinline {
+  ret <2 x float> %same  ; convert_float2(float2): identity, the operand is returned untouched
+}
+
+define <3 x float> @_Z14convert_float3Dv3_f(<3 x float> %same) nounwind readnone alwaysinline {
+  ret <3 x float> %same  ; convert_float3(float3): identity, the operand is returned untouched
+}
+
+define <4 x float> @_Z14convert_float4Dv4_f(<4 x float> %same) nounwind readnone alwaysinline {
+  ret <4 x float> %same  ; convert_float4(float4): identity, the operand is returned untouched
+}
+
+;---
+
+define <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %as_i32 = fptoui <4 x float> %in to <4 x i32>  ; convert_uchar4(float4): unsigned float -> i32 per lane, then narrowed in two steps
+  %as_i16 = trunc <4 x i32> %as_i32 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; keep only the 4 defined lanes
+  ret <4 x i8> %packed
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %in4 = shufflevector <3 x float> %in, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; convert_uchar3(float3): widen to 4 lanes; index 3 selects from the undef operand
+  %as_i32 = fptoui <4 x float> %in4 to <4 x i32>
+  %as_i16 = trunc <4 x i32> %as_i32 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>  ; keep only the 3 defined lanes
+  ret <3 x i8> %packed
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %in4 = shufflevector <2 x float> %in, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; convert_uchar2(float2): widen to 4 lanes; indices 2-3 select from the undef operand
+  %as_i32 = fptoui <4 x float> %in4 to <4 x i32>
+  %as_i16 = trunc <4 x i32> %as_i32 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <2 x i32> <i32 0, i32 1>  ; keep only the 2 defined lanes
+  ret <2 x i8> %packed
+}
+
+define <4 x i8> @_Z14convert_uchar4Dv4_h(<4 x i8> %same) nounwind readnone alwaysinline {
+  ret <4 x i8> %same  ; convert_uchar4(uchar4): identity, the operand is returned untouched
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_h(<3 x i8> %same) nounwind readnone alwaysinline {
+  ret <3 x i8> %same  ; convert_uchar3(uchar3): identity, the operand is returned untouched
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_h(<2 x i8> %same) nounwind readnone alwaysinline {
+  ret <2 x i8> %same  ; convert_uchar2(uchar2): identity, the operand is returned untouched
+}
+
+define <4 x i8> @_Z14convert_uchar4Dv4_c(<4 x i8> %bits) nounwind readnone alwaysinline {
+  ret <4 x i8> %bits  ; convert_uchar4(char4): source and result are both <4 x i8>, so the bits pass through unchanged
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_c(<3 x i8> %bits) nounwind readnone alwaysinline {
+  ret <3 x i8> %bits  ; convert_uchar3(char3): source and result are both <3 x i8>, so the bits pass through unchanged
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_c(<2 x i8> %bits) nounwind readnone alwaysinline {
+  ret <2 x i8> %bits  ; convert_uchar2(char2): source and result are both <2 x i8>, so the bits pass through unchanged
+}
+
+
+define <4 x i8> @_Z14convert_uchar4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  %wide_i16 = shufflevector <4 x i16> %in, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; convert_uchar4(ushort4): pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>  ; keeps the low 8 bits of every lane
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; keep only the 4 defined lanes
+  ret <4 x i8> %packed
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  %wide_i16 = shufflevector <3 x i16> %in, <3 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>  ; convert_uchar3(ushort3): pad to 8 lanes; mask indices 3-5 address the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>  ; keeps the low 8 bits of every lane
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>  ; keep only the 3 defined lanes
+  ret <3 x i8> %packed
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  %wide_i16 = shufflevector <2 x i16> %in, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>  ; convert_uchar2(ushort2): pad to 8 lanes; mask index 2 addresses the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>  ; keeps the low 8 bits of every lane
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <2 x i32> <i32 0, i32 1>  ; keep only the 2 defined lanes
+  ret <2 x i8> %packed
+}
+
+define <4 x i8> @_Z14convert_uchar4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  %wide_i16 = shufflevector <4 x i16> %in, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; convert_uchar4(short4): pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>  ; keeps the low 8 bits of every lane
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; keep only the 4 defined lanes
+  ret <4 x i8> %packed
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  %wide_i16 = shufflevector <3 x i16> %in, <3 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>  ; convert_uchar3(short3): pad to 8 lanes; mask indices 3-5 address the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>  ; keeps the low 8 bits of every lane
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>  ; keep only the 3 defined lanes
+  ret <3 x i8> %packed
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  %wide_i16 = shufflevector <2 x i16> %in, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>  ; convert_uchar2(short2): pad to 8 lanes; mask index 2 addresses the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>  ; keeps the low 8 bits of every lane
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <2 x i32> <i32 0, i32 1>  ; keep only the 2 defined lanes
+  ret <2 x i8> %packed
+}
+
+
+define <4 x i8> @_Z14convert_uchar4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  %as_i16 = trunc <4 x i32> %in to <4 x i16>  ; convert_uchar4(uint4): narrow i32 -> i16 -> i8, keeping the low bits of every lane
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; keep only the 4 defined lanes
+  ret <4 x i8> %packed
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  %in4 = shufflevector <3 x i32> %in, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; convert_uchar3(uint3): widen to 4 lanes; index 3 selects from the undef operand
+  %as_i16 = trunc <4 x i32> %in4 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>  ; keep only the 3 defined lanes
+  ret <3 x i8> %packed
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  %in4 = shufflevector <2 x i32> %in, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; convert_uchar2(uint2): widen to 4 lanes; indices 2-3 select from the undef operand
+  %as_i16 = trunc <4 x i32> %in4 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <2 x i32> <i32 0, i32 1>  ; keep only the 2 defined lanes
+  ret <2 x i8> %packed
+}
+
+define <4 x i8> @_Z14convert_uchar4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  %as_i16 = trunc <4 x i32> %in to <4 x i16>  ; convert_uchar4(int4): narrow i32 -> i16 -> i8, keeping the low bits of every lane
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; keep only the 4 defined lanes
+  ret <4 x i8> %packed
+}
+
+define <3 x i8> @_Z14convert_uchar3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  %in4 = shufflevector <3 x i32> %in, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; convert_uchar3(int3): widen to 4 lanes; index 3 selects from the undef operand
+  %as_i16 = trunc <4 x i32> %in4 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>  ; keep only the 3 defined lanes
+  ret <3 x i8> %packed
+}
+
+define <2 x i8> @_Z14convert_uchar2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  %in4 = shufflevector <2 x i32> %in, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; convert_uchar2(int2): widen to 4 lanes; indices 2-3 select from the undef operand
+  %as_i16 = trunc <4 x i32> %in4 to <4 x i16>
+  %wide_i16 = shufflevector <4 x i16> %as_i16, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; pad to 8 lanes; lanes 4-7 come from the undef operand
+  %wide_i8 = trunc <8 x i16> %wide_i16 to <8 x i8>
+  %packed = shufflevector <8 x i8> %wide_i8, <8 x i8> undef, <2 x i32> <i32 0, i32 1>  ; keep only the 2 defined lanes
+  ret <2 x i8> %packed
+}
+
diff --git a/lib/ScriptCRT/matrix.ll b/lib/ScriptCRT/matrix.ll
new file mode 100644
index 0000000..1b60566
--- /dev/null
+++ b/lib/ScriptCRT/matrix.ll
@@ -0,0 +1,176 @@
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+
+%struct.rs_matrix4x4 = type { [16 x float] }  ; 16 floats; read in this file as <4 x float> groups starting at elements 0, 4, 8 and 12
+%struct.rs_matrix3x3 = type { [9 x float] }  ; 9 floats; read in this file as groups starting at elements 0, 3 and 6
+%struct.rs_matrix2x2 = type { [4 x float] }  ; 4 floats; declared here but not referenced by any function in this file
+
+define internal <4 x float> @smear_f(float %in) nounwind readnone {  ; splat helper: returns <in, in, in, in>; it never touches memory, hence readnone
+  %lane0 = insertelement <4 x float> undef, float %in, i32 0
+  %lane1 = insertelement <4 x float> %lane0, float %in, i32 1
+  %lane2 = insertelement <4 x float> %lane1, float %in, i32 2
+  %splat = insertelement <4 x float> %lane2, float %in, i32 3
+  ret <4 x float> %splat
+}
+
+
+define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {  ; rsMatrixMultiply(const rs_matrix3x3*, float3)
+  %x0 = extractelement <3 x float> %in, i32 0
+  %x = call <4 x float> @smear_f(float %x0) nounwind
+  %y0 = extractelement <3 x float> %in, i32 1
+  %y = call <4 x float> @smear_f(float %y0) nounwind
+  %z0 = extractelement <3 x float> %in, i32 2
+  %z = call <4 x float> @smear_f(float %z0) nounwind
+  ; Element groups start at m[0], m[3] and m[6]. The struct is only float-aligned, so each vector load states "align 4" instead of defaulting to the ABI vector alignment.
+  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
+  %px2 = bitcast float* %px to <4 x float>*
+  %xm = load <4 x float>* %px2, align 4
+  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
+  %py2 = bitcast float* %py to <4 x float>*
+  %ym = load <4 x float>* %py2, align 4
+  %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 6
+  %pz2 = bitcast float* %pz to <3 x float>*
+  %zm2 = load <3 x float>* %pz2, align 4  ; only 3 lanes: m[6..8] are the last elements of the 9-float array
+  %zm = shufflevector <3 x float> %zm2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; result = x * m[0..] + y * m[3..] + z * m[6..]; lane 3 of the accumulator is scratch and is dropped below.
+  %a1 = fmul <4 x float> %x, %xm
+  %a2 = fmul <4 x float> %y, %ym
+  %a3 = fadd <4 x float> %a1, %a2
+  %a4 = fmul <4 x float> %z, %zm
+  %a5 = fadd <4 x float> %a4, %a3
+  %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %a6
+}
+
+define <3 x float> @_Z16rsMatrixMultiplyP12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
+  %forwarded = call <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind  ; non-const overload: forwards to the const-pointer version
+  ret <3 x float> %forwarded
+}
+
+define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {  ; rsMatrixMultiply(const rs_matrix3x3*, float2)
+  %x0 = extractelement <2 x float> %in, i32 0
+  %x = call <4 x float> @smear_f(float %x0) nounwind
+  %y0 = extractelement <2 x float> %in, i32 1
+  %y = call <4 x float> @smear_f(float %y0) nounwind
+  ; Element groups start at m[0] and m[3]. The struct is only float-aligned, so each vector load states "align 4" instead of defaulting to the ABI vector alignment.
+  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
+  %px2 = bitcast float* %px to <4 x float>*
+  %xm = load <4 x float>* %px2, align 4
+  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
+  %py2 = bitcast float* %py to <4 x float>*
+  %ym = load <4 x float>* %py2, align 4
+  ; result = x * m[0..] + y * m[3..]; the m[6..] group is not used, and lane 3 of the accumulator is dropped below.
+  %a1 = fmul <4 x float> %x, %xm
+  %a2 = fmul <4 x float> %y, %ym
+  %a3 = fadd <4 x float> %a1, %a2
+  %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %a4
+}
+
+define <3 x float> @_Z16rsMatrixMultiplyP12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
+  %forwarded = call <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind  ; non-const overload: forwards to the const-pointer version
+  ret <3 x float> %forwarded
+}
+
+define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {  ; rsMatrixMultiply(const rs_matrix4x4*, float4)
+  %x0 = extractelement <4 x float> %in, i32 0
+  %x = call <4 x float> @smear_f(float %x0) nounwind
+  %y0 = extractelement <4 x float> %in, i32 1
+  %y = call <4 x float> @smear_f(float %y0) nounwind
+  %z0 = extractelement <4 x float> %in, i32 2
+  %z = call <4 x float> @smear_f(float %z0) nounwind
+  %w0 = extractelement <4 x float> %in, i32 3
+  %w = call <4 x float> @smear_f(float %w0) nounwind
+  ; Element groups start at m[0], m[4], m[8] and m[12]. The struct type only guarantees float alignment, so each vector load states "align 4" instead of defaulting to the ABI vector alignment.
+  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
+  %px2 = bitcast float* %px to <4 x float>*
+  %xm = load <4 x float>* %px2, align 4
+  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
+  %py2 = bitcast float* %py to <4 x float>*
+  %ym = load <4 x float>* %py2, align 4
+  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
+  %pz2 = bitcast float* %pz to <4 x float>*
+  %zm = load <4 x float>* %pz2, align 4
+  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
+  %pw2 = bitcast float* %pw to <4 x float>*
+  %wm = load <4 x float>* %pw2, align 4
+  ; result = x * m[0..3] + y * m[4..7] + z * m[8..11] + w * m[12..15]
+  %a1 = fmul <4 x float> %x, %xm
+  %a2 = fmul <4 x float> %y, %ym
+  %a3 = fadd <4 x float> %a1, %a2
+  %a4 = fmul <4 x float> %z, %zm
+  %a5 = fadd <4 x float> %a3, %a4
+  %a6 = fmul <4 x float> %w, %wm
+  %a7 = fadd <4 x float> %a5, %a6
+  ret <4 x float> %a7
+}
+
+define <4 x float> @_Z16rsMatrixMultiplyP12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
+  %forwarded = call <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind  ; non-const overload: forwards to the const-pointer version
+  ret <4 x float> %forwarded
+}
+
+define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {  ; rsMatrixMultiply(const rs_matrix4x4*, float3)
+  %x0 = extractelement <3 x float> %in, i32 0
+  %x = call <4 x float> @smear_f(float %x0) nounwind
+  %y0 = extractelement <3 x float> %in, i32 1
+  %y = call <4 x float> @smear_f(float %y0) nounwind
+  %z0 = extractelement <3 x float> %in, i32 2
+  %z = call <4 x float> @smear_f(float %z0) nounwind
+  ; Element groups start at m[0], m[4], m[8] and m[12]. The struct type only guarantees float alignment, so each vector load states "align 4" instead of defaulting to the ABI vector alignment.
+  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
+  %px2 = bitcast float* %px to <4 x float>*
+  %xm = load <4 x float>* %px2, align 4
+  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
+  %py2 = bitcast float* %py to <4 x float>*
+  %ym = load <4 x float>* %py2, align 4
+  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
+  %pz2 = bitcast float* %pz to <4 x float>*
+  %zm = load <4 x float>* %pz2, align 4
+  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
+  %pw2 = bitcast float* %pw to <4 x float>*
+  %wm = load <4 x float>* %pw2, align 4
+  ; result = m[12..15] + x * m[0..3] + y * m[4..7] + z * m[8..11]; the m[12..15] group is added unscaled, as if the missing w component were 1.
+  %a1 = fmul <4 x float> %x, %xm
+  %a2 = fadd <4 x float> %wm, %a1
+  %a3 = fmul <4 x float> %y, %ym
+  %a4 = fadd <4 x float> %a2, %a3
+  %a5 = fmul <4 x float> %z, %zm
+  %a6 = fadd <4 x float> %a4, %a5
+  ret <4 x float> %a6
+}
+
+define <4 x float> @_Z16rsMatrixMultiplyP12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
+  %forwarded = call <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind  ; non-const overload: forwards to the const-pointer version
+  ret <4 x float> %forwarded
+}
+
+define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {  ; rsMatrixMultiply(const rs_matrix4x4*, float2)
+  %x0 = extractelement <2 x float> %in, i32 0
+  %x = call <4 x float> @smear_f(float %x0) nounwind
+  %y0 = extractelement <2 x float> %in, i32 1
+  %y = call <4 x float> @smear_f(float %y0) nounwind
+  ; Element groups start at m[0], m[4] and m[12]. The struct type only guarantees float alignment, so each vector load states "align 4" instead of defaulting to the ABI vector alignment.
+  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
+  %px2 = bitcast float* %px to <4 x float>*
+  %xm = load <4 x float>* %px2, align 4
+  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
+  %py2 = bitcast float* %py to <4 x float>*
+  %ym = load <4 x float>* %py2, align 4
+  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
+  %pw2 = bitcast float* %pw to <4 x float>*
+  %wm = load <4 x float>* %pw2, align 4
+  ; result = m[12..15] + x * m[0..3] + y * m[4..7]; the m[8..11] group is never loaded and the m[12..15] group is added unscaled.
+  %a1 = fmul <4 x float> %x, %xm
+  %a2 = fadd <4 x float> %wm, %a1
+  %a3 = fmul <4 x float> %y, %ym
+  %a4 = fadd <4 x float> %a2, %a3
+  ret <4 x float> %a4
+}
+
+define <4 x float> @_Z16rsMatrixMultiplyP12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
+  %forwarded = call <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind  ; non-const overload: forwards to the const-pointer version
+  ret <4 x float> %forwarded
+}
+
diff --git a/lib/ScriptCRT/rs_cl.c b/lib/ScriptCRT/rs_cl.c
index f0390ac..9d1f402 100644
--- a/lib/ScriptCRT/rs_cl.c
+++ b/lib/ScriptCRT/rs_cl.c
@@ -28,12 +28,12 @@
                         CVT_FUNC_2(type, float)
 
 CVT_FUNC(char)
-CVT_FUNC(uchar)
+// CVT_FUNC(uchar) is disabled: convert_uchar2/3/4 are now hand-written in convert.ll
 CVT_FUNC(short)
 CVT_FUNC(ushort)
 CVT_FUNC(int)
 CVT_FUNC(uint)
-CVT_FUNC(float)
+// CVT_FUNC(float) is disabled: convert_float2/3/4 are now hand-written in convert.ll
 
 // Float ops, 6.11.2
 
diff --git a/lib/ScriptCRT/rs_core.c b/lib/ScriptCRT/rs_core.c
index f5dd1b0..fbd2d24 100644
--- a/lib/ScriptCRT/rs_core.c
+++ b/lib/ScriptCRT/rs_core.c
@@ -346,7 +346,7 @@
     return m->m[row * 2 + col];
 }
 
-
+/*
 extern float4 __attribute__((overloadable))
 rsMatrixMultiply(const rs_matrix4x4 *m, float4 in) {
     float4 ret;
@@ -402,7 +402,6 @@
     return rsMatrixMultiply((const rs_matrix3x3 *)m, in);
 }
 
-
 extern float3 __attribute__((overloadable))
 rsMatrixMultiply(const rs_matrix3x3 *m, float2 in) {
     float3 ret;
@@ -415,6 +414,7 @@
 rsMatrixMultiply(rs_matrix3x3 *m, float2 in) {
     return rsMatrixMultiply((const rs_matrix3x3 *)m, in);
 }
+*/
 
 extern float2 __attribute__((overloadable))
 rsMatrixMultiply(const rs_matrix2x2 *m, float2 in) {