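rs_matrix types are not guaranteed to be 16-byte aligned, so vector loads from them must be marked explicitly as align 4 (a load with no alignment specified assumes the vector type's ABI alignment).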

Change-Id: I30742a23fe87db7cb68d2c97bc022f7ee418ef82
diff --git a/lib/Renderscript/runtime/matrix.ll b/lib/Renderscript/runtime/matrix.ll
index e559d99..c56405d 100644
--- a/lib/Renderscript/runtime/matrix.ll
+++ b/lib/Renderscript/runtime/matrix.ll
@@ -25,13 +25,13 @@
 
   %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 6
   %pz2 = bitcast float* %pz to <3 x float>*
-  %zm2 = load <3 x float>* %pz2
+  %zm2 = load <3 x float>* %pz2, align 4
   %zm = shufflevector <3 x float> %zm2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 
   %a1 = fmul <4 x float> %x, %xm
@@ -56,10 +56,10 @@
 
   %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fmul <4 x float> %y, %ym
@@ -85,16 +85,16 @@
 
   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2
+  %zm = load <4 x float>* %pz2, align 4
   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2
+  %wm = load <4 x float>* %pw2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fmul <4 x float> %y, %ym
@@ -121,16 +121,16 @@
 
   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2
+  %zm = load <4 x float>* %pz2, align 4
   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2
+  %wm = load <4 x float>* %pw2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fadd <4 x float> %wm, %a1
@@ -154,13 +154,13 @@
 
   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2
+  %wm = load <4 x float>* %pw2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fadd <4 x float> %wm, %a1