__set_neon_cumulative_sat() modifies the contents of the QC flag, and
some intrinsics do so too: this patch adds an explicit dependency to
the asm statement, to avoid code reordering or removal.

When writing QC, the asm statement now has a fake input dependency,
which is the output of the intrinsic being tested. The
__set_neon_cumulative_sat macro has to be modified so that it can
accept all the possible input types.

Update the generic code in ref_v_binary_sat_op.c and ref_v_unary_sat_op.c
accordingly, as well as all the tests involving QC.
diff --git a/ref_vqshlu_n.c b/ref_vqshlu_n.c
index b72261c..27d53de 100644
--- a/ref_vqshlu_n.c
+++ b/ref_vqshlu_n.c
@@ -40,15 +40,15 @@
 FNNAME (INSN)
 {
   /* Basic test: v2=vqshlu_n(v1,v), then store the result.  */
-#define TEST_VQSHLU_N2(INSN, Q, T1, T2, T3, T4, W, N, V)	\
-  Set_Neon_Cumulative_Sat(0);					\
-  VECT_VAR(vector_res, T3, W, N) =				\
-    INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N),		\
-			V);					\
-  vst1##Q##_##T4##W(VECT_VAR(result, T3, W, N),			\
-		    VECT_VAR(vector_res, T3, W, N));		\
-  dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_n_##T2##W),	\
-			   xSTR(T1), W, N)
+#define TEST_VQSHLU_N2(INSN, Q, T1, T2, T3, T4, W, N, V)		\
+  Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T3, W, N));		\
+  VECT_VAR(vector_res, T3, W, N) =					\
+    INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N),			\
+			V);						\
+    vst1##Q##_##T4##W(VECT_VAR(result, T3, W, N),			\
+		      VECT_VAR(vector_res, T3, W, N));			\
+    dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_n_##T2##W),	\
+			       xSTR(T1), W, N)
 
   /* Two auxliary macros are necessary to expand INSN */
 #define TEST_VQSHLU_N1(INSN, Q, T1, T2, T3, T4, W, N, V)	\