selftests/powerpc: Test preservation of FPU and VMX regs across preemption

Spawn many threads which loop in assembly, checking that the contents of the
FPU and VMX registers are preserved across preemption.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
diff --git a/tools/testing/selftests/powerpc/math/vmx_asm.S b/tools/testing/selftests/powerpc/math/vmx_asm.S
index 4ce6421..1b8c248 100644
--- a/tools/testing/selftests/powerpc/math/vmx_asm.S
+++ b/tools/testing/selftests/powerpc/math/vmx_asm.S
@@ -9,6 +9,7 @@
 
 #include "../basic_asm.h"
 
+# POS MUST BE 16 ALIGNED!
 #define PUSH_VMX(pos,reg) \
 	li	reg,pos; \
 	stvx	v20,reg,sp; \
@@ -35,6 +36,7 @@
 	addi	reg,reg,16; \
 	stvx	v31,reg,sp;
 
+# POS MUST BE 16 ALIGNED!
 #define POP_VMX(pos,reg) \
 	li	reg,pos; \
 	lvx	v20,reg,sp; \
@@ -93,7 +95,7 @@
 
 # Should be safe from C, only touches r4, r5 and v0,v1,v2
 FUNC_START(check_vmx)
-	PUSH_BASIC_STACK(16)
+	PUSH_BASIC_STACK(32)
 	mr r4,r3
 	li	r3,1 # assume a bad result
 	li	r5,0
@@ -162,7 +164,7 @@
 	cmpdi	r0,0xffffffffffffffff
 	bne	1f
 	li	r3,0
-1:	POP_BASIC_STACK(16)
+1:	POP_BASIC_STACK(32)
 	blr
 FUNC_END(check_vmx)
 
@@ -193,3 +195,41 @@
 	POP_BASIC_STACK(512)
 	blr
 FUNC_END(test_vmx)
+
+# int preempt_vmx(vector int *varray, int *threads_starting, int *running)
+# On starting, (atomically) decrements threads_starting as a signal that
+# the VMX registers have been loaded with varray. Proceeds to check the
+# validity of the VMX registers while *running is not zero.
+FUNC_START(preempt_vmx)
+	PUSH_BASIC_STACK(512)
+	std r3,STACK_FRAME_PARAM(0)(sp) # vector int *varray
+	std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting
+	std r5,STACK_FRAME_PARAM(2)(sp) # int *running
+	# VMX loads/stores require 16 byte aligned addresses, skip STACK_FRAME_LOCAL(3,0)
+	PUSH_VMX(STACK_FRAME_LOCAL(4,0),r4)
+
+	bl load_vmx
+	nop
+
+	sync
+	# Atomic DEC
+	ld r3,STACK_FRAME_PARAM(1)(sp)
+1:	lwarx r4,0,r3
+	addi r4,r4,-1
+	stwcx. r4,0,r3
+	bne- 1b
+
+2:	ld r3,STACK_FRAME_PARAM(0)(sp)
+	bl check_vmx
+	nop
+	cmpdi r3,0
+	bne 3f
+	ld r4,STACK_FRAME_PARAM(2)(sp)
+	ld r5,0(r4)
+	cmpwi r5,0
+	bne 2b
+
+3:	POP_VMX(STACK_FRAME_LOCAL(4,0),r4)
+	POP_BASIC_STACK(512)
+	blr
+FUNC_END(preempt_vmx)