fix out-of-bounds array access in pthread barriers on 64-bit

on 32-bit, pointer slot 3 (_b_inst) now overlaps integer slot 3
(_b_count); this is ok because process-local barriers use only
integer slots 0-2.
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index 5c1b206..2348b5d 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -83,7 +83,7 @@
 #define _b_limit __u.__i[2]
 #define _b_count __u.__i[3]
 #define _b_waiters2 __u.__i[4]
-#define _b_inst __u.__p[4]
+#define _b_inst __u.__p[3]
 
 #include "pthread_arch.h"