Fix select decoding of e.g. a 32-bit ppc process traced by a 64-bit strace.

Added next_set_bit() function which finds the next set bit,
properly taking into account word size of the traced process.
Use it in decode_select() instead of fd_isset().
Also, properly round fdsize up to word size of traced process,
not to strace's word size.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
diff --git a/defs.h b/defs.h
index f5dd6d5..865c83d 100644
--- a/defs.h
+++ b/defs.h
@@ -648,6 +648,7 @@
 
 extern int string_to_uint(const char *str);
 extern int string_quote(const char *, char *, long, int);
+extern int next_set_bit(const void *bit_array, unsigned cur_bit, unsigned size_bits);
 
 /* a refers to the lower numbered u_arg,
  * b refers to the higher numbered u_arg
diff --git a/desc.c b/desc.c
index 3957226..04f1164 100644
--- a/desc.c
+++ b/desc.c
@@ -477,22 +477,6 @@
 }
 #endif
 
-/* FD_ISSET from libc would abort for large fd if built with
- * debug flags/library hacks which enforce array bound checks
- * (fd_set contains a fixed-size array of longs).
- * We need to use a homegrown replacement.
- */
-static inline int
-fd_isset(unsigned fd, fd_set *fds)
-{
-	/* Using unsigned types to avoid signed divisions and shifts,
-	 * which are slow(er) on many CPUs.
-	 */
-	const unsigned bpl = 8 * sizeof(long);
-	unsigned long *s = (unsigned long *) fds;
-	return s[fd / bpl] & (1UL << (fd % bpl));
-}
-
 static int
 decode_select(struct tcb *tcp, long *args, enum bitness_t bitness)
 {
@@ -518,7 +502,7 @@
 	 * We had bugs a-la "while (j < args[0])" and "umoven(args[0])" below.
 	 * Instead of args[0], use nfds for fd count, fdsize for array lengths.
 	 */
-	fdsize = (((nfds + 7) / 8) + sizeof(long)-1) & -sizeof(long);
+	fdsize = (((nfds + 7) / 8) + current_wordsize-1) & -current_wordsize;
 
 	if (entering(tcp)) {
 		tprintf("%d", (int) args[0]);
@@ -543,12 +527,13 @@
 				continue;
 			}
 			tprints(", [");
-			for (j = 0, sep = ""; j < nfds; j++) {
-				if (fd_isset(j, fds)) {
-					tprints(sep);
-					printfd(tcp, j);
-					sep = " ";
-				}
+			for (j = 0, sep = "";; j++) {
+				j = next_set_bit(fds, j, nfds);
+				if (j < 0)
+					break;
+				tprints(sep);
+				printfd(tcp, j);
+				sep = " ";
 			}
 			tprints("]");
 		}
@@ -583,26 +568,27 @@
 			arg = args[i+1];
 			if (!arg || umoven(tcp, arg, fdsize, (char *) fds) < 0)
 				continue;
-			for (j = 0; j < nfds; j++) {
-				if (fd_isset(j, fds)) {
-					/* +2 chars needed at the end: ']',NUL */
-					if (outptr < end_outstr - (sizeof(", except [") + sizeof(int)*3 + 2)) {
-						if (first) {
-							outptr += sprintf(outptr, "%s%s [%u",
-								sep,
-								i == 0 ? "in" : i == 1 ? "out" : "except",
-								j
-							);
-							first = 0;
-							sep = ", ";
-						}
-						else {
-							outptr += sprintf(outptr, " %u", j);
-						}
+			for (j = 0;; j++) {
+				j = next_set_bit(fds, j, nfds);
+				if (j < 0)
+					break;
+				/* +2 chars needed at the end: ']',NUL */
+				if (outptr < end_outstr - (sizeof(", except [") + sizeof(int)*3 + 2)) {
+					if (first) {
+						outptr += sprintf(outptr, "%s%s [%u",
+							sep,
+							i == 0 ? "in" : i == 1 ? "out" : "except",
+							j
+						);
+						first = 0;
+						sep = ", ";
 					}
-					if (--ready_fds == 0)
-						break;
+					else {
+						outptr += sprintf(outptr, " %u", j);
+					}
 				}
+				if (--ready_fds == 0)
+					break;
 			}
 			if (outptr != outstr)
 				*outptr++ = ']';
diff --git a/util.c b/util.c
index 30a7f19..47c8734 100644
--- a/util.c
+++ b/util.c
@@ -160,6 +160,53 @@
 }
 #endif
 
+/* Find the next set bit in bit_array, testing from cur_bit (inclusive).
+ * Returns its index, or -1 if no bit in [cur_bit, size_bits) is set.
+ * Within each byte, bit 0 is the least significant bit.
+ *
+ * We never touch bytes we don't need to.
+ * On big-endian, array is assumed to consist of
+ * current_wordsize wide words: for example, if current_wordsize is 4,
+ * the bytes are walked in 3,2,1,0, 7,6,5,4, 11,10,9,8 ... sequence.
+ * On little-endian machines, word size is immaterial.
+ */
+int
+next_set_bit(const void *bit_array, unsigned cur_bit, unsigned size_bits)
+{
+	const unsigned endian = 1;
+	int little_endian = *(char*)&endian; /* first byte of 1 is nonzero only if this host is little-endian */
+
+	const uint8_t *array = bit_array;
+	unsigned pos = cur_bit / 8; /* logical byte index containing cur_bit */
+	unsigned pos_xor_mask = little_endian ? 0 : current_wordsize-1; /* XOR reverses byte order within a word; assumes power-of-two word size */
+
+	for (;;) {
+		uint8_t bitmask;
+		uint8_t cur_byte;
+
+		if (cur_bit >= size_bits) /* checked before the read, so no byte past the range is touched */
+			return -1;
+		cur_byte = array[pos ^ pos_xor_mask];
+		if (cur_byte == 0) { /* nothing set here: skip the whole byte */
+			cur_bit = (cur_bit + 8) & (-8); /* round up to the first bit of the next byte */
+			pos++;
+			continue;
+		}
+		bitmask = 1 << (cur_bit & 7); /* start at cur_bit's position within the byte */
+		for (;;) {
+			if (cur_byte & bitmask)
+				return cur_bit;
+			cur_bit++;
+			if (cur_bit >= size_bits)
+				return -1;
+			bitmask <<= 1;
+			/* This check *can't be* optimized out: uint8_t bitmask wraps to 0 after bit 7 */
+			if (bitmask == 0)
+				break;
+		}
+		pos++; /* remaining bits of this byte were clear; move to the next byte */
+	}
+}
 /*
  * Print entry in struct xlat table, if there.
  */