Improve ARM disassembler to cope with JNI stubs.

Also decode r9-relative loads/stores, and add the Thread::state_ offset.

Also drop the implicit 's' suffix from the test instructions (tst, teq,
cmp, cmn), and don't show the destination register for those
instructions, since they don't use it.

Examples:

            0x60cdd718: e599c060    ldr     r12, [r9, #96]  ; top_sirt_

            0x60cdd754: e589c06c    str     r12, [r9, #108]  ; state_

            0x60cdd760: e35c0000    cmp     r12, #0

            0x60cdd7c4: 1a00000b    bne     44 (0x60cdd7f8)

            0x60cdd814: e1200070    bkpt    #0

Change-Id: I4afa9f47267daefded46211d62718fd7fb87cf97
diff --git a/src/thread.cc b/src/thread.cc
index 0a44b53..a4194b1 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -1461,13 +1461,14 @@
 
 void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers) {
   CHECK_EQ(size_of_pointers, 4U); // TODO: support 64-bit targets.
-#define DO_THREAD_OFFSET(x) if (offset == static_cast<uint32_t>(OFFSETOF_MEMBER(Thread, x))) { os << # x; } else
+#define DO_THREAD_OFFSET(x) if (offset == static_cast<uint32_t>(OFFSETOF_VOLATILE_MEMBER(Thread, x))) { os << # x; } else
 #define DO_THREAD_ENTRY_POINT_OFFSET(x) if (offset == ENTRYPOINT_OFFSET(x)) { os << # x; } else
   DO_THREAD_OFFSET(card_table_)
   DO_THREAD_OFFSET(exception_)
   DO_THREAD_OFFSET(jni_env_)
   DO_THREAD_OFFSET(self_)
   DO_THREAD_OFFSET(stack_end_)
+  DO_THREAD_OFFSET(state_)
   DO_THREAD_OFFSET(suspend_count_)
   DO_THREAD_OFFSET(thin_lock_id_)
   DO_THREAD_OFFSET(top_of_managed_stack_)