Update <qemu/bswap.h> and <exec/cpu-all.h>

This is a small update to merge upstream changes to cpu load/store
macro operations.

Change-Id: I0deda5923ed5efc1f0b412701a59a40a86015d55
diff --git a/exec.c b/exec.c
index ba53ad0..8eb030e 100644
--- a/exec.c
+++ b/exec.c
@@ -165,10 +165,10 @@
 #define L1_SIZE (1 << L1_BITS)
 #define L2_SIZE (1 << L2_BITS)
 
-unsigned long qemu_real_host_page_size;
-unsigned long qemu_host_page_bits;
-unsigned long qemu_host_page_size;
-unsigned long qemu_host_page_mask;
+uintptr_t qemu_real_host_page_size;
+uintptr_t qemu_host_page_bits;
+uintptr_t qemu_host_page_size;
+uintptr_t qemu_host_page_mask;
 
 /* XXX: for system emulation, it could just be an array */
 static PageDesc *l1_map[L1_SIZE];
diff --git a/hw/block/cdrom.c b/hw/block/cdrom.c
index c457354..b938234 100644
--- a/hw/block/cdrom.c
+++ b/hw/block/cdrom.c
@@ -59,7 +59,7 @@
             q += 3;
         } else {
             /* sector 0 */
-            cpu_to_be32wu((uint32_t *)q, 0);
+            stl_be_p(q, 0);
             q += 4;
         }
     }
@@ -73,11 +73,11 @@
         lba_to_msf(q, nb_sectors);
         q += 3;
     } else {
-        cpu_to_be32wu((uint32_t *)q, nb_sectors);
+        stl_be_p(q, nb_sectors);
         q += 4;
     }
     len = q - buf;
-    cpu_to_be16wu((uint16_t *)buf, len - 2);
+    stw_be_p(buf, len - 2);
     return len;
 }
 
@@ -127,7 +127,7 @@
         lba_to_msf(q, nb_sectors);
         q += 3;
     } else {
-        cpu_to_be32wu((uint32_t *)q, nb_sectors);
+        stl_be_p(q, nb_sectors);
         q += 4;
     }
 
@@ -150,6 +150,6 @@
     }
 
     len = q - buf;
-    cpu_to_be16wu((uint16_t *)buf, len - 2);
+    stw_be_p(buf, len - 2);
     return len;
 }
diff --git a/hw/net/ne2000.c b/hw/net/ne2000.c
index 69df0bf..98b15c1 100644
--- a/hw/net/ne2000.c
+++ b/hw/net/ne2000.c
@@ -514,7 +514,7 @@
     addr &= ~1; /* XXX: check exact behaviour if not even */
     if (addr < 32 ||
         (addr >= NE2000_PMEM_START && addr < NE2000_MEM_SIZE)) {
-        cpu_to_le32wu((uint32_t *)(s->mem + addr), val);
+        stl_le_p(s->mem + addr, val);
     }
 }
 
@@ -544,7 +544,7 @@
     addr &= ~1; /* XXX: check exact behaviour if not even */
     if (addr < 32 ||
         (addr >= NE2000_PMEM_START && addr < NE2000_MEM_SIZE)) {
-        return le32_to_cpupu((uint32_t *)(s->mem + addr));
+        return ldl_le_p(s->mem + addr);
     } else {
         return 0xffffffff;
     }
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index a922ebb..047046e 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -35,8 +35,6 @@
  * TARGET_WORDS_BIGENDIAN : same for target cpu
  */
 
-#include "fpu/softfloat.h"
-
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
 #define BSWAP_NEEDED
 #endif
@@ -114,60 +112,6 @@
 #define bswaptls(s) bswap64s(s)
 #endif
 
-typedef union {
-    float32 f;
-    uint32_t l;
-} CPU_FloatU;
-
-/* NOTE: arm FPA is horrible as double 32 bit words are stored in big
-   endian ! */
-typedef union {
-    float64 d;
-#if defined(HOST_WORDS_BIGENDIAN) \
-    || (defined(__arm__) && !defined(__VFP_FP__) && !defined(CONFIG_SOFTFLOAT))
-    struct {
-        uint32_t upper;
-        uint32_t lower;
-    } l;
-#else
-    struct {
-        uint32_t lower;
-        uint32_t upper;
-    } l;
-#endif
-    uint64_t ll;
-} CPU_DoubleU;
-
-#ifdef TARGET_SPARC
-typedef union {
-    float128 q;
-#if defined(HOST_WORDS_BIGENDIAN) \
-    || (defined(__arm__) && !defined(__VFP_FP__) && !defined(CONFIG_SOFTFLOAT))
-    struct {
-        uint32_t upmost;
-        uint32_t upper;
-        uint32_t lower;
-        uint32_t lowest;
-    } l;
-    struct {
-        uint64_t upper;
-        uint64_t lower;
-    } ll;
-#else
-    struct {
-        uint32_t lowest;
-        uint32_t lower;
-        uint32_t upper;
-        uint32_t upmost;
-    } l;
-    struct {
-        uint64_t lower;
-        uint64_t upper;
-    } ll;
-#endif
-} CPU_QuadU;
-#endif
-
 /* CPU memory access without any memory or io remapping */
 
 /*
@@ -203,392 +147,8 @@
  *   user   : user mode access using soft MMU
  *   kernel : kernel mode access using soft MMU
  */
-static inline int ldub_p(const void *ptr)
-{
-    return *(uint8_t *)ptr;
-}
 
-static inline int ldsb_p(const void *ptr)
-{
-    return *(int8_t *)ptr;
-}
-
-static inline void stb_p(void *ptr, int v)
-{
-    *(uint8_t *)ptr = v;
-}
-
-/* NOTE: on arm, putting 2 in /proc/sys/debug/alignment so that the
-   kernel handles unaligned load/stores may give better results, but
-   it is a system wide setting : bad */
-#if defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED)
-
-/* conservative code for little endian unaligned accesses */
-static inline int lduw_le_p(const void *ptr)
-{
-#ifdef _ARCH_PPC
-    int val;
-    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
-    return val;
-#else
-    const uint8_t *p = ptr;
-    return p[0] | (p[1] << 8);
-#endif
-}
-
-static inline int ldsw_le_p(const void *ptr)
-{
-#ifdef _ARCH_PPC
-    int val;
-    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
-    return (int16_t)val;
-#else
-    const uint8_t *p = ptr;
-    return (int16_t)(p[0] | (p[1] << 8));
-#endif
-}
-
-static inline int ldl_le_p(const void *ptr)
-{
-#ifdef _ARCH_PPC
-    int val;
-    __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (val) : "r" (ptr));
-    return val;
-#else
-    const uint8_t *p = ptr;
-    return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
-#endif
-}
-
-static inline uint64_t ldq_le_p(const void *ptr)
-{
-    const uint8_t *p = ptr;
-    uint32_t v1, v2;
-    v1 = ldl_le_p(p);
-    v2 = ldl_le_p(p + 4);
-    return v1 | ((uint64_t)v2 << 32);
-}
-
-static inline void stw_le_p(void *ptr, int v)
-{
-#ifdef _ARCH_PPC
-    __asm__ __volatile__ ("sthbrx %1,0,%2" : "=m" (*(uint16_t *)ptr) : "r" (v), "r" (ptr));
-#else
-    uint8_t *p = ptr;
-    p[0] = v;
-    p[1] = v >> 8;
-#endif
-}
-
-static inline void stl_le_p(void *ptr, int v)
-{
-#ifdef _ARCH_PPC
-    __asm__ __volatile__ ("stwbrx %1,0,%2" : "=m" (*(uint32_t *)ptr) : "r" (v), "r" (ptr));
-#else
-    uint8_t *p = ptr;
-    p[0] = v;
-    p[1] = v >> 8;
-    p[2] = v >> 16;
-    p[3] = v >> 24;
-#endif
-}
-
-static inline void stq_le_p(void *ptr, uint64_t v)
-{
-    uint8_t *p = ptr;
-    stl_le_p(p, (uint32_t)v);
-    stl_le_p(p + 4, v >> 32);
-}
-
-/* float access */
-
-static inline float32 ldfl_le_p(const void *ptr)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.i = ldl_le_p(ptr);
-    return u.f;
-}
-
-static inline void stfl_le_p(void *ptr, float32 v)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.f = v;
-    stl_le_p(ptr, u.i);
-}
-
-static inline float64 ldfq_le_p(const void *ptr)
-{
-    CPU_DoubleU u;
-    u.l.lower = ldl_le_p(ptr);
-    u.l.upper = ldl_le_p(ptr + 4);
-    return u.d;
-}
-
-static inline void stfq_le_p(void *ptr, float64 v)
-{
-    CPU_DoubleU u;
-    u.d = v;
-    stl_le_p(ptr, u.l.lower);
-    stl_le_p(ptr + 4, u.l.upper);
-}
-
-#else
-
-static inline int lduw_le_p(const void *ptr)
-{
-    return *(uint16_t *)ptr;
-}
-
-static inline int ldsw_le_p(const void *ptr)
-{
-    return *(int16_t *)ptr;
-}
-
-static inline int ldl_le_p(const void *ptr)
-{
-    return *(uint32_t *)ptr;
-}
-
-static inline uint64_t ldq_le_p(const void *ptr)
-{
-    return *(uint64_t *)ptr;
-}
-
-static inline void stw_le_p(void *ptr, int v)
-{
-    *(uint16_t *)ptr = v;
-}
-
-static inline void stl_le_p(void *ptr, int v)
-{
-    *(uint32_t *)ptr = v;
-}
-
-static inline void stq_le_p(void *ptr, uint64_t v)
-{
-    *(uint64_t *)ptr = v;
-}
-
-/* float access */
-
-static inline float32 ldfl_le_p(const void *ptr)
-{
-    return *(float32 *)ptr;
-}
-
-static inline float64 ldfq_le_p(const void *ptr)
-{
-    return *(float64 *)ptr;
-}
-
-static inline void stfl_le_p(void *ptr, float32 v)
-{
-    *(float32 *)ptr = v;
-}
-
-static inline void stfq_le_p(void *ptr, float64 v)
-{
-    *(float64 *)ptr = v;
-}
-#endif
-
-#if !defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED)
-
-static inline int lduw_be_p(const void *ptr)
-{
-#if defined(__i386__)
-    int val;
-    asm volatile ("movzwl %1, %0\n"
-                  "xchgb %b0, %h0\n"
-                  : "=q" (val)
-                  : "m" (*(uint16_t *)ptr));
-    return val;
-#else
-    const uint8_t *b = ptr;
-    return ((b[0] << 8) | b[1]);
-#endif
-}
-
-static inline int ldsw_be_p(const void *ptr)
-{
-#if defined(__i386__)
-    int val;
-    asm volatile ("movzwl %1, %0\n"
-                  "xchgb %b0, %h0\n"
-                  : "=q" (val)
-                  : "m" (*(uint16_t *)ptr));
-    return (int16_t)val;
-#else
-    const uint8_t *b = ptr;
-    return (int16_t)((b[0] << 8) | b[1]);
-#endif
-}
-
-static inline int ldl_be_p(const void *ptr)
-{
-#if defined(__i386__) || defined(__x86_64__)
-    int val;
-    asm volatile ("movl %1, %0\n"
-                  "bswap %0\n"
-                  : "=r" (val)
-                  : "m" (*(uint32_t *)ptr));
-    return val;
-#else
-    const uint8_t *b = ptr;
-    return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-#endif
-}
-
-static inline uint64_t ldq_be_p(const void *ptr)
-{
-    uint32_t a,b;
-    a = ldl_be_p(ptr);
-    b = ldl_be_p((uint8_t *)ptr + 4);
-    return (((uint64_t)a<<32)|b);
-}
-
-static inline void stw_be_p(void *ptr, int v)
-{
-#if defined(__i386__)
-    asm volatile ("xchgb %b0, %h0\n"
-                  "movw %w0, %1\n"
-                  : "=q" (v)
-                  : "m" (*(uint16_t *)ptr), "0" (v));
-#else
-    uint8_t *d = (uint8_t *) ptr;
-    d[0] = v >> 8;
-    d[1] = v;
-#endif
-}
-
-static inline void stl_be_p(void *ptr, int v)
-{
-#if defined(__i386__) || defined(__x86_64__)
-    asm volatile ("bswap %0\n"
-                  "movl %0, %1\n"
-                  : "=r" (v)
-                  : "m" (*(uint32_t *)ptr), "0" (v));
-#else
-    uint8_t *d = (uint8_t *) ptr;
-    d[0] = v >> 24;
-    d[1] = v >> 16;
-    d[2] = v >> 8;
-    d[3] = v;
-#endif
-}
-
-static inline void stq_be_p(void *ptr, uint64_t v)
-{
-    stl_be_p(ptr, v >> 32);
-    stl_be_p((uint8_t *)ptr + 4, v);
-}
-
-/* float access */
-
-static inline float32 ldfl_be_p(const void *ptr)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.i = ldl_be_p(ptr);
-    return u.f;
-}
-
-static inline void stfl_be_p(void *ptr, float32 v)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.f = v;
-    stl_be_p(ptr, u.i);
-}
-
-static inline float64 ldfq_be_p(const void *ptr)
-{
-    CPU_DoubleU u;
-    u.l.upper = ldl_be_p(ptr);
-    u.l.lower = ldl_be_p((uint8_t *)ptr + 4);
-    return u.d;
-}
-
-static inline void stfq_be_p(void *ptr, float64 v)
-{
-    CPU_DoubleU u;
-    u.d = v;
-    stl_be_p(ptr, u.l.upper);
-    stl_be_p((uint8_t *)ptr + 4, u.l.lower);
-}
-
-#else
-
-static inline int lduw_be_p(const void *ptr)
-{
-    return *(uint16_t *)ptr;
-}
-
-static inline int ldsw_be_p(const void *ptr)
-{
-    return *(int16_t *)ptr;
-}
-
-static inline int ldl_be_p(const void *ptr)
-{
-    return *(uint32_t *)ptr;
-}
-
-static inline uint64_t ldq_be_p(const void *ptr)
-{
-    return *(uint64_t *)ptr;
-}
-
-static inline void stw_be_p(void *ptr, int v)
-{
-    *(uint16_t *)ptr = v;
-}
-
-static inline void stl_be_p(void *ptr, int v)
-{
-    *(uint32_t *)ptr = v;
-}
-
-static inline void stq_be_p(void *ptr, uint64_t v)
-{
-    *(uint64_t *)ptr = v;
-}
-
-/* float access */
-
-static inline float32 ldfl_be_p(const void *ptr)
-{
-    return *(float32 *)ptr;
-}
-
-static inline float64 ldfq_be_p(const void *ptr)
-{
-    return *(float64 *)ptr;
-}
-
-static inline void stfl_be_p(void *ptr, float32 v)
-{
-    *(float32 *)ptr = v;
-}
-
-static inline void stfq_be_p(void *ptr, float64 v)
-{
-    *(float64 *)ptr = v;
-}
-
-#endif
-
-/* target CPU memory access functions */
+/* target-endianness CPU memory access functions */
 #if defined(TARGET_WORDS_BIGENDIAN)
 #define lduw_p(p) lduw_be_p(p)
 #define ldsw_p(p) ldsw_be_p(p)
@@ -636,22 +196,27 @@
 #endif
 
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
-#define g2h(x) ((void *)((unsigned long)(x) + GUEST_BASE))
+#define g2h(x) ((void *)((unsigned long)(target_ulong)(x) + GUEST_BASE))
 
 #if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS
 #define h2g_valid(x) 1
 #else
 #define h2g_valid(x) ({ \
     unsigned long __guest = (unsigned long)(x) - GUEST_BASE; \
-    __guest < (1ul << TARGET_VIRT_ADDR_SPACE_BITS); \
+    (__guest < (1ul << TARGET_VIRT_ADDR_SPACE_BITS)) && \
+    (!RESERVED_VA || (__guest < RESERVED_VA)); \
 })
 #endif
 
-#define h2g(x) ({ \
+#define h2g_nocheck(x) ({ \
     unsigned long __ret = (unsigned long)(x) - GUEST_BASE; \
+    (abi_ulong)__ret; \
+})
+
+#define h2g(x) ({ \
     /* Check if given address fits target address space */ \
     assert(h2g_valid(x)); \
-    (abi_ulong)__ret; \
+    h2g_nocheck(x); \
 })
 
 #define saddr(x) g2h(x)
@@ -660,8 +225,8 @@
 #else /* !CONFIG_USER_ONLY */
 /* NOTE: we use double casts if pointers and target_ulong have
    different sizes */
-#define saddr(x) (uint8_t *)(long)(x)
-#define laddr(x) (uint8_t *)(long)(x)
+#define saddr(x) (uint8_t *)(intptr_t)(x)
+#define laddr(x) (uint8_t *)(intptr_t)(x)
 #endif
 
 #define ldub_raw(p) ldub_p(laddr((p)))
@@ -698,12 +263,34 @@
 #define stfl(p, v) stfl_raw(p, v)
 #define stfq(p, v) stfq_raw(p, v)
 
-#define ldub_code(p) ldub_raw(p)
-#define ldsb_code(p) ldsb_raw(p)
-#define lduw_code(p) lduw_raw(p)
-#define ldsw_code(p) ldsw_raw(p)
-#define ldl_code(p) ldl_raw(p)
-#define ldq_code(p) ldq_raw(p)
+#define cpu_ldub_code(env1, p) ldub_raw(p)
+#define cpu_ldsb_code(env1, p) ldsb_raw(p)
+#define cpu_lduw_code(env1, p) lduw_raw(p)
+#define cpu_ldsw_code(env1, p) ldsw_raw(p)
+#define cpu_ldl_code(env1, p) ldl_raw(p)
+#define cpu_ldq_code(env1, p) ldq_raw(p)
+
+#define cpu_ldub_data(env, addr) ldub_raw(addr)
+#define cpu_lduw_data(env, addr) lduw_raw(addr)
+#define cpu_ldsw_data(env, addr) ldsw_raw(addr)
+#define cpu_ldl_data(env, addr) ldl_raw(addr)
+#define cpu_ldq_data(env, addr) ldq_raw(addr)
+
+#define cpu_stb_data(env, addr, data) stb_raw(addr, data)
+#define cpu_stw_data(env, addr, data) stw_raw(addr, data)
+#define cpu_stl_data(env, addr, data) stl_raw(addr, data)
+#define cpu_stq_data(env, addr, data) stq_raw(addr, data)
+
+#define cpu_ldub_kernel(env, addr) ldub_raw(addr)
+#define cpu_lduw_kernel(env, addr) lduw_raw(addr)
+#define cpu_ldsw_kernel(env, addr) ldsw_raw(addr)
+#define cpu_ldl_kernel(env, addr) ldl_raw(addr)
+#define cpu_ldq_kernel(env, addr) ldq_raw(addr)
+
+#define cpu_stb_kernel(env, addr, data) stb_raw(addr, data)
+#define cpu_stw_kernel(env, addr, data) stw_raw(addr, data)
+#define cpu_stl_kernel(env, addr, data) stl_raw(addr, data)
+#define cpu_stq_kernel(env, addr, data) stq_raw(addr, data)
 
 #define ldub_kernel(p) ldub_raw(p)
 #define ldsb_kernel(p) ldsb_raw(p)
@@ -720,6 +307,13 @@
 #define stfl_kernel(p, v) stfl_raw(p, v)
 #define stfq_kernel(p, vt) stfq_raw(p, v)
 
+#define cpu_ldub_data(env, addr) ldub_raw(addr)
+#define cpu_lduw_data(env, addr) lduw_raw(addr)
+#define cpu_ldl_data(env, addr) ldl_raw(addr)
+
+#define cpu_stb_data(env, addr, data) stb_raw(addr, data)
+#define cpu_stw_data(env, addr, data) stw_raw(addr, data)
+#define cpu_stl_data(env, addr, data) stl_raw(addr, data)
 #endif /* defined(CONFIG_USER_ONLY) */
 
 /* page related stuff */
@@ -728,11 +322,11 @@
 #define TARGET_PAGE_MASK ~(TARGET_PAGE_SIZE - 1)
 #define TARGET_PAGE_ALIGN(addr) (((addr) + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK)
 
-/* ??? These should be the larger of unsigned long and target_ulong.  */
-extern unsigned long qemu_real_host_page_size;
-extern unsigned long qemu_host_page_bits;
-extern unsigned long qemu_host_page_size;
-extern unsigned long qemu_host_page_mask;
+/* ??? These should be the larger of uintptr_t and target_ulong.  */
+extern uintptr_t qemu_real_host_page_size;
+extern uintptr_t qemu_host_page_bits;
+extern uintptr_t qemu_host_page_size;
+extern uintptr_t qemu_host_page_mask;
 
 #define HOST_PAGE_ALIGN(addr) (((addr) + qemu_host_page_size - 1) & qemu_host_page_mask)
 
@@ -815,15 +409,16 @@
 #define CPU_INTERRUPT_TGT_EXT_4   0x1000
 
 /* Several target-specific internal interrupts.  These differ from the
-   preceeding target-specific interrupts in that they are intended to
+   preceding target-specific interrupts in that they are intended to
    originate from within the cpu itself, typically in response to some
    instruction being executed.  These, therefore, are not masked while
    single-stepping within the debugger.  */
 #define CPU_INTERRUPT_TGT_INT_0   0x0100
 #define CPU_INTERRUPT_TGT_INT_1   0x0400
 #define CPU_INTERRUPT_TGT_INT_2   0x0800
+#define CPU_INTERRUPT_TGT_INT_3   0x2000
 
-/* First unused bit: 0x2000.  */
+/* First unused bit: 0x4000.  */
 
 /* The set of all bits that should be masked when single-stepping.  */
 #define CPU_INTERRUPT_SSTEP_MASK \
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 1aef3e3..67b3435 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -200,21 +200,69 @@
                         pci_map_irq_fn map_irq, const char *name);
 
 static inline void
+pci_set_byte(uint8_t *config, uint8_t val)
+{
+    *config = val;
+}
+
+static inline uint8_t
+pci_get_byte(const uint8_t *config)
+{
+    return *config;
+}
+
+static inline void
+pci_set_word(uint8_t *config, uint16_t val)
+{
+    stw_le_p(config, val);
+}
+
+static inline uint16_t
+pci_get_word(const uint8_t *config)
+{
+    return lduw_le_p(config);
+}
+
+static inline void
+pci_set_long(uint8_t *config, uint32_t val)
+{
+    stl_le_p(config, val);
+}
+
+static inline uint32_t
+pci_get_long(const uint8_t *config)
+{
+    return ldl_le_p(config);
+}
+
+static inline void
+pci_set_quad(uint8_t *config, uint64_t val)
+{
+    cpu_to_le64w((uint64_t *)config, val);
+}
+
+static inline uint64_t
+pci_get_quad(const uint8_t *config)
+{
+    return le64_to_cpup((const uint64_t *)config);
+}
+
+static inline void
 pci_config_set_vendor_id(uint8_t *pci_config, uint16_t val)
 {
-    cpu_to_le16wu((uint16_t *)&pci_config[PCI_VENDOR_ID], val);
+    pci_set_word(&pci_config[PCI_VENDOR_ID], val);
 }
 
 static inline void
 pci_config_set_device_id(uint8_t *pci_config, uint16_t val)
 {
-    cpu_to_le16wu((uint16_t *)&pci_config[PCI_DEVICE_ID], val);
+    pci_set_word(&pci_config[PCI_DEVICE_ID], val);
 }
 
 static inline void
 pci_config_set_class(uint8_t *pci_config, uint16_t val)
 {
-    cpu_to_le16wu((uint16_t *)&pci_config[PCI_CLASS_DEVICE], val);
+    pci_set_word(&pci_config[PCI_CLASS_DEVICE], val);
 }
 
 typedef void (*pci_qdev_initfn)(PCIDevice *dev);
diff --git a/include/qemu/bswap.h b/include/qemu/bswap.h
index 82a7951..437b8e0 100644
--- a/include/qemu/bswap.h
+++ b/include/qemu/bswap.h
@@ -2,52 +2,17 @@
 #define BSWAP_H
 
 #include "config-host.h"
-
 #include <inttypes.h>
+#include <limits.h>
+#include <string.h>
+#include "fpu/softfloat.h"
 
 #ifdef CONFIG_MACHINE_BSWAP_H
-#include <sys/endian.h>
-#include <sys/types.h>
-#include <machine/bswap.h>
-#else
-
-#ifdef CONFIG_BYTESWAP_H
-#include <byteswap.h>
-#else
-
-#define bswap_16(x) \
-({ \
-	uint16_t __x = (x); \
-	((uint16_t)( \
-		(((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \
-		(((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \
-})
-
-#define bswap_32(x) \
-({ \
-	uint32_t __x = (x); \
-	((uint32_t)( \
-		(((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \
-		(((uint32_t)(__x) & (uint32_t)0x0000ff00UL) <<  8) | \
-		(((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >>  8) | \
-		(((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \
-})
-
-#define bswap_64(x) \
-({ \
-	uint64_t __x = (x); \
-	((uint64_t)( \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) <<  8) | \
-	        (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >>  8) | \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \
-		(uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \
-})
-
-#endif /* !CONFIG_BYTESWAP_H */
+# include <sys/endian.h>
+# include <sys/types.h>
+# include <machine/bswap.h>
+#elif defined(CONFIG_BYTESWAP_H)
+# include <byteswap.h>
 
 static inline uint16_t bswap16(uint16_t x)
 {
@@ -63,7 +28,32 @@
 {
     return bswap_64(x);
 }
+# else
+static inline uint16_t bswap16(uint16_t x)
+{
+    return (((x & 0x00ff) << 8) |
+            ((x & 0xff00) >> 8));
+}
 
+static inline uint32_t bswap32(uint32_t x)
+{
+    return (((x & 0x000000ffU) << 24) |
+            ((x & 0x0000ff00U) <<  8) |
+            ((x & 0x00ff0000U) >>  8) |
+            ((x & 0xff000000U) >> 24));
+}
+
+static inline uint64_t bswap64(uint64_t x)
+{
+    return (((x & 0x00000000000000ffULL) << 56) |
+            ((x & 0x000000000000ff00ULL) << 40) |
+            ((x & 0x0000000000ff0000ULL) << 24) |
+            ((x & 0x00000000ff000000ULL) <<  8) |
+            ((x & 0x000000ff00000000ULL) >>  8) |
+            ((x & 0x0000ff0000000000ULL) >> 24) |
+            ((x & 0x00ff000000000000ULL) >> 40) |
+            ((x & 0xff00000000000000ULL) >> 56));
+}
 #endif /* ! CONFIG_MACHINE_BSWAP_H */
 
 static inline void bswap16s(uint16_t *s)
@@ -83,45 +73,45 @@
 
 #if defined(HOST_WORDS_BIGENDIAN)
 #define be_bswap(v, size) (v)
-#define le_bswap(v, size) bswap ## size(v)
+#define le_bswap(v, size) glue(bswap, size)(v)
 #define be_bswaps(v, size)
-#define le_bswaps(p, size) *p = bswap ## size(*p);
+#define le_bswaps(p, size) do { *p = glue(bswap, size)(*p); } while(0)
 #else
 #define le_bswap(v, size) (v)
-#define be_bswap(v, size) bswap ## size(v)
+#define be_bswap(v, size) glue(bswap, size)(v)
 #define le_bswaps(v, size)
-#define be_bswaps(p, size) *p = bswap ## size(*p);
+#define be_bswaps(p, size) do { *p = glue(bswap, size)(*p); } while(0)
 #endif
 
 #define CPU_CONVERT(endian, size, type)\
 static inline type endian ## size ## _to_cpu(type v)\
 {\
-    return endian ## _bswap(v, size);\
+    return glue(endian, _bswap)(v, size);\
 }\
 \
 static inline type cpu_to_ ## endian ## size(type v)\
 {\
-    return endian ## _bswap(v, size);\
+    return glue(endian, _bswap)(v, size);\
 }\
 \
 static inline void endian ## size ## _to_cpus(type *p)\
 {\
-    endian ## _bswaps(p, size)\
+    glue(endian, _bswaps)(p, size);\
 }\
 \
 static inline void cpu_to_ ## endian ## size ## s(type *p)\
 {\
-    endian ## _bswaps(p, size)\
+    glue(endian, _bswaps)(p, size);\
 }\
 \
 static inline type endian ## size ## _to_cpup(const type *p)\
 {\
-    return endian ## size ## _to_cpu(*p);\
+    return glue(glue(endian, size), _to_cpu)(*p);\
 }\
 \
 static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\
 {\
-     *p = cpu_to_ ## endian ## size(v);\
+    *p = glue(glue(cpu_to_, endian), size)(v);\
 }
 
 CPU_CONVERT(be, 16, uint16_t)
@@ -132,109 +122,310 @@
 CPU_CONVERT(le, 32, uint32_t)
 CPU_CONVERT(le, 64, uint64_t)
 
-/* unaligned versions (optimized for frequent unaligned accesses)*/
-
-#if defined(__i386__) || defined(_ARCH_PPC)
-
-#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
-#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
-#define le16_to_cpupu(p) le16_to_cpup(p)
-#define le32_to_cpupu(p) le32_to_cpup(p)
-#define be32_to_cpupu(p) be32_to_cpup(p)
-
-#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
-#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
-#define cpu_to_be64wu(p, v) cpu_to_be64w(p, v)
-
-#else
-
-static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
-{
-    uint8_t *p1 = (uint8_t *)p;
-
-    p1[0] = v & 0xff;
-    p1[1] = v >> 8;
-}
-
-static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
-{
-    uint8_t *p1 = (uint8_t *)p;
-
-    p1[0] = v & 0xff;
-    p1[1] = v >> 8;
-    p1[2] = v >> 16;
-    p1[3] = v >> 24;
-}
-
-static inline uint16_t le16_to_cpupu(const uint16_t *p)
-{
-    const uint8_t *p1 = (const uint8_t *)p;
-    return p1[0] | (p1[1] << 8);
-}
-
-static inline uint32_t le32_to_cpupu(const uint32_t *p)
-{
-    const uint8_t *p1 = (const uint8_t *)p;
-    return p1[0] | (p1[1] << 8) | (p1[2] << 16) | (p1[3] << 24);
-}
-
-static inline uint32_t be32_to_cpupu(const uint32_t *p)
-{
-    const uint8_t *p1 = (const uint8_t *)p;
-    return p1[3] | (p1[2] << 8) | (p1[1] << 16) | (p1[0] << 24);
-}
-
-static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
-{
-    uint8_t *p1 = (uint8_t *)p;
-
-    p1[0] = v >> 8;
-    p1[1] = v & 0xff;
-}
-
-static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
-{
-    uint8_t *p1 = (uint8_t *)p;
-
-    p1[0] = v >> 24;
-    p1[1] = v >> 16;
-    p1[2] = v >> 8;
-    p1[3] = v & 0xff;
-}
-
-static inline void cpu_to_be64wu(uint64_t *p, uint64_t v)
-{
-    uint8_t *p1 = (uint8_t *)p;
-
-    p1[0] = v >> 56;
-    p1[1] = v >> 48;
-    p1[2] = v >> 40;
-    p1[3] = v >> 32;
-    p1[4] = v >> 24;
-    p1[5] = v >> 16;
-    p1[6] = v >> 8;
-    p1[7] = v & 0xff;
-}
-
-#endif
-
-#ifdef HOST_WORDS_BIGENDIAN
-#define cpu_to_32wu cpu_to_be32wu
-#define leul_to_cpu(v) glue(glue(le,HOST_LONG_BITS),_to_cpu)(v)
-#else
-#define cpu_to_32wu cpu_to_le32wu
-#define leul_to_cpu(v) (v)
-#endif
-
-#undef le_bswap
-#undef be_bswap
-#undef le_bswaps
-#undef be_bswaps
-
 /* len must be one of 1, 2, 4 */
 static inline uint32_t qemu_bswap_len(uint32_t value, int len)
 {
     return bswap32(value) >> (32 - 8 * len);
 }
 
+/* Unions for reinterpreting between floats and integers.  */
+
+typedef union {
+    float32 f;
+    uint32_t l;
+} CPU_FloatU;
+
+typedef union {
+    float64 d;
+#if defined(HOST_WORDS_BIGENDIAN)
+    struct {
+        uint32_t upper;
+        uint32_t lower;
+    } l;
+#else
+    struct {
+        uint32_t lower;
+        uint32_t upper;
+    } l;
+#endif
+    uint64_t ll;
+} CPU_DoubleU;
+
+typedef union {
+     floatx80 d;
+     struct {
+         uint64_t lower;
+         uint16_t upper;
+     } l;
+} CPU_LDoubleU;
+
+typedef union {
+    float128 q;
+#if defined(HOST_WORDS_BIGENDIAN)
+    struct {
+        uint32_t upmost;
+        uint32_t upper;
+        uint32_t lower;
+        uint32_t lowest;
+    } l;
+    struct {
+        uint64_t upper;
+        uint64_t lower;
+    } ll;
+#else
+    struct {
+        uint32_t lowest;
+        uint32_t lower;
+        uint32_t upper;
+        uint32_t upmost;
+    } l;
+    struct {
+        uint64_t lower;
+        uint64_t upper;
+    } ll;
+#endif
+} CPU_QuadU;
+
+/* unaligned/endian-independent pointer access */
+
+/*
+ * the generic syntax is:
+ *
+ * load: ld{type}{sign}{size}{endian}_p(ptr)
+ *
+ * store: st{type}{size}{endian}_p(ptr, val)
+ *
+ * Note there are small differences with the softmmu access API!
+ *
+ * type is:
+ * (empty): integer access
+ *   f    : float access
+ *
+ * sign is:
+ * (empty): for floats or 32 bit size
+ *   u    : unsigned
+ *   s    : signed
+ *
+ * size is:
+ *   b: 8 bits
+ *   w: 16 bits
+ *   l: 32 bits
+ *   q: 64 bits
+ *
+ * endian is:
+ * (empty): host endian
+ *   be   : big endian
+ *   le   : little endian
+ */
+
+static inline int ldub_p(const void *ptr)
+{
+    return *(uint8_t *)ptr;
+}
+
+static inline int ldsb_p(const void *ptr)
+{
+    return *(int8_t *)ptr;
+}
+
+static inline void stb_p(void *ptr, int v)
+{
+    *(uint8_t *)ptr = v;
+}
+
+/* Any compiler worth its salt will turn these memcpy into native unaligned
+   operations.  Thus we don't need to play games with packed attributes, or
+   inline byte-by-byte stores.  */
+
+static inline int lduw_p(const void *ptr)
+{
+    uint16_t r;
+    memcpy(&r, ptr, sizeof(r));
+    return r;
+}
+
+static inline int ldsw_p(const void *ptr)
+{
+    int16_t r;
+    memcpy(&r, ptr, sizeof(r));
+    return r;
+}
+
+static inline void stw_p(void *ptr, uint16_t v)
+{
+    memcpy(ptr, &v, sizeof(v));
+}
+
+static inline int ldl_p(const void *ptr)
+{
+    int32_t r;
+    memcpy(&r, ptr, sizeof(r));
+    return r;
+}
+
+static inline void stl_p(void *ptr, uint32_t v)
+{
+    memcpy(ptr, &v, sizeof(v));
+}
+
+static inline uint64_t ldq_p(const void *ptr)
+{
+    uint64_t r;
+    memcpy(&r, ptr, sizeof(r));
+    return r;
+}
+
+static inline void stq_p(void *ptr, uint64_t v)
+{
+    memcpy(ptr, &v, sizeof(v));
+}
+
+static inline int lduw_le_p(const void *ptr)
+{
+    return (uint16_t)le_bswap(lduw_p(ptr), 16);
+}
+
+static inline int ldsw_le_p(const void *ptr)
+{
+    return (int16_t)le_bswap(lduw_p(ptr), 16);
+}
+
+static inline int ldl_le_p(const void *ptr)
+{
+    return le_bswap(ldl_p(ptr), 32);
+}
+
+static inline uint64_t ldq_le_p(const void *ptr)
+{
+    return le_bswap(ldq_p(ptr), 64);
+}
+
+static inline void stw_le_p(void *ptr, int v)
+{
+    stw_p(ptr, le_bswap(v, 16));
+}
+
+static inline void stl_le_p(void *ptr, int v)
+{
+    stl_p(ptr, le_bswap(v, 32));
+}
+
+static inline void stq_le_p(void *ptr, uint64_t v)
+{
+    stq_p(ptr, le_bswap(v, 64));
+}
+
+/* float access */
+
+static inline float32 ldfl_le_p(const void *ptr)
+{
+    CPU_FloatU u;
+    u.l = ldl_le_p(ptr);
+    return u.f;
+}
+
+static inline void stfl_le_p(void *ptr, float32 v)
+{
+    CPU_FloatU u;
+    u.f = v;
+    stl_le_p(ptr, u.l);
+}
+
+static inline float64 ldfq_le_p(const void *ptr)
+{
+    CPU_DoubleU u;
+    u.ll = ldq_le_p(ptr);
+    return u.d;
+}
+
+static inline void stfq_le_p(void *ptr, float64 v)
+{
+    CPU_DoubleU u;
+    u.d = v;
+    stq_le_p(ptr, u.ll);
+}
+
+static inline int lduw_be_p(const void *ptr)
+{
+    return (uint16_t)be_bswap(lduw_p(ptr), 16);
+}
+
+static inline int ldsw_be_p(const void *ptr)
+{
+    return (int16_t)be_bswap(lduw_p(ptr), 16);
+}
+
+static inline int ldl_be_p(const void *ptr)
+{
+    return be_bswap(ldl_p(ptr), 32);
+}
+
+static inline uint64_t ldq_be_p(const void *ptr)
+{
+    return be_bswap(ldq_p(ptr), 64);
+}
+
+static inline void stw_be_p(void *ptr, int v)
+{
+    stw_p(ptr, be_bswap(v, 16));
+}
+
+static inline void stl_be_p(void *ptr, int v)
+{
+    stl_p(ptr, be_bswap(v, 32));
+}
+
+static inline void stq_be_p(void *ptr, uint64_t v)
+{
+    stq_p(ptr, be_bswap(v, 64));
+}
+
+/* float access */
+
+static inline float32 ldfl_be_p(const void *ptr)
+{
+    CPU_FloatU u;
+    u.l = ldl_be_p(ptr);
+    return u.f;
+}
+
+static inline void stfl_be_p(void *ptr, float32 v)
+{
+    CPU_FloatU u;
+    u.f = v;
+    stl_be_p(ptr, u.l);
+}
+
+static inline float64 ldfq_be_p(const void *ptr)
+{
+    CPU_DoubleU u;
+    u.ll = ldq_be_p(ptr);
+    return u.d;
+}
+
+static inline void stfq_be_p(void *ptr, float64 v)
+{
+    CPU_DoubleU u;
+    u.d = v;
+    stq_be_p(ptr, u.ll);
+}
+
+static inline unsigned long leul_to_cpu(unsigned long v)
+{
+    /* In order to break an include loop between here and
+       qemu-common.h, don't rely on HOST_LONG_BITS.  */
+#if ULONG_MAX == UINT32_MAX
+    return le_bswap(v, 32);
+#elif ULONG_MAX == UINT64_MAX
+    return le_bswap(v, 64);
+#else
+# error Unknown sizeof long
+#endif
+}
+
+#undef le_bswap
+#undef be_bswap
+#undef le_bswaps
+#undef be_bswaps
+
 #endif /* BSWAP_H */