[AVR32] faster avr32 unaligned access

Use a more conventional implementation for unaligned access, and include
an AT32AP-specific optimization: that CPU handles unaligned words in
hardware, so word-sized accesses can use plain loads and stores.

The result is always faster, and it is also smaller for 8-, 16-, and
32-bit values. For 64-bit quantities the generated code is presumably
larger; hence the REVISIT comment about using memmove() there.
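
As an illustration (not part of this patch; read_field()/write_field()
and the buffer layout are made up), a driver touching a potentially
misaligned 32-bit field should now compile down to single word accesses
on AP cores:

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/* hypothetical: read a 32-bit field at an odd offset */
	static u32 read_field(const u8 *buf)
	{
		/* sizeof(*(ptr)) == 4, so this becomes a plain word
		 * load; the AP core copes with the misalignment */
		return get_unaligned((const u32 *)(buf + 1));
	}

	/* hypothetical: write the field back */
	static void write_field(u8 *buf, u32 val)
	{
		/* likewise one word store; 16- and 64-bit sizes still
		 * go through the asm-generic helpers */
		put_unaligned(val, (u32 *)(buf + 1));
	}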

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
diff --git a/include/asm-avr32/unaligned.h b/include/asm-avr32/unaligned.h
index 3042723..7913617 100644
--- a/include/asm-avr32/unaligned.h
+++ b/include/asm-avr32/unaligned.h
@@ -6,20 +6,31 @@
  * implementation.  The AVR32 AP implementation can handle unaligned
  * words, but halfwords must be halfword-aligned, and doublewords must
  * be word-aligned.
- *
- * TODO: Make all this CPU-specific and optimize.
  */
 
-#include <linux/string.h>
+#include <asm-generic/unaligned.h>
 
-/* Use memmove here, so gcc does not insert a __builtin_memcpy. */
+#ifdef CONFIG_CPU_AT32AP7000
 
+/* REVISIT calling memmove() may be smaller for 64-bit values ... */
+
+#undef get_unaligned
 #define get_unaligned(ptr) \
-  ({ __typeof__(*(ptr)) __tmp; memmove(&__tmp, (ptr), sizeof(*(ptr))); __tmp; })
+	___get_unaligned(ptr, sizeof(*(ptr)))
+#define ___get_unaligned(ptr, size) \
+	((size == 4) ? *(ptr) : __get_unaligned(ptr, size))
 
-#define put_unaligned(val, ptr)				\
-  ({ __typeof__(*(ptr)) __tmp = (val);			\
-     memmove((ptr), &__tmp, sizeof(*(ptr)));		\
-     (void)0; })
+#undef put_unaligned
+#define put_unaligned(val, ptr) \
+	___put_unaligned((__u64)(val), ptr, sizeof(*(ptr)))
+#define ___put_unaligned(val, ptr, size)		\
+do {							\
+	if (size == 4)					\
+		*(ptr) = (val);				\
+	else						\
+		__put_unaligned(val, ptr, size);	\
+} while (0)
+
+#endif
 
 #endif /* __ASM_AVR32_UNALIGNED_H */
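
For reference, the compile-time dispatch on AT32AP7000 works out roughly
as follows (an illustrative sketch, not literal preprocessor output;
sketch() and the pointer names are made up):

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/* illustrative only: how the three sizes dispatch */
	static void sketch(const u8 *buf)
	{
		const u16 *p16 = (const u16 *)(buf + 1);
		const u32 *p32 = (const u32 *)(buf + 1);
		const u64 *p64 = (const u64 *)(buf + 1);

		u16 h = get_unaligned(p16);	/* size != 4: asm-generic
						 * __get_unaligned() */
		u32 w = get_unaligned(p32);	/* size == 4: direct *(p32),
						 * one unaligned word load */
		u64 d = get_unaligned(p64);	/* size != 4: generic path;
						 * hence the 64-bit REVISIT */
		(void)h; (void)w; (void)d;
	}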