Sync-patch with libwebp (ChangeId: Ia5475247)

Added 16-bit swapping for the RGB565 / RGB4444 colorspaces.
Added ARM/NEON code for decoder/encoder modules.
Speedup in WebP compression (method 3 and above).

Change-Id: I95a697338bef7c3ea08054eb5f850a97d1889eb9
diff --git a/src/dsp/dec.c b/src/dsp/dec.c
index 9ae7b6f..758c6a5 100644
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -426,11 +426,16 @@
 }
 
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   int j;
+#ifndef WEBP_REFERENCE_IMPLEMENTATION
+  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
   for (j = 0; j < 8; ++j) {
     *(uint64_t*)(dst + j * BPS) = v;
   }
+#else
+  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
+#endif
 }
 
 static void DC8uv(uint8_t *dst) {     // DC
@@ -439,7 +444,7 @@
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS] + dst[-1 + i * BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }
 
 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@@ -448,7 +453,7 @@
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 
 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@@ -457,11 +462,11 @@
   for (i = 0; i < 8; ++i) {
     dc0 += dst[-1 + i * BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 
 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);
 }
 
 //------------------------------------------------------------------------------