Merge tag android-5.1.0_r1 into AOSP_5.1_MERGE

Change-Id: Ic794e4e60c6823b6679d6b77bd9f96ef22738699
diff --git a/BUILD.gn b/BUILD.gn
index b9a3f3a..e41b993 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -77,6 +77,12 @@
     defines += [ "HAVE_JPEG" ]
   }
 
+  if (is_msan) {
+    # MemorySanitizer does not support assembly code yet.
+    # http://crbug.com/344505
+    defines += [ "LIBYUV_DISABLE_X86" ]
+  }
+
   deps = [
     "//third_party:jpeg",
   ]
diff --git a/DEPS b/DEPS
index e4da873..2fae824 100644
--- a/DEPS
+++ b/DEPS
@@ -14,7 +14,7 @@
   "chromium_trunk" : "http://src.chromium.org/svn/trunk",
   # chrome://version/ for revision of canary Chrome.
   # http://chromium-status.appspot.com/lkgr is a last known good revision.
-  "chromium_revision": "262938",
+  "chromium_revision": "280149",
 }
 
 # NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
@@ -29,6 +29,9 @@
   "build":
     Var("chromium_trunk") + "/src/build@" + Var("chromium_revision"),
 
+  "buildtools":
+    From("chromium_deps", "src/buildtools"),
+
   # Needed by common.gypi.
   "google_apis/build":
     Var("chromium_trunk") + "/src/google_apis/build@" + Var("chromium_revision"),
@@ -48,9 +51,18 @@
   "tools/gyp":
     From("chromium_deps", "src/tools/gyp"),
 
+  "tools/memory":
+    Var("chromium_trunk") + "/src/tools/memory@" + Var("chromium_revision"),
+
   "tools/python":
     Var("chromium_trunk") + "/src/tools/python@" + Var("chromium_revision"),
 
+  "tools/sanitizer_options":
+    File(Var("chromium_trunk") + "/src/base/debug/sanitizer_options.cc@" + Var("chromium_revision")),
+
+  "tools/tsan_suppressions":
+    File(Var("chromium_trunk") + "/src/base/debug/tsan_suppressions.cc@" + Var("chromium_revision")),
+
   "tools/valgrind":
     Var("chromium_trunk") + "/src/tools/valgrind@" + Var("chromium_revision"),
 
@@ -61,6 +73,18 @@
   "third_party/binutils":
     Var("chromium_trunk") + "/src/third_party/binutils@" + Var("chromium_revision"),
 
+  "third_party/libc++":
+    Var("chromium_trunk") + "/src/third_party/libc++@" + Var("chromium_revision"),
+
+  "third_party/libc++/trunk":
+    From("chromium_deps", "src/third_party/libc++/trunk"),
+
+  "third_party/libc++abi":
+    Var("chromium_trunk") + "/src/third_party/libc++abi@" + Var("chromium_revision"),
+
+  "third_party/libc++abi/trunk":
+    From("chromium_deps", "src/third_party/libc++abi/trunk"),
+
   "third_party/libjpeg_turbo":
     From("chromium_deps", "src/third_party/libjpeg_turbo"),
 
@@ -130,7 +154,7 @@
                 "--platform=win32",
                 "--no_auth",
                 "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/tools/gn/bin/win/gn.exe.sha1",
+                "-s", Var("root_dir") + "/buildtools/win/gn.exe.sha1",
     ],
   },
   {
@@ -141,7 +165,7 @@
                 "--platform=darwin",
                 "--no_auth",
                 "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/tools/gn/bin/mac/gn.sha1",
+                "-s", Var("root_dir") + "/buildtools/mac/gn.sha1",
     ],
   },
   {
@@ -152,7 +176,7 @@
                 "--platform=linux*",
                 "--no_auth",
                 "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/tools/gn/bin/linux/gn.sha1",
+                "-s", Var("root_dir") + "/buildtools/linux64/gn.sha1",
     ],
   },
   {
@@ -163,10 +187,17 @@
                 "--platform=linux*",
                 "--no_auth",
                 "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/tools/gn/bin/linux/gn32.sha1",
+                "-s", Var("root_dir") + "/buildtools/linux32/gn.sha1",
     ],
   },
   {
+    # Remove GN binaries from tools/gn/bin that aren't used anymore.
+    # TODO(kjellander) remove after the end of July, 2014.
+    "name": "remove_old_gn_binaries",
+    "pattern": ".",
+    "action": ["python", Var("root_dir") + "/tools/gn/bin/rm_binaries.py"],
+  },
+  {
     # Pull clang on mac. If nothing changed, or on non-mac platforms, this takes
     # zero seconds to run. If something changed, it downloads a prebuilt clang.
     "pattern": ".",
diff --git a/README.chromium b/README.chromium
index 78d39a6..2f9a041 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,9 +1,12 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1007
+Version: 1038
 License: BSD
 License File: LICENSE
 
 Description:
 libyuv is an open source project that includes
 YUV conversion and scaling functionality.
+
+Changes:
+Patch r1067 into r1038 to disable mips dsp2 code on ISA r6.
diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index f0343a7..90f43af 100644
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -25,24 +25,22 @@
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);
 
-// Convert ARGB To BGRA. (alias)
-#define ARGBToBGRA BGRAToARGB
+// Convert ARGB To BGRA.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_bgra, int dst_stride_bgra,
                int width, int height);
 
-// Convert ARGB To ABGR. (alias)
-#define ARGBToABGR ABGRToARGB
+// Convert ARGB To ABGR.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
                int width, int height);
 
 // Convert ARGB To RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
                int width, int height);
 
 // Convert ARGB To RGB24.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7a54e4a..99728cf 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -15,6 +15,10 @@
 
 #include "libyuv/basic_types.h"
 
+#if defined(__native_client__)
+#include "ppapi/c/pp_macros.h"  // For PPAPI_RELEASE
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -38,7 +42,8 @@
   var = 0
 
 #if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+    defined(TARGET_IPHONE_SIMULATOR) || \
+    (defined(_MSC_VER) && defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -47,7 +52,12 @@
 #endif
 
 // Enable for NaCL pepper 33 for bundle and AVX2 support.
-//  #define NEW_BINUTILS
+#if defined(__native_client__) && PPAPI_RELEASE >= 33
+#define NEW_BINUTILS
+#endif
+#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+#define LIBYUV_DISABLE_NEON
+#endif
 
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
@@ -152,6 +162,11 @@
 #define HAS_YUY2TOYROW_SSE2
 #endif
 
+// The following are available on x64 Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
 // GCC >= 4.7.0 required for AVX2.
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -237,8 +252,7 @@
 
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \
-    !defined(__native_client__)
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOUVROW_NEON
 #define HAS_ABGRTOYROW_NEON
 #define HAS_ARGB1555TOARGBROW_NEON
@@ -331,7 +345,8 @@
 #endif
 
 // The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_I422TOABGRROW_MIPS_DSPR2
@@ -427,7 +442,7 @@
     "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
     #opcode " (%%r15,%%r14),%" #arg "\n" \
     BUNDLEUNLOCK
-#else
+#else  // defined(__native_client__) && defined(__x86_64__)
 #define BUNDLEALIGN "\n"
 #define MEMACCESS(base) "(%" #base ")"
 #define MEMACCESS2(offset, base) #offset "(%" #base ")"
@@ -444,6 +459,15 @@
     #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
 #define MEMOPARG(opcode, offset, base, index, scale, arg) \
     #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#endif  // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align   3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base) "\n"
+#endif
 #endif
 
 void I444ToARGBRow_NEON(const uint8* src_y,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 16faa9b..1ab8c5c 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1007
+#define LIBYUV_VERSION 1038
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h
index 039efb9..91acc2f 100644
--- a/include/libyuv/video_common.h
+++ b/include/libyuv/video_common.h
@@ -115,7 +115,7 @@
   FOURCC_H264 = FOURCC('H', '2', '6', '4'),
 
   // Match any fourcc.
-  FOURCC_ANY  = 0xFFFFFFFF,
+  FOURCC_ANY = -1,
 };
 
 enum FourCCBpp {
diff --git a/libyuv.gyp b/libyuv.gyp
index d067a05..246d662 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -12,6 +12,7 @@
   ],
   'variables': {
     'use_system_libjpeg%': 0,
+    'libyuv_disable_jpeg%': 0,
     'build_neon': 0,
     'conditions': [
        [ '(target_arch == "armv7" or target_arch == "armv7s" or (target_arch == "arm" and arm_version >= 7)) and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)', {
@@ -71,8 +72,7 @@
             'LIBYUV_DISABLE_NEON'
           ],
         }],
-        # TODO(fbarchard): Use gyp define to enable jpeg.
-        [ 'OS != "ios"', {
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
           'defines': [
             'HAVE_JPEG'
           ],
@@ -104,6 +104,13 @@
             'LIBYUV_NEON',
           ]
         }],
+        # MemorySanitizer does not support assembly code yet.
+        # http://crbug.com/344505
+        [ 'msan == 1', {
+          'defines': [
+            'LIBYUV_DISABLE_X86',
+          ],
+        }],
       ],
       'defines': [
         # Enable the following 3 macros to turn off assembly for specified CPU.
diff --git a/libyuv.target.darwin-arm.mk b/libyuv.target.darwin-arm.mk
index e162db2..c04afd0 100644
--- a/libyuv.target.darwin-arm.mk
+++ b/libyuv.target.darwin-arm.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -62,7 +60,6 @@
 MY_CFLAGS_Debug := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -72,13 +69,13 @@
 	-Wno-unused-local-typedefs \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -97,6 +94,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -104,7 +102,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -121,14 +118,15 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
 	'-DLIBYUV_NEON' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -146,7 +144,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -154,23 +152,24 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -180,13 +179,13 @@
 	-Wno-unused-local-typedefs \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -212,7 +211,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -229,14 +227,15 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
 	'-DLIBYUV_NEON' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -254,7 +253,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -262,69 +261,26 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.darwin-arm64.mk b/libyuv.target.darwin-arm64.mk
index c056da0..8e7a656 100644
--- a/libyuv.target.darwin-arm64.mk
+++ b/libyuv.target.darwin-arm64.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -61,7 +59,6 @@
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Debug := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -87,13 +84,13 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-funwind-tables
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -110,13 +107,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -134,7 +132,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -142,21 +140,22 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -188,7 +187,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -205,13 +203,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -229,7 +228,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -237,60 +236,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.darwin-mips.mk b/libyuv.target.darwin-mips.mk
index 341f905..e446e24 100644
--- a/libyuv.target.darwin-mips.mk
+++ b/libyuv.target.darwin-mips.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -63,7 +61,6 @@
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
 	 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -72,8 +69,6 @@
 	-fPIC \
 	-Wno-unused-local-typedefs \
 	-Wno-format \
-	-EL \
-	-mhard-float \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -92,6 +87,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -99,7 +95,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -116,13 +111,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -140,7 +136,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -148,24 +144,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
 	 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -174,8 +171,6 @@
 	-fPIC \
 	-Wno-unused-local-typedefs \
 	-Wno-format \
-	-EL \
-	-mhard-float \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -201,7 +196,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -218,13 +212,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -242,7 +237,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -250,65 +245,26 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-EL \
-	-Wl,--no-keep-memory \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-EL \
-	-Wl,--no-keep-memory \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.darwin-mips64.mk b/libyuv.target.darwin-mips64.mk
new file mode 100644
index 0000000..e446e24
--- /dev/null
+++ b/libyuv.target.darwin-mips64.mk
@@ -0,0 +1,280 @@
+# This file is generated by gyp; do not edit.
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+LOCAL_MODULE := third_party_libyuv_libyuv_gyp
+LOCAL_MODULE_SUFFIX := .a
+LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
+gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
+gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
+
+# Make sure our deps are built first.
+GYP_TARGET_DEPENDENCIES :=
+
+GYP_GENERATED_OUTPUTS :=
+
+# Make sure our deps and generated files are built first.
+LOCAL_ADDITIONAL_DEPENDENCIES := $(GYP_TARGET_DEPENDENCIES) $(GYP_GENERATED_OUTPUTS)
+
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_GENERATED_SOURCES :=
+
+GYP_COPIED_SOURCE_ORIGIN_DIRS :=
+
+LOCAL_SRC_FILES := \
+	third_party/libyuv/source/compare.cc \
+	third_party/libyuv/source/compare_common.cc \
+	third_party/libyuv/source/compare_posix.cc \
+	third_party/libyuv/source/compare_win.cc \
+	third_party/libyuv/source/convert.cc \
+	third_party/libyuv/source/convert_argb.cc \
+	third_party/libyuv/source/convert_from.cc \
+	third_party/libyuv/source/convert_from_argb.cc \
+	third_party/libyuv/source/convert_jpeg.cc \
+	third_party/libyuv/source/convert_to_argb.cc \
+	third_party/libyuv/source/convert_to_i420.cc \
+	third_party/libyuv/source/cpu_id.cc \
+	third_party/libyuv/source/format_conversion.cc \
+	third_party/libyuv/source/mjpeg_decoder.cc \
+	third_party/libyuv/source/mjpeg_validate.cc \
+	third_party/libyuv/source/planar_functions.cc \
+	third_party/libyuv/source/rotate.cc \
+	third_party/libyuv/source/rotate_argb.cc \
+	third_party/libyuv/source/rotate_mips.cc \
+	third_party/libyuv/source/row_any.cc \
+	third_party/libyuv/source/row_common.cc \
+	third_party/libyuv/source/row_mips.cc \
+	third_party/libyuv/source/row_posix.cc \
+	third_party/libyuv/source/row_win.cc \
+	third_party/libyuv/source/scale.cc \
+	third_party/libyuv/source/scale_argb.cc \
+	third_party/libyuv/source/scale_common.cc \
+	third_party/libyuv/source/scale_mips.cc \
+	third_party/libyuv/source/scale_posix.cc \
+	third_party/libyuv/source/scale_win.cc \
+	third_party/libyuv/source/video_common.cc
+
+
+# Flags passed to both C and C++ files.
+MY_CFLAGS_Debug := \
+	-fstack-protector \
+	--param=ssp-buffer-size=4 \
+	 \
+	-fno-strict-aliasing \
+	-Wno-unused-parameter \
+	-Wno-missing-field-initializers \
+	-fvisibility=hidden \
+	-pipe \
+	-fPIC \
+	-Wno-unused-local-typedefs \
+	-Wno-format \
+	-ffunction-sections \
+	-funwind-tables \
+	-g \
+	-fstack-protector \
+	-fno-short-enums \
+	-finline-limit=64 \
+	-Wa,--noexecstack \
+	-U_FORTIFY_SOURCE \
+	-Wno-extra \
+	-Wno-ignored-qualifiers \
+	-Wno-type-limits \
+	-Wno-unused-but-set-variable \
+	-Wno-address \
+	-Wno-format-security \
+	-Wno-return-type \
+	-Wno-sequence-point \
+	-Os \
+	-g \
+	-gdwarf-4 \
+	-fdata-sections \
+	-ffunction-sections \
+	-fomit-frame-pointer \
+	-funwind-tables
+
+MY_DEFS_Debug := \
+	'-DV8_DEPRECATION_WARNINGS' \
+	'-D_FILE_OFFSET_BITS=64' \
+	'-DNO_TCMALLOC' \
+	'-DDISABLE_NACL' \
+	'-DCHROMIUM_BUILD' \
+	'-DUSE_LIBJPEG_TURBO=1' \
+	'-DENABLE_WEBRTC=1' \
+	'-DUSE_PROPRIETARY_CODECS' \
+	'-DENABLE_BROWSER_CDMS' \
+	'-DENABLE_CONFIGURATION_POLICY' \
+	'-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \
+	'-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \
+	'-DENABLE_EGLIMAGE=1' \
+	'-DCLD_VERSION=1' \
+	'-DENABLE_PRINTING=1' \
+	'-DENABLE_MANAGED_USERS=1' \
+	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
+	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
+	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
+	'-DHAVE_JPEG' \
+	'-DUSE_OPENSSL=1' \
+	'-DUSE_OPENSSL_CERTS=1' \
+	'-DANDROID' \
+	'-D__GNU_SOURCE=1' \
+	'-DUSE_STLPORT=1' \
+	'-D_STLP_USE_PTR_SPECIALIZATIONS=1' \
+	'-DCHROME_BUILD_ID=""' \
+	'-DDYNAMIC_ANNOTATIONS_ENABLED=1' \
+	'-DWTF_USE_DYNAMIC_ANNOTATIONS=1' \
+	'-D_DEBUG'
+
+
+# Include paths placed before CFLAGS/CPPFLAGS
+LOCAL_C_INCLUDES_Debug := \
+	$(gyp_shared_intermediate_dir) \
+	$(LOCAL_PATH)/third_party/libyuv/include \
+	$(LOCAL_PATH)/third_party/libyuv \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
+	$(PWD)/frameworks/wilhelm/include \
+	$(PWD)/bionic \
+	$(PWD)/external/stlport/stlport
+
+
+# Flags passed to only C++ (and not C) files.
+LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
+	-fno-rtti \
+	-fno-threadsafe-statics \
+	-fvisibility-inlines-hidden \
+	-Wno-deprecated \
+	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
+	-Wno-non-virtual-dtor \
+	-Wno-sign-promo \
+	-Wno-non-virtual-dtor
+
+
+# Flags passed to both C and C++ files.
+MY_CFLAGS_Release := \
+	-fstack-protector \
+	--param=ssp-buffer-size=4 \
+	 \
+	-fno-strict-aliasing \
+	-Wno-unused-parameter \
+	-Wno-missing-field-initializers \
+	-fvisibility=hidden \
+	-pipe \
+	-fPIC \
+	-Wno-unused-local-typedefs \
+	-Wno-format \
+	-ffunction-sections \
+	-funwind-tables \
+	-g \
+	-fstack-protector \
+	-fno-short-enums \
+	-finline-limit=64 \
+	-Wa,--noexecstack \
+	-U_FORTIFY_SOURCE \
+	-Wno-extra \
+	-Wno-ignored-qualifiers \
+	-Wno-type-limits \
+	-Wno-unused-but-set-variable \
+	-Wno-address \
+	-Wno-format-security \
+	-Wno-return-type \
+	-Wno-sequence-point \
+	-Os \
+	-fno-ident \
+	-fdata-sections \
+	-ffunction-sections \
+	-fomit-frame-pointer \
+	-funwind-tables
+
+MY_DEFS_Release := \
+	'-DV8_DEPRECATION_WARNINGS' \
+	'-D_FILE_OFFSET_BITS=64' \
+	'-DNO_TCMALLOC' \
+	'-DDISABLE_NACL' \
+	'-DCHROMIUM_BUILD' \
+	'-DUSE_LIBJPEG_TURBO=1' \
+	'-DENABLE_WEBRTC=1' \
+	'-DUSE_PROPRIETARY_CODECS' \
+	'-DENABLE_BROWSER_CDMS' \
+	'-DENABLE_CONFIGURATION_POLICY' \
+	'-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \
+	'-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \
+	'-DENABLE_EGLIMAGE=1' \
+	'-DCLD_VERSION=1' \
+	'-DENABLE_PRINTING=1' \
+	'-DENABLE_MANAGED_USERS=1' \
+	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
+	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
+	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
+	'-DHAVE_JPEG' \
+	'-DUSE_OPENSSL=1' \
+	'-DUSE_OPENSSL_CERTS=1' \
+	'-DANDROID' \
+	'-D__GNU_SOURCE=1' \
+	'-DUSE_STLPORT=1' \
+	'-D_STLP_USE_PTR_SPECIALIZATIONS=1' \
+	'-DCHROME_BUILD_ID=""' \
+	'-DNDEBUG' \
+	'-DNVALGRIND' \
+	'-DDYNAMIC_ANNOTATIONS_ENABLED=0'
+
+
+# Include paths placed before CFLAGS/CPPFLAGS
+LOCAL_C_INCLUDES_Release := \
+	$(gyp_shared_intermediate_dir) \
+	$(LOCAL_PATH)/third_party/libyuv/include \
+	$(LOCAL_PATH)/third_party/libyuv \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
+	$(PWD)/frameworks/wilhelm/include \
+	$(PWD)/bionic \
+	$(PWD)/external/stlport/stlport
+
+
+# Flags passed to only C++ (and not C) files.
+LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
+	-fno-rtti \
+	-fno-threadsafe-statics \
+	-fvisibility-inlines-hidden \
+	-Wno-deprecated \
+	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
+	-Wno-non-virtual-dtor \
+	-Wno-sign-promo \
+	-Wno-non-virtual-dtor
+
+
+LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
+LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
+LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
+LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
+### Rules for final target.
+
+LOCAL_SHARED_LIBRARIES := \
+	libstlport \
+	libdl
+
+# Add target alias to "gyp_all_modules" target.
+.PHONY: gyp_all_modules
+gyp_all_modules: third_party_libyuv_libyuv_gyp
+
+# Alias gyp target name.
+.PHONY: libyuv
+libyuv: third_party_libyuv_libyuv_gyp
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libyuv.target.darwin-x86.mk b/libyuv.target.darwin-x86.mk
index 5fb11f4..cd82ea9 100644
--- a/libyuv.target.darwin-x86.mk
+++ b/libyuv.target.darwin-x86.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -61,7 +59,6 @@
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Debug := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -92,6 +89,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -99,7 +97,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -116,13 +113,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -140,7 +138,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -148,21 +146,22 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -200,7 +199,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -217,13 +215,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -241,7 +240,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -249,64 +248,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m32 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m32 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.darwin-x86_64.mk b/libyuv.target.darwin-x86_64.mk
index f06f354..7beeb32 100644
--- a/libyuv.target.darwin-x86_64.mk
+++ b/libyuv.target.darwin-x86_64.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -62,7 +60,6 @@
 MY_CFLAGS_Debug := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -91,6 +88,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -98,7 +96,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -115,13 +112,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -139,7 +137,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -147,22 +145,23 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -198,7 +197,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -215,13 +213,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -239,7 +238,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -247,64 +246,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m64 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m64 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.linux-arm.mk b/libyuv.target.linux-arm.mk
index e162db2..c04afd0 100644
--- a/libyuv.target.linux-arm.mk
+++ b/libyuv.target.linux-arm.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -62,7 +60,6 @@
 MY_CFLAGS_Debug := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -72,13 +69,13 @@
 	-Wno-unused-local-typedefs \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -97,6 +94,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -104,7 +102,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -121,14 +118,15 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
 	'-DLIBYUV_NEON' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -146,7 +144,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -154,23 +152,24 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -180,13 +179,13 @@
 	-Wno-unused-local-typedefs \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -212,7 +211,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -229,14 +227,15 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
 	'-DLIBYUV_NEON' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -254,7 +253,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -262,69 +261,26 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.linux-arm64.mk b/libyuv.target.linux-arm64.mk
index c056da0..8e7a656 100644
--- a/libyuv.target.linux-arm64.mk
+++ b/libyuv.target.linux-arm64.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -61,7 +59,6 @@
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Debug := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -87,13 +84,13 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-funwind-tables
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -110,13 +107,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -134,7 +132,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -142,21 +140,22 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -188,7 +187,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -205,13 +203,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -229,7 +228,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -237,60 +236,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.linux-mips.mk b/libyuv.target.linux-mips.mk
index 341f905..e446e24 100644
--- a/libyuv.target.linux-mips.mk
+++ b/libyuv.target.linux-mips.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -63,7 +61,6 @@
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
 	 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -72,8 +69,6 @@
 	-fPIC \
 	-Wno-unused-local-typedefs \
 	-Wno-format \
-	-EL \
-	-mhard-float \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -92,6 +87,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -99,7 +95,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -116,13 +111,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -140,7 +136,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -148,24 +144,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
 	 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -174,8 +171,6 @@
 	-fPIC \
 	-Wno-unused-local-typedefs \
 	-Wno-format \
-	-EL \
-	-mhard-float \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -201,7 +196,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -218,13 +212,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -242,7 +237,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -250,65 +245,26 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-EL \
-	-Wl,--no-keep-memory \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-EL \
-	-Wl,--no-keep-memory \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.linux-mips64.mk b/libyuv.target.linux-mips64.mk
new file mode 100644
index 0000000..e446e24
--- /dev/null
+++ b/libyuv.target.linux-mips64.mk
@@ -0,0 +1,280 @@
+# This file is generated by gyp; do not edit.
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+LOCAL_MODULE := third_party_libyuv_libyuv_gyp
+LOCAL_MODULE_SUFFIX := .a
+LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
+gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
+gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
+
+# Make sure our deps are built first.
+GYP_TARGET_DEPENDENCIES :=
+
+GYP_GENERATED_OUTPUTS :=
+
+# Make sure our deps and generated files are built first.
+LOCAL_ADDITIONAL_DEPENDENCIES := $(GYP_TARGET_DEPENDENCIES) $(GYP_GENERATED_OUTPUTS)
+
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_GENERATED_SOURCES :=
+
+GYP_COPIED_SOURCE_ORIGIN_DIRS :=
+
+LOCAL_SRC_FILES := \
+	third_party/libyuv/source/compare.cc \
+	third_party/libyuv/source/compare_common.cc \
+	third_party/libyuv/source/compare_posix.cc \
+	third_party/libyuv/source/compare_win.cc \
+	third_party/libyuv/source/convert.cc \
+	third_party/libyuv/source/convert_argb.cc \
+	third_party/libyuv/source/convert_from.cc \
+	third_party/libyuv/source/convert_from_argb.cc \
+	third_party/libyuv/source/convert_jpeg.cc \
+	third_party/libyuv/source/convert_to_argb.cc \
+	third_party/libyuv/source/convert_to_i420.cc \
+	third_party/libyuv/source/cpu_id.cc \
+	third_party/libyuv/source/format_conversion.cc \
+	third_party/libyuv/source/mjpeg_decoder.cc \
+	third_party/libyuv/source/mjpeg_validate.cc \
+	third_party/libyuv/source/planar_functions.cc \
+	third_party/libyuv/source/rotate.cc \
+	third_party/libyuv/source/rotate_argb.cc \
+	third_party/libyuv/source/rotate_mips.cc \
+	third_party/libyuv/source/row_any.cc \
+	third_party/libyuv/source/row_common.cc \
+	third_party/libyuv/source/row_mips.cc \
+	third_party/libyuv/source/row_posix.cc \
+	third_party/libyuv/source/row_win.cc \
+	third_party/libyuv/source/scale.cc \
+	third_party/libyuv/source/scale_argb.cc \
+	third_party/libyuv/source/scale_common.cc \
+	third_party/libyuv/source/scale_mips.cc \
+	third_party/libyuv/source/scale_posix.cc \
+	third_party/libyuv/source/scale_win.cc \
+	third_party/libyuv/source/video_common.cc
+
+
+# Flags passed to both C and C++ files.
+MY_CFLAGS_Debug := \
+	-fstack-protector \
+	--param=ssp-buffer-size=4 \
+	 \
+	-fno-strict-aliasing \
+	-Wno-unused-parameter \
+	-Wno-missing-field-initializers \
+	-fvisibility=hidden \
+	-pipe \
+	-fPIC \
+	-Wno-unused-local-typedefs \
+	-Wno-format \
+	-ffunction-sections \
+	-funwind-tables \
+	-g \
+	-fstack-protector \
+	-fno-short-enums \
+	-finline-limit=64 \
+	-Wa,--noexecstack \
+	-U_FORTIFY_SOURCE \
+	-Wno-extra \
+	-Wno-ignored-qualifiers \
+	-Wno-type-limits \
+	-Wno-unused-but-set-variable \
+	-Wno-address \
+	-Wno-format-security \
+	-Wno-return-type \
+	-Wno-sequence-point \
+	-Os \
+	-g \
+	-gdwarf-4 \
+	-fdata-sections \
+	-ffunction-sections \
+	-fomit-frame-pointer \
+	-funwind-tables
+
+MY_DEFS_Debug := \
+	'-DV8_DEPRECATION_WARNINGS' \
+	'-D_FILE_OFFSET_BITS=64' \
+	'-DNO_TCMALLOC' \
+	'-DDISABLE_NACL' \
+	'-DCHROMIUM_BUILD' \
+	'-DUSE_LIBJPEG_TURBO=1' \
+	'-DENABLE_WEBRTC=1' \
+	'-DUSE_PROPRIETARY_CODECS' \
+	'-DENABLE_BROWSER_CDMS' \
+	'-DENABLE_CONFIGURATION_POLICY' \
+	'-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \
+	'-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \
+	'-DENABLE_EGLIMAGE=1' \
+	'-DCLD_VERSION=1' \
+	'-DENABLE_PRINTING=1' \
+	'-DENABLE_MANAGED_USERS=1' \
+	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
+	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
+	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
+	'-DHAVE_JPEG' \
+	'-DUSE_OPENSSL=1' \
+	'-DUSE_OPENSSL_CERTS=1' \
+	'-DANDROID' \
+	'-D__GNU_SOURCE=1' \
+	'-DUSE_STLPORT=1' \
+	'-D_STLP_USE_PTR_SPECIALIZATIONS=1' \
+	'-DCHROME_BUILD_ID=""' \
+	'-DDYNAMIC_ANNOTATIONS_ENABLED=1' \
+	'-DWTF_USE_DYNAMIC_ANNOTATIONS=1' \
+	'-D_DEBUG'
+
+
+# Include paths placed before CFLAGS/CPPFLAGS
+LOCAL_C_INCLUDES_Debug := \
+	$(gyp_shared_intermediate_dir) \
+	$(LOCAL_PATH)/third_party/libyuv/include \
+	$(LOCAL_PATH)/third_party/libyuv \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
+	$(PWD)/frameworks/wilhelm/include \
+	$(PWD)/bionic \
+	$(PWD)/external/stlport/stlport
+
+
+# Flags passed to only C++ (and not C) files.
+LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
+	-fno-rtti \
+	-fno-threadsafe-statics \
+	-fvisibility-inlines-hidden \
+	-Wno-deprecated \
+	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
+	-Wno-non-virtual-dtor \
+	-Wno-sign-promo \
+	-Wno-non-virtual-dtor
+
+
+# Flags passed to both C and C++ files.
+MY_CFLAGS_Release := \
+	-fstack-protector \
+	--param=ssp-buffer-size=4 \
+	 \
+	-fno-strict-aliasing \
+	-Wno-unused-parameter \
+	-Wno-missing-field-initializers \
+	-fvisibility=hidden \
+	-pipe \
+	-fPIC \
+	-Wno-unused-local-typedefs \
+	-Wno-format \
+	-ffunction-sections \
+	-funwind-tables \
+	-g \
+	-fstack-protector \
+	-fno-short-enums \
+	-finline-limit=64 \
+	-Wa,--noexecstack \
+	-U_FORTIFY_SOURCE \
+	-Wno-extra \
+	-Wno-ignored-qualifiers \
+	-Wno-type-limits \
+	-Wno-unused-but-set-variable \
+	-Wno-address \
+	-Wno-format-security \
+	-Wno-return-type \
+	-Wno-sequence-point \
+	-Os \
+	-fno-ident \
+	-fdata-sections \
+	-ffunction-sections \
+	-fomit-frame-pointer \
+	-funwind-tables
+
+MY_DEFS_Release := \
+	'-DV8_DEPRECATION_WARNINGS' \
+	'-D_FILE_OFFSET_BITS=64' \
+	'-DNO_TCMALLOC' \
+	'-DDISABLE_NACL' \
+	'-DCHROMIUM_BUILD' \
+	'-DUSE_LIBJPEG_TURBO=1' \
+	'-DENABLE_WEBRTC=1' \
+	'-DUSE_PROPRIETARY_CODECS' \
+	'-DENABLE_BROWSER_CDMS' \
+	'-DENABLE_CONFIGURATION_POLICY' \
+	'-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \
+	'-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \
+	'-DENABLE_EGLIMAGE=1' \
+	'-DCLD_VERSION=1' \
+	'-DENABLE_PRINTING=1' \
+	'-DENABLE_MANAGED_USERS=1' \
+	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
+	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
+	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
+	'-DHAVE_JPEG' \
+	'-DUSE_OPENSSL=1' \
+	'-DUSE_OPENSSL_CERTS=1' \
+	'-DANDROID' \
+	'-D__GNU_SOURCE=1' \
+	'-DUSE_STLPORT=1' \
+	'-D_STLP_USE_PTR_SPECIALIZATIONS=1' \
+	'-DCHROME_BUILD_ID=""' \
+	'-DNDEBUG' \
+	'-DNVALGRIND' \
+	'-DDYNAMIC_ANNOTATIONS_ENABLED=0'
+
+
+# Include paths placed before CFLAGS/CPPFLAGS
+LOCAL_C_INCLUDES_Release := \
+	$(gyp_shared_intermediate_dir) \
+	$(LOCAL_PATH)/third_party/libyuv/include \
+	$(LOCAL_PATH)/third_party/libyuv \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
+	$(PWD)/frameworks/wilhelm/include \
+	$(PWD)/bionic \
+	$(PWD)/external/stlport/stlport
+
+
+# Flags passed to only C++ (and not C) files.
+LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
+	-fno-rtti \
+	-fno-threadsafe-statics \
+	-fvisibility-inlines-hidden \
+	-Wno-deprecated \
+	-Wno-uninitialized \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
+	-Wno-non-virtual-dtor \
+	-Wno-sign-promo \
+	-Wno-non-virtual-dtor
+
+
+LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
+LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
+LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
+LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
+### Rules for final target.
+
+LOCAL_SHARED_LIBRARIES := \
+	libstlport \
+	libdl
+
+# Add target alias to "gyp_all_modules" target.
+.PHONY: gyp_all_modules
+gyp_all_modules: third_party_libyuv_libyuv_gyp
+
+# Alias gyp target name.
+.PHONY: libyuv
+libyuv: third_party_libyuv_libyuv_gyp
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libyuv.target.linux-x86.mk b/libyuv.target.linux-x86.mk
index 5fb11f4..cd82ea9 100644
--- a/libyuv.target.linux-x86.mk
+++ b/libyuv.target.linux-x86.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -61,7 +59,6 @@
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Debug := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -92,6 +89,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -99,7 +97,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -116,13 +113,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -140,7 +138,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -148,21 +146,22 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -200,7 +199,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -217,13 +215,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -241,7 +240,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -249,64 +248,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m32 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m32 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv.target.linux-x86_64.mk b/libyuv.target.linux-x86_64.mk
index f06f354..7beeb32 100644
--- a/libyuv.target.linux-x86_64.mk
+++ b/libyuv.target.linux-x86_64.mk
@@ -5,14 +5,12 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
 
 # Make sure our deps are built first.
-GYP_TARGET_DEPENDENCIES := \
-	$(call intermediates-dir-for,GYP,third_party_libjpeg_libjpeg_gyp,,,$(GYP_VAR_PREFIX))/libjpeg.stamp
+GYP_TARGET_DEPENDENCIES :=
 
 GYP_GENERATED_OUTPUTS :=
 
@@ -62,7 +60,6 @@
 MY_CFLAGS_Debug := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -91,6 +88,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -98,7 +96,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -115,13 +112,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -139,7 +137,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -147,22 +145,23 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -198,7 +197,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -215,13 +213,14 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DHAVE_JPEG' \
-	'-DUSE_SYSTEM_LIBJPEG' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -239,7 +238,7 @@
 	$(gyp_shared_intermediate_dir) \
 	$(LOCAL_PATH)/third_party/libyuv/include \
 	$(LOCAL_PATH)/third_party/libyuv \
-	$(PWD)/external/jpeg \
+	$(LOCAL_PATH)/third_party/libjpeg_turbo \
 	$(PWD)/frameworks/wilhelm/include \
 	$(PWD)/bionic \
 	$(PWD)/external/stlport/stlport
@@ -247,64 +246,25 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m64 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-m64 \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv_neon.target.darwin-arm.mk b/libyuv_neon.target.darwin-arm.mk
index b770056..f4f3e35 100644
--- a/libyuv_neon.target.darwin-arm.mk
+++ b/libyuv_neon.target.darwin-arm.mk
@@ -5,7 +5,6 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_neon_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
@@ -34,7 +33,6 @@
 MY_CFLAGS_Debug := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -45,13 +43,13 @@
 	-mfpu=neon \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -70,6 +68,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -77,7 +76,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -94,11 +92,13 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -123,23 +123,24 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -150,13 +151,13 @@
 	-mfpu=neon \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -182,7 +183,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -199,11 +199,13 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -228,69 +230,26 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv_neon.target.linux-arm.mk b/libyuv_neon.target.linux-arm.mk
index b770056..f4f3e35 100644
--- a/libyuv_neon.target.linux-arm.mk
+++ b/libyuv_neon.target.linux-arm.mk
@@ -5,7 +5,6 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := third_party_libyuv_libyuv_neon_gyp
 LOCAL_MODULE_SUFFIX := .a
-LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH)
 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX))
 gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX))
@@ -34,7 +33,6 @@
 MY_CFLAGS_Debug := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -45,13 +43,13 @@
 	-mfpu=neon \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -70,6 +68,7 @@
 	-Wno-sequence-point \
 	-Os \
 	-g \
+	-gdwarf-4 \
 	-fdata-sections \
 	-ffunction-sections \
 	-fomit-frame-pointer \
@@ -77,7 +76,6 @@
 
 MY_DEFS_Debug := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -94,11 +92,13 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -123,23 +123,24 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Debug := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Debug := false
-
 # Flags passed to both C and C++ files.
 MY_CFLAGS_Release := \
 	-fstack-protector \
 	--param=ssp-buffer-size=4 \
-	-fno-exceptions \
 	-fno-strict-aliasing \
 	-Wno-unused-parameter \
 	-Wno-missing-field-initializers \
@@ -150,13 +151,13 @@
 	-mfpu=neon \
 	-Wno-format \
 	-fno-tree-sra \
+	-fno-caller-saves \
+	-Wno-psabi \
 	-fno-partial-inlining \
 	-fno-early-inlining \
 	-fno-tree-copy-prop \
 	-fno-tree-loop-optimize \
 	-fno-move-loop-invariants \
-	-fno-caller-saves \
-	-Wno-psabi \
 	-ffunction-sections \
 	-funwind-tables \
 	-g \
@@ -182,7 +183,6 @@
 
 MY_DEFS_Release := \
 	'-DV8_DEPRECATION_WARNINGS' \
-	'-DBLINK_SCALE_FILTERS_AT_RECORD_TIME' \
 	'-D_FILE_OFFSET_BITS=64' \
 	'-DNO_TCMALLOC' \
 	'-DDISABLE_NACL' \
@@ -199,11 +199,13 @@
 	'-DENABLE_PRINTING=1' \
 	'-DENABLE_MANAGED_USERS=1' \
 	'-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \
-	'-DDATA_REDUCTION_DEV_HOST="http://proxy-dev.googlezip.net:80/"' \
+	'-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \
+	'-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \
 	'-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \
 	'-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \
 	'-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \
 	'-DVIDEO_HOLE=1' \
+	'-DENABLE_LOAD_COMPLETION_HACKS=1' \
 	'-DUSE_OPENSSL=1' \
 	'-DUSE_OPENSSL_CERTS=1' \
 	'-DANDROID' \
@@ -228,69 +230,26 @@
 
 # Flags passed to only C++ (and not C) files.
 LOCAL_CPPFLAGS_Release := \
+	-fno-exceptions \
 	-fno-rtti \
 	-fno-threadsafe-statics \
 	-fvisibility-inlines-hidden \
 	-Wno-deprecated \
 	-Wno-abi \
+	-std=gnu++11 \
+	-Wno-narrowing \
+	-Wno-literal-suffix \
 	-Wno-non-virtual-dtor \
 	-Wno-sign-promo \
 	-Wno-non-virtual-dtor
 
 
-LOCAL_FDO_SUPPORT_Release := false
-
 LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION))
-LOCAL_FDO_SUPPORT := $(LOCAL_FDO_SUPPORT_$(GYP_CONFIGURATION))
 LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION))
 LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION))
 LOCAL_ASFLAGS := $(LOCAL_CFLAGS)
 ### Rules for final target.
 
-LOCAL_LDFLAGS_Debug := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,--warn-shared-textrel \
-	-Wl,-O1 \
-	-Wl,--as-needed
-
-
-LOCAL_LDFLAGS_Release := \
-	-Wl,-z,now \
-	-Wl,-z,relro \
-	-Wl,--fatal-warnings \
-	-Wl,-z,noexecstack \
-	-fPIC \
-	-Wl,-z,relro \
-	-Wl,-z,now \
-	-fuse-ld=gold \
-	-nostdlib \
-	-Wl,--no-undefined \
-	-Wl,--exclude-libs=ALL \
-	-Wl,--icf=safe \
-	-Wl,-O1 \
-	-Wl,--as-needed \
-	-Wl,--gc-sections \
-	-Wl,--warn-shared-textrel
-
-
-LOCAL_LDFLAGS := $(LOCAL_LDFLAGS_$(GYP_CONFIGURATION))
-
-LOCAL_STATIC_LIBRARIES :=
-
-# Enable grouping to fix circular references
-LOCAL_GROUP_STATIC_LIBRARIES := true
-
 LOCAL_SHARED_LIBRARIES := \
 	libstlport \
 	libdl
diff --git a/libyuv_test.gyp b/libyuv_test.gyp
index e7e9e76..140ba6c 100644
--- a/libyuv_test.gyp
+++ b/libyuv_test.gyp
@@ -7,6 +7,9 @@
 # be found in the AUTHORS file in the root of the source tree.
 
 {
+  'variables': {
+    'libyuv_disable_jpeg%': 0,
+  },
   'targets': [
     {
       'target_name': 'libyuv_unittest',
@@ -56,7 +59,13 @@
             'LIBYUV_DISABLE_NEON'
           ],
         }],
-        [ 'OS != "ios"', {
+        [ 'OS == "ios"', {
+          'xcode_settings': {
+            'DEBUGGING_SYMBOLS': 'YES',
+            'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
+          },
+        }],
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
           'defines': [
             'HAVE_JPEG',
           ],
@@ -110,14 +119,24 @@
         'util/psnr.cc',
         'util/ssim.cc',
       ],
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
       'conditions': [
         [ 'OS == "ios" and target_subarch == 64', {
           'defines': [
             'LIBYUV_DISABLE_NEON'
           ],
         }],
+
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG',
+          ],
+        }],
       ], # conditions
     },
+
     {
       'target_name': 'cpuid',
       'type': 'executable',
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index 77f42f4..5e7b8e4 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/basic_types.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -27,14 +28,9 @@
 
     ".p2align  2                               \n"
   "1:                                          \n"
-    // TODO(fbarchard): Define a macro for clearing address bits for NaCL.
-#if defined(__native_client__)
-    "bic        %0, #0xc0000000                \n"
-#endif
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"
-#if defined(__native_client__)
-    "bic        %1, #0xc0000000                \n"
-#endif
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"
     "subs       %2, %2, #16                    \n"
     "vsubl.u8   q2, d0, d2                     \n"
diff --git a/source/convert.cc b/source/convert.cc
index c8408dc..874a6cb 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -36,14 +36,14 @@
                       uint8* dst_v, int dst_stride_v,
                       int src_y_width, int src_y_height,
                       int src_uv_width, int src_uv_height) {
-  if (src_y_width == 0 || src_y_height == 0 ||
-      src_uv_width == 0 || src_uv_height == 0) {
-    return -1;
-  }
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
   const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      src_uv_width == 0 || src_uv_height == 0) {
+    return -1;
+  }
   ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
              dst_y, dst_stride_y, dst_y_width, dst_y_height,
              kFilterBilinear);
@@ -67,6 +67,8 @@
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !src_u || !src_v ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -75,7 +77,7 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    const int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
     src_u = src_u + (halfheight - 1) * src_stride_u;
     src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -88,8 +90,6 @@
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
   // Copy UV planes.
-  const int halfwidth = (width + 1) >> 1;
-  const int halfheight = (height + 1) >> 1;
   CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
   CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
   return 0;
@@ -164,6 +164,8 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -171,11 +173,10 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
   SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
@@ -185,6 +186,7 @@
 static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                        uint8* dst, int dst_stride,
                        int width, int height) {
+  int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_X86)
   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
@@ -216,7 +218,7 @@
 #endif
 
   // Copy plane
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     CopyRow(src, dst, width);
     CopyRow(src + src_stride_0, dst + dst_stride, width);
     src += src_stride_0 + src_stride_1;
@@ -243,6 +245,11 @@
                       uint8* dst_u, int dst_stride_u,
                       uint8* dst_v, int dst_stride_v,
                       int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
   if (!src_y || !src_uv ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -251,7 +258,7 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_u = dst_u + (halfheight - 1) * dst_stride_u;
     dst_v = dst_v + (halfheight - 1) * dst_stride_v;
@@ -260,8 +267,6 @@
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
   if (src_stride_y0 == width &&
       src_stride_y1 == width &&
       dst_stride_y == width) {
@@ -277,8 +282,6 @@
     halfheight = 1;
     src_stride_uv = dst_stride_u = dst_stride_v = 0;
   }
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUVRow_C;
 #if defined(HAS_SPLITUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     SplitUVRow = SplitUVRow_Any_SSE2;
@@ -331,7 +334,7 @@
     }
   }
 
-  for (int y = 0; y < halfheight; ++y) {
+  for (y = 0; y < halfheight; ++y) {
     // Copy a row of UV.
     SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
     dst_u += dst_stride_u;
@@ -397,6 +400,13 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  int halfheight = (height + 1) >> 1;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+      int pix) = YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+      YUY2ToYRow_C;
   if (!src_y || !src_yuy2 ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -405,7 +415,7 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_u = dst_u + (halfheight - 1) * dst_stride_u;
     dst_v = dst_v + (halfheight - 1) * dst_stride_v;
@@ -414,7 +424,6 @@
     dst_stride_v = -dst_stride_v;
   }
   // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
@@ -443,10 +452,6 @@
   }
 #endif
 
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-      int pix) = YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
-      YUY2ToYRow_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
@@ -486,7 +491,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     CopyRow(src_y, dst_y, width);
     src_y += src_stride_y;
     dst_y += dst_stride_y;
@@ -512,18 +517,17 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+      uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+      uint8* dst_y, int pix) = YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-                     uint8* dst_y, int pix);
-  YUY2ToYRow = YUY2ToYRow_C;
-  YUY2ToUVRow = YUY2ToUVRow_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
@@ -563,7 +567,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
     YUY2ToYRow(src_yuy2, dst_y, width);
     YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
@@ -586,18 +590,17 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+      uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+      uint8* dst_y, int pix) = UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int pix);
-  UYVYToYRow = UYVYToYRow_C;
-  UYVYToUVRow = UYVYToUVRow_C;
 #if defined(HAS_UYVYTOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     UYVYToUVRow = UYVYToUVRow_Any_SSE2;
@@ -637,7 +640,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
     UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
@@ -660,6 +663,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
   if (!src_argb ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -671,10 +679,6 @@
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -716,7 +720,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
@@ -739,6 +743,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+      BGRAToYRow_C;
   if (!src_bgra ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -750,10 +759,6 @@
     src_bgra = src_bgra + (height - 1) * src_stride_bgra;
     src_stride_bgra = -src_stride_bgra;
   }
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
-      BGRAToYRow_C;
 #if defined(HAS_BGRATOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
@@ -784,7 +789,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
     BGRAToYRow(src_bgra, dst_y, width);
     BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
@@ -807,6 +812,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+      ABGRToYRow_C;
   if (!src_abgr ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -818,10 +828,6 @@
     src_abgr = src_abgr + (height - 1) * src_stride_abgr;
     src_stride_abgr = -src_stride_abgr;
   }
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
-      ABGRToYRow_C;
 #if defined(HAS_ABGRTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
@@ -852,7 +858,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
     ABGRToYRow(src_abgr, dst_y, width);
     ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
@@ -875,6 +881,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+      RGBAToYRow_C;
   if (!src_rgba ||
       !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -886,10 +897,6 @@
     src_rgba = src_rgba + (height - 1) * src_stride_rgba;
     src_stride_rgba = -src_stride_rgba;
   }
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
-      RGBAToYRow_C;
 #if defined(HAS_RGBATOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
@@ -920,7 +927,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
     RGBAToYRow(src_rgba, dst_y, width);
     RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
@@ -943,6 +950,23 @@
                 uint8* dst_u, int dst_stride_u,
                 uint8* dst_v, int dst_stride_v,
                 int width, int height) {
+  int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
   if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -955,10 +979,6 @@
   }
 
 #if defined(HAS_RGB24TOYROW_NEON)
-  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
-      RGB24ToYRow_C;
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RGB24ToYRow = RGB24ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -973,12 +993,6 @@
   }
 #else  // HAS_RGB24TOYROW_NEON
 
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
-
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      RGB24ToARGBRow_C;
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -987,8 +1001,6 @@
     }
   }
 #endif
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -997,8 +1009,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -1012,7 +1022,7 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB24TOYROW_NEON
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RGB24TOYROW_NEON)
     RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
     RGB24ToYRow(src_rgb24, dst_y, width);
@@ -1052,6 +1062,23 @@
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
+  int y;
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+      RAWToYRow_C;
+#else
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
   if (!src_raw || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -1064,10 +1091,6 @@
   }
 
 #if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
-      RAWToYRow_C;
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RAWToYRow = RAWToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -1082,12 +1105,6 @@
   }
 #else  // HAS_RAWTOYROW_NEON
 
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
-
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -1096,8 +1113,6 @@
     }
   }
 #endif
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1106,8 +1121,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -1121,7 +1134,7 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RAWTOYROW_NEON
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RAWTOYROW_NEON)
     RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
     RAWToYRow(src_raw, dst_y, width);
@@ -1157,10 +1170,27 @@
 // Convert RGB565 to I420.
 LIBYUV_API
 int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height) {
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height) {
+  int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+      RGB565ToYRow_C;
+#else
+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
   if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -1173,10 +1203,6 @@
   }
 
 #if defined(HAS_RGB565TOYROW_NEON)
-  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
-      RGB565ToYRow_C;
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RGB565ToYRow = RGB565ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -1191,12 +1217,6 @@
   }
 #else  // HAS_RGB565TOYROW_NEON
 
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
-
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      RGB565ToARGBRow_C;
 #if defined(HAS_RGB565TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
@@ -1205,8 +1225,6 @@
     }
   }
 #endif
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1215,8 +1233,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -1230,7 +1246,7 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB565TOYROW_NEON
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RGB565TOYROW_NEON)
     RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
     RGB565ToYRow(src_rgb565, dst_y, width);
@@ -1270,6 +1286,23 @@
                    uint8* dst_u, int dst_stride_u,
                    uint8* dst_v, int dst_stride_v,
                    int width, int height) {
+  int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
+      ARGB1555ToYRow_C;
+#else
+  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
   if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -1282,10 +1315,6 @@
   }
 
 #if defined(HAS_ARGB1555TOYROW_NEON)
-  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
-      ARGB1555ToYRow_C;
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -1300,12 +1329,6 @@
   }
 #else  // HAS_ARGB1555TOYROW_NEON
 
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
-
-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      ARGB1555ToARGBRow_C;
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
@@ -1314,8 +1337,6 @@
     }
   }
 #endif
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1324,8 +1345,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -1339,7 +1358,7 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB1555TOYROW_NEON
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
     ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
     ARGB1555ToYRow(src_argb1555, dst_y, width);
@@ -1381,6 +1400,23 @@
                    uint8* dst_u, int dst_stride_u,
                    uint8* dst_v, int dst_stride_v,
                    int width, int height) {
+  int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
+      ARGB4444ToYRow_C;
+#else
+  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB. NOTE(review): precedes the -1 return; may leak.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
   if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
     return -1;
@@ -1393,10 +1429,6 @@
   }
 
 #if defined(HAS_ARGB4444TOYROW_NEON)
-  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
-      ARGB4444ToYRow_C;
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -1411,12 +1443,6 @@
   }
 #else  // HAS_ARGB4444TOYROW_NEON
 
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
-
-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      ARGB4444ToARGBRow_C;
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
@@ -1425,8 +1451,6 @@
     }
   }
 #endif
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1435,8 +1459,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -1450,7 +1472,7 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB4444TOYROW_NEON
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
     ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
     ARGB4444ToYRow(src_argb4444, dst_y, width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index a8aab91..ac0bc3d 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -52,6 +52,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I444ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I444ToARGBRow_C;
   if (!src_y || !src_u || !src_v ||
       !dst_argb ||
       width <= 0 || height == 0) {
@@ -72,11 +78,6 @@
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I444ToARGBRow_C;
 #if defined(HAS_I444TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
@@ -96,7 +97,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -113,6 +114,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
   if (!src_y || !src_u || !src_v ||
       !dst_argb ||
       width <= 0 || height == 0) {
@@ -133,11 +140,6 @@
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -175,7 +177,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -192,6 +194,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I411ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I411ToARGBRow_C;
   if (!src_y || !src_u || !src_v ||
       !dst_argb ||
       width <= 0 || height == 0) {
@@ -212,11 +220,6 @@
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I411ToARGBRow_C;
 #if defined(HAS_I411TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
@@ -236,7 +239,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -251,6 +254,10 @@
 int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                          uint8* dst_argb, int dst_stride_argb,
                          int width, int height) {
+  int y;
+  void (*YToARGBRow)(const uint8* y_buf,
+                     uint8* rgb_buf,
+                     int width) = YToARGBRow_C;
   if (!src_y || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -268,9 +275,6 @@
     height = 1;
     src_stride_y = dst_stride_argb = 0;
   }
-  void (*YToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = YToARGBRow_C;
 #if defined(HAS_YTOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -288,7 +292,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     YToARGBRow(src_y, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -301,6 +305,9 @@
 int I400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+      I400ToARGBRow_C;
   if (!src_y || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -318,8 +325,6 @@
     height = 1;
     src_stride_y = dst_stride_argb = 0;
   }
-  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
-      I400ToARGBRow_C;
 #if defined(HAS_I400TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     I400ToARGBRow = I400ToARGBRow_Any_SSE2;
@@ -338,7 +343,7 @@
     }
   }
 #endif
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
     src_y += src_stride_y;
     dst_argb += dst_stride_argb;
@@ -372,6 +377,17 @@
                      width, height);
 }
 
+// Convert ARGB to BGRA (same shuffle as BGRAToARGB; src is ARGB, dst is BGRA).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
 // Convert ABGR to ARGB.
 LIBYUV_API
 int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
@@ -383,6 +399,17 @@
                      width, height);
 }
 
+// Convert ARGB to ABGR (same shuffle as ABGRToARGB; src is ARGB, dst is ABGR).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
 // Convert RGBA to ARGB.
 LIBYUV_API
 int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
@@ -399,6 +426,9 @@
 int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
                 uint8* dst_argb, int dst_stride_argb,
                 int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
   if (!src_rgb24 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -416,8 +446,6 @@
     height = 1;
     src_stride_rgb24 = dst_stride_argb = 0;
   }
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      RGB24ToARGBRow_C;
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -435,7 +463,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
     src_rgb24 += src_stride_rgb24;
     dst_argb += dst_stride_argb;
@@ -448,6 +476,9 @@
 int RAWToARGB(const uint8* src_raw, int src_stride_raw,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
   if (!src_raw || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -465,8 +496,6 @@
     height = 1;
     src_stride_raw = dst_stride_argb = 0;
   }
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -484,7 +513,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     RAWToARGBRow(src_raw, dst_argb, width);
     src_raw += src_stride_raw;
     dst_argb += dst_stride_argb;
@@ -497,6 +526,9 @@
 int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height) {
+  int y;
+  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
   if (!src_rgb565 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -514,8 +546,6 @@
     height = 1;
     src_stride_rgb565 = dst_stride_argb = 0;
   }
-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
-      RGB565ToARGBRow_C;
 #if defined(HAS_RGB565TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -533,7 +563,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     RGB565ToARGBRow(src_rgb565, dst_argb, width);
     src_rgb565 += src_stride_rgb565;
     dst_argb += dst_stride_argb;
@@ -546,6 +576,9 @@
 int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height) {
+  int y;
+  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+      int pix) = ARGB1555ToARGBRow_C;
   if (!src_argb1555 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -563,8 +596,6 @@
     height = 1;
     src_stride_argb1555 = dst_stride_argb = 0;
   }
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) = ARGB1555ToARGBRow_C;
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -582,7 +613,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
     src_argb1555 += src_stride_argb1555;
     dst_argb += dst_stride_argb;
@@ -595,6 +626,9 @@
 int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height) {
+  int y;
+  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+      int pix) = ARGB4444ToARGBRow_C;
   if (!src_argb4444 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -612,8 +646,6 @@
     height = 1;
     src_stride_argb4444 = dst_stride_argb = 0;
   }
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) = ARGB4444ToARGBRow_C;
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -631,7 +663,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
     src_argb4444 += src_stride_argb4444;
     dst_argb += dst_stride_argb;
@@ -645,6 +677,11 @@
                const uint8* src_uv, int src_stride_uv,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
   if (!src_y || !src_uv || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -655,10 +692,6 @@
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
 #if defined(HAS_NV12TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
@@ -678,7 +711,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     NV12ToARGBRow(src_y, src_uv, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -695,6 +728,11 @@
                const uint8* src_uv, int src_stride_uv,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV21ToARGBRow_C;
   if (!src_y || !src_uv || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -705,10 +743,6 @@
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV21ToARGBRow_C;
 #if defined(HAS_NV21TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
@@ -729,7 +763,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     NV21ToARGBRow(src_y, src_uv, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -745,6 +779,11 @@
 int M420ToARGB(const uint8* src_m420, int src_stride_m420,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
   if (!src_m420 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -755,10 +794,6 @@
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
 #if defined(HAS_NV12TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
@@ -778,7 +813,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
     NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
                   dst_argb + dst_stride_argb, width);
@@ -796,6 +831,9 @@
 int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+      YUY2ToARGBRow_C;
   if (!src_yuy2 || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -813,8 +851,6 @@
     height = 1;
     src_stride_yuy2 = dst_stride_argb = 0;
   }
-  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
-      YUY2ToARGBRow_C;
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
   // Posix is 16, Windows is 8.
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -835,7 +871,7 @@
     }
   }
 #endif
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     YUY2ToARGBRow(src_yuy2, dst_argb, width);
     src_yuy2 += src_stride_yuy2;
     dst_argb += dst_stride_argb;
@@ -848,6 +884,9 @@
 int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+      UYVYToARGBRow_C;
   if (!src_uyvy || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -865,8 +904,6 @@
     height = 1;
     src_stride_uyvy = dst_stride_argb = 0;
   }
-  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
-      UYVYToARGBRow_C;
 #if defined(HAS_UYVYTOARGBROW_SSSE3)
   // Posix is 16, Windows is 8.
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -887,7 +924,7 @@
     }
   }
 #endif
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     UYVYToARGBRow(src_uyvy, dst_argb, width);
     src_uyvy += src_stride_uyvy;
     dst_argb += dst_stride_argb;
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 1e10832..c1a2f62 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -39,14 +39,14 @@
                       uint8* dst_v, int dst_stride_v,
                       int src_y_width, int src_y_height,
                       int dst_uv_width, int dst_uv_height) {
-  if (src_y_width == 0 || src_y_height == 0 ||
-      dst_uv_width <= 0 || dst_uv_height <= 0) {
-    return -1;
-  }
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
   const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      dst_uv_width <= 0 || dst_uv_height <= 0) {
+    return -1;
+  }
   ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
              dst_y, dst_stride_y, dst_y_width, dst_y_height,
              kFilterBilinear);
@@ -150,6 +150,10 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_yuy2, int dst_stride_yuy2,
                int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
   if (!src_y || !src_u || !src_v || !dst_yuy2 ||
       width <= 0 || height == 0) {
     return -1;
@@ -169,9 +173,6 @@
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
   }
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
-      I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -188,7 +189,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -204,6 +205,10 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_yuy2, int dst_stride_yuy2,
                int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
   if (!src_y || !src_u || !src_v || !dst_yuy2 ||
       width <= 0 || height == 0) {
     return -1;
@@ -214,9 +219,6 @@
     dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
-      I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -233,7 +235,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
     I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
                   dst_yuy2 + dst_stride_yuy2, width);
@@ -254,6 +256,10 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_uyvy, int dst_stride_uyvy,
                int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
   if (!src_y || !src_u || !src_v || !dst_uyvy ||
       width <= 0 || height == 0) {
     return -1;
@@ -273,9 +279,6 @@
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
   }
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
-      I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -292,7 +295,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -308,6 +311,10 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_uyvy, int dst_stride_uyvy,
                int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
   if (!src_y || !src_u || !src_v || !dst_uyvy ||
       width <= 0 || height == 0) {
     return -1;
@@ -318,9 +325,6 @@
     dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
     dst_stride_uyvy = -dst_stride_uyvy;
   }
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
-      I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -337,7 +341,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
     I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
                   dst_uyvy + dst_stride_uyvy, width);
@@ -359,6 +363,12 @@
                uint8* dst_y, int dst_stride_y,
                uint8* dst_uv, int dst_stride_uv,
                int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+      int width) = MergeUVRow_C;
+  // Half-size UV plane dimensions, rounded up (rows are coalesced further down).
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
     return -1;
@@ -366,15 +376,12 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    int halfheight = (height + 1) >> 1;
+    halfheight = (height + 1) >> 1;
     dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
     dst_stride_y = -dst_stride_y;
     dst_stride_uv = -dst_stride_uv;
   }
-  // Coalesce rows.
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
   if (src_stride_y == width &&
       dst_stride_y == width) {
     width *= height;
@@ -389,8 +396,6 @@
     halfheight = 1;
     src_stride_u = src_stride_v = dst_stride_uv = 0;
   }
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -422,7 +427,7 @@
 #endif
 
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  for (int y = 0; y < halfheight; ++y) {
+  for (y = 0; y < halfheight; ++y) {
     // Merge a row of U and V into a row of UV.
     MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
     src_u += src_stride_u;
@@ -454,6 +459,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
   if (!src_y || !src_u || !src_v || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -464,11 +475,6 @@
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -506,7 +512,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
@@ -525,6 +531,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_bgra, int dst_stride_bgra,
                int width, int height) {
+  int y;
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
   if (!src_y || !src_u || !src_v || !dst_bgra ||
       width <= 0 || height == 0) {
     return -1;
@@ -535,11 +547,6 @@
     dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
     dst_stride_bgra = -dst_stride_bgra;
   }
-  void (*I422ToBGRARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToBGRARow_C;
 #if defined(HAS_I422TOBGRAROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
@@ -567,7 +574,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
     dst_bgra += dst_stride_bgra;
     src_y += src_stride_y;
@@ -586,6 +593,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_abgr, int dst_stride_abgr,
                int width, int height) {
+  int y;
+  void (*I422ToABGRRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToABGRRow_C;
   if (!src_y || !src_u || !src_v || !dst_abgr ||
       width <= 0 || height == 0) {
     return -1;
@@ -596,11 +609,6 @@
     dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
     dst_stride_abgr = -dst_stride_abgr;
   }
-  void (*I422ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToABGRRow_C;
 #if defined(HAS_I422TOABGRROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
@@ -620,7 +628,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
     dst_abgr += dst_stride_abgr;
     src_y += src_stride_y;
@@ -639,6 +647,12 @@
                const uint8* src_v, int src_stride_v,
                uint8* dst_rgba, int dst_stride_rgba,
                int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToRGBARow_C;
   if (!src_y || !src_u || !src_v || !dst_rgba ||
       width <= 0 || height == 0) {
     return -1;
@@ -649,11 +663,6 @@
     dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
     dst_stride_rgba = -dst_stride_rgba;
   }
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRGBARow_C;
 #if defined(HAS_I422TORGBAROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
@@ -673,7 +682,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
     dst_rgba += dst_stride_rgba;
     src_y += src_stride_y;
@@ -692,6 +701,12 @@
                 const uint8* src_v, int src_stride_v,
                 uint8* dst_rgb24, int dst_stride_rgb24,
                 int width, int height) {
+  int y;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) = I422ToRGB24Row_C;
   if (!src_y || !src_u || !src_v || !dst_rgb24 ||
       width <= 0 || height == 0) {
     return -1;
@@ -702,11 +717,6 @@
     dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRGB24Row_C;
 #if defined(HAS_I422TORGB24ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
@@ -723,7 +733,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
     dst_rgb24 += dst_stride_rgb24;
     src_y += src_stride_y;
@@ -742,6 +752,12 @@
                 const uint8* src_v, int src_stride_v,
                 uint8* dst_raw, int dst_stride_raw,
                 int width, int height) {
+  int y;
+  void (*I422ToRAWRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) = I422ToRAWRow_C;
   if (!src_y || !src_u || !src_v || !dst_raw ||
       width <= 0 || height == 0) {
     return -1;
@@ -752,11 +768,6 @@
     dst_raw = dst_raw + (height - 1) * dst_stride_raw;
     dst_stride_raw = -dst_stride_raw;
   }
-  void (*I422ToRAWRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRAWRow_C;
 #if defined(HAS_I422TORAWROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
@@ -773,7 +784,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
     dst_raw += dst_stride_raw;
     src_y += src_stride_y;
@@ -792,6 +803,12 @@
                    const uint8* src_v, int src_stride_v,
                    uint8* dst_argb1555, int dst_stride_argb1555,
                    int width, int height) {
+  int y;
+  void (*I422ToARGB1555Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width) = I422ToARGB1555Row_C;
   if (!src_y || !src_u || !src_v || !dst_argb1555 ||
       width <= 0 || height == 0) {
     return -1;
@@ -802,11 +819,6 @@
     dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
     dst_stride_argb1555 = -dst_stride_argb1555;
   }
-  void (*I422ToARGB1555Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width) = I422ToARGB1555Row_C;
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
@@ -823,7 +835,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
     dst_argb1555 += dst_stride_argb1555;
     src_y += src_stride_y;
@@ -843,6 +855,12 @@
                    const uint8* src_v, int src_stride_v,
                    uint8* dst_argb4444, int dst_stride_argb4444,
                    int width, int height) {
+  int y;
+  void (*I422ToARGB4444Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width) = I422ToARGB4444Row_C;
   if (!src_y || !src_u || !src_v || !dst_argb4444 ||
       width <= 0 || height == 0) {
     return -1;
@@ -853,11 +871,6 @@
     dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
     dst_stride_argb4444 = -dst_stride_argb4444;
   }
-  void (*I422ToARGB4444Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width) = I422ToARGB4444Row_C;
 #if defined(HAS_I422TOARGB4444ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
@@ -874,7 +887,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
     dst_argb4444 += dst_stride_argb4444;
     src_y += src_stride_y;
@@ -893,6 +906,12 @@
                  const uint8* src_v, int src_stride_v,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
   if (!src_y || !src_u || !src_v || !dst_rgb565 ||
       width <= 0 || height == 0) {
     return -1;
@@ -903,11 +922,6 @@
     dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
-  void (*I422ToRGB565Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width) = I422ToRGB565Row_C;
 #if defined(HAS_I422TORGB565ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
@@ -924,7 +938,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
     dst_rgb565 += dst_stride_rgb565;
     src_y += src_stride_y;
@@ -945,11 +959,11 @@
                     int width, int height,
                     uint32 fourcc) {
   uint32 format = CanonicalFourCC(fourcc);
+  int r = 0;
   if (!y || !u|| !v || !dst_sample ||
       width <= 0 || height == 0) {
     return -1;
   }
-  int r = 0;
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 41421fb..121a416 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -28,6 +28,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV444Row_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -45,10 +50,6 @@
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) = ARGBToUV444Row_C;
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
     if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
       ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
@@ -83,7 +84,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToUV444Row(src_argb, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     src_argb += src_stride_argb;
@@ -101,6 +102,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -118,8 +124,6 @@
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) = ARGBToUV422Row_C;
 #if defined(HAS_ARGBTOUV422ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
@@ -132,8 +136,6 @@
   }
 #endif
 
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -160,7 +162,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToUV422Row(src_argb, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     src_argb += src_stride_argb;
@@ -178,6 +180,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV411Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -195,10 +202,6 @@
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) = ARGBToUV411Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -234,7 +237,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToUV411Row(src_argb, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     src_argb += src_stride_argb;
@@ -250,6 +253,17 @@
                uint8* dst_y, int dst_stride_y,
                uint8* dst_uv, int dst_stride_uv,
                int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  // Allocate a rows of uv.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -261,10 +275,6 @@
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -294,9 +304,6 @@
     }
   }
 #endif
-  int halfwidth = (width + 1) >> 1;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -325,11 +332,7 @@
   }
 #endif
 
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
-
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
     MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
     ARGBToYRow(src_argb, dst_y, width);
@@ -353,6 +356,17 @@
                uint8* dst_y, int dst_stride_y,
                uint8* dst_uv, int dst_stride_uv,
                int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  // Allocate a rows of uv.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -364,10 +378,6 @@
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -397,9 +407,6 @@
     }
   }
 #endif
-  int halfwidth = (width + 1) >> 1;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -428,11 +435,7 @@
   }
 #endif
 
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
-
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
     MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
     ARGBToYRow(src_argb, dst_y, width);
@@ -455,6 +458,14 @@
 int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
                uint8* dst_yuy2, int dst_stride_yuy2,
                int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
   if (!src_argb || !dst_yuy2 ||
       width <= 0 || height == 0) {
     return -1;
@@ -472,8 +483,6 @@
     height = 1;
     src_stride_argb = dst_stride_yuy2 = 0;
   }
-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) = ARGBToUV422Row_C;
 #if defined(HAS_ARGBTOUV422ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
@@ -485,8 +494,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -512,9 +519,6 @@
   }
 #endif
 
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
-      I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -531,20 +535,22 @@
   }
 #endif
 
-  // Allocate a rows of yuv.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-  uint8* row_u = row_y + ((width + 63) & ~63);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  {
+    // Allocate a rows of yuv.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToUV422Row(src_argb, row_u, row_v, width);
-    ARGBToYRow(src_argb, row_y, width);
-    I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
-    src_argb += src_stride_argb;
-    dst_yuy2 += dst_stride_yuy2;
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
   }
-
-  free_aligned_buffer_64(row_y);
   return 0;
 }
 
@@ -553,6 +559,14 @@
 int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
                uint8* dst_uyvy, int dst_stride_uyvy,
                int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
   if (!src_argb || !dst_uyvy ||
       width <= 0 || height == 0) {
     return -1;
@@ -570,8 +584,6 @@
     height = 1;
     src_stride_argb = dst_stride_uyvy = 0;
   }
-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) = ARGBToUV422Row_C;
 #if defined(HAS_ARGBTOUV422ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
@@ -583,8 +595,6 @@
     }
   }
 #endif
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -610,9 +620,6 @@
   }
 #endif
 
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
-      I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -629,20 +636,22 @@
   }
 #endif
 
-  // Allocate a rows of yuv.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-  uint8* row_u = row_y + ((width + 63) & ~63);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  {
+    // Allocate a rows of yuv.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
 
-  for (int y = 0; y < height; ++y) {
-    ARGBToUV422Row(src_argb, row_u, row_v, width);
-    ARGBToYRow(src_argb, row_y, width);
-    I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
-    src_argb += src_stride_argb;
-    dst_uyvy += dst_stride_uyvy;
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
   }
-
-  free_aligned_buffer_64(row_y);
   return 0;
 }
 
@@ -651,6 +660,9 @@
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
                uint8* dst_y, int dst_stride_y,
                int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
   if (!src_argb || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -666,8 +678,6 @@
     height = 1;
     src_stride_argb = dst_stride_y = 0;
   }
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -697,7 +707,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToYRow(src_argb, dst_y, width);
     src_argb += src_stride_argb;
     dst_y += dst_stride_y;
@@ -726,6 +736,9 @@
 int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
                 uint8* dst_rgb24, int dst_stride_rgb24,
                 int width, int height) {
+  int y;
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB24Row_C;
   if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
@@ -741,8 +754,6 @@
     height = 1;
     src_stride_argb = dst_stride_rgb24 = 0;
   }
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB24Row_C;
 #if defined(HAS_ARGBTORGB24ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
@@ -759,7 +770,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToRGB24Row(src_argb, dst_rgb24, width);
     src_argb += src_stride_argb;
     dst_rgb24 += dst_stride_rgb24;
@@ -772,6 +783,9 @@
 int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
               uint8* dst_raw, int dst_stride_raw,
               int width, int height) {
+  int y;
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRAWRow_C;
   if (!src_argb || !dst_raw || width <= 0 || height == 0) {
     return -1;
   }
@@ -787,8 +801,6 @@
     height = 1;
     src_stride_argb = dst_stride_raw = 0;
   }
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRAWRow_C;
 #if defined(HAS_ARGBTORAWROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
@@ -805,7 +817,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToRAWRow(src_argb, dst_raw, width);
     src_argb += src_stride_argb;
     dst_raw += dst_stride_raw;
@@ -818,6 +830,9 @@
 int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height) {
+  int y;
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB565Row_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -833,8 +848,6 @@
     height = 1;
     src_stride_argb = dst_stride_rgb565 = 0;
   }
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
 #if defined(HAS_ARGBTORGB565ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
@@ -852,7 +865,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToRGB565Row(src_argb, dst_rgb565, width);
     src_argb += src_stride_argb;
     dst_rgb565 += dst_stride_rgb565;
@@ -865,6 +878,9 @@
 int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
                    uint8* dst_argb1555, int dst_stride_argb1555,
                    int width, int height) {
+  int y;
+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToARGB1555Row_C;
   if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
     return -1;
   }
@@ -880,8 +896,6 @@
     height = 1;
     src_stride_argb = dst_stride_argb1555 = 0;
   }
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB1555Row_C;
 #if defined(HAS_ARGBTOARGB1555ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
@@ -899,7 +913,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToARGB1555Row(src_argb, dst_argb1555, width);
     src_argb += src_stride_argb;
     dst_argb1555 += dst_stride_argb1555;
@@ -912,6 +926,9 @@
 int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
                    uint8* dst_argb4444, int dst_stride_argb4444,
                    int width, int height) {
+  int y;
+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToARGB4444Row_C;
   if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
     return -1;
   }
@@ -927,8 +944,6 @@
     height = 1;
     src_stride_argb = dst_stride_argb4444 = 0;
   }
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB4444Row_C;
 #if defined(HAS_ARGBTOARGB4444ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
@@ -946,7 +961,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToARGB4444Row(src_argb, dst_argb4444, width);
     src_argb += src_stride_argb;
     dst_argb4444 += dst_stride_argb4444;
@@ -961,6 +976,11 @@
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+      ARGBToYJRow_C;
   if (!src_argb ||
       !dst_yj || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -972,10 +992,6 @@
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
-      ARGBToYJRow_C;
 #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
@@ -1015,7 +1031,7 @@
   }
 #endif
 
-  for (int y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
     ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
     ARGBToYJRow(src_argb, dst_yj, width);
     ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
@@ -1036,6 +1052,9 @@
 int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
                uint8* dst_yj, int dst_stride_yj,
                int width, int height) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+      ARGBToYJRow_C;
   if (!src_argb || !dst_yj || width <= 0 || height == 0) {
     return -1;
   }
@@ -1051,8 +1070,6 @@
     height = 1;
     src_stride_argb = dst_stride_yj = 0;
   }
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
-      ARGBToYJRow_C;
 #if defined(HAS_ARGBTOYJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -1082,7 +1099,7 @@
   }
 #endif
 
-  for (int y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     ARGBToYJRow(src_argb, dst_yj, width);
     src_argb += src_stride_argb;
     dst_yj += dst_stride_yj;
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index fdf4d09..2e0d61d 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -10,7 +10,7 @@
 
 #include "libyuv/cpu_id.h"
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -48,7 +48,7 @@
     defined(__i386__) || defined(__x86_64__))
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
 #if (_MSC_FULL_VER >= 160040219)
   __cpuidex((int*)(cpu_info), info_eax, info_ecx);
 #elif defined(_M_IX86)
@@ -188,10 +188,14 @@
 int InitCpuFlags(void) {
 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
 
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
   uint32 cpu_info1[4] = { 0, 0, 0, 0 };
   uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
-  CpuId(7, 0, cpu_info7);
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
   cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
               ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
               ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
@@ -199,6 +203,7 @@
               ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
               ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
               kCpuHasX86;
+
 #ifdef HAS_XGETBV
   if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
       TestOsSaveYmm()) {  // Saves YMM.
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index f1297ca..3857008 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -37,6 +37,10 @@
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
 #if defined(HAS_COPYROW_X86)
   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
     CopyRow = CopyRow_X86;
diff --git a/source/rotate.cc b/source/rotate.cc
index b052ac1..2ef3228 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -388,10 +388,10 @@
 
 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
 #define HAS_TRANSPOSE_UVWX8_SSE2
-extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                                    uint8* dst_a, int dst_stride_a,
-                                    uint8* dst_b, int dst_stride_b,
-                                    int w);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w);
   asm (
     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
     "push   %ebx                               \n"
diff --git a/source/rotate_mips.cc b/source/rotate_mips.cc
index 04d5a66..70770fd 100644
--- a/source/rotate_mips.cc
+++ b/source/rotate_mips.cc
@@ -18,7 +18,8 @@
 #endif
 
 #if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
 
 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                              uint8* dst, int dst_stride,
@@ -303,10 +304,8 @@
        [width] "+r" (width)
       :[src_stride] "r" (src_stride),
        [dst_stride] "r" (dst_stride)
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3", "s4",
-        "s5", "s6", "s7"
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
   );
 }
 
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 7375bab..d354e11 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -17,8 +17,8 @@
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-  !defined(__native_client__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
 static uvec8 kVTbl4x4Transpose =
   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
 
@@ -37,13 +37,21 @@
     "1:                                        \n"
       "mov         %0, %1                      \n"
 
+      MEMACCESS(0)
       "vld1.8      {d0}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d1}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d2}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d3}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d4}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d5}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d6}, [%0], %2              \n"
+      MEMACCESS(0)
       "vld1.8      {d7}, [%0]                  \n"
 
       "vtrn.8      d1, d0                      \n"
@@ -68,13 +76,21 @@
 
       "mov         %0, %3                      \n"
 
+    MEMACCESS(0)
       "vst1.8      {d1}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d0}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d3}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d2}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d5}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d4}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d7}, [%0], %4              \n"
+    MEMACCESS(0)
       "vst1.8      {d6}, [%0]                  \n"
 
       "add         %1, #8                      \n"  // src += 8
@@ -96,17 +112,26 @@
 
     // 4x8 block
     "mov         %0, %1                        \n"
+    MEMACCESS(0)
     "vld1.32     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d2[0]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d2[1]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d3[0]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.32     {d3[1]}, [%0]                 \n"
 
     "mov         %0, %3                        \n"
 
+    MEMACCESS(6)
     "vld1.8      {q3}, [%6]                    \n"
 
     "vtbl.8      d4, {d0, d1}, d6              \n"
@@ -116,15 +141,23 @@
 
     // TODO(frkoenig): Rework shuffle above to
     // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
     "vst1.32     {d4[0]}, [%0], %4             \n"
+    MEMACCESS(0)
     "vst1.32     {d4[1]}, [%0], %4             \n"
+    MEMACCESS(0)
     "vst1.32     {d5[0]}, [%0], %4             \n"
+    MEMACCESS(0)
     "vst1.32     {d5[1]}, [%0]                 \n"
 
     "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
     "vst1.32     {d0[0]}, [%0], %4             \n"
+    MEMACCESS(0)
     "vst1.32     {d0[1]}, [%0], %4             \n"
+    MEMACCESS(0)
     "vst1.32     {d1[0]}, [%0], %4             \n"
+    MEMACCESS(0)
     "vst1.32     {d1[1]}, [%0]                 \n"
 
     "add         %1, #4                        \n"  // src += 4
@@ -140,20 +173,30 @@
     // 2x8 block
     "2:                                        \n"
     "mov         %0, %1                        \n"
+    MEMACCESS(0)
     "vld1.16     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d0[2]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d1[2]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d0[3]}, [%0], %2             \n"
+    MEMACCESS(0)
     "vld1.16     {d1[3]}, [%0]                 \n"
 
     "vtrn.8      d0, d1                        \n"
 
     "mov         %0, %3                        \n"
 
+    MEMACCESS(0)
     "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
     "vst1.64     {d1}, [%0]                    \n"
 
     "add         %1, #2                        \n"  // src += 2
@@ -163,15 +206,24 @@
 
     // 1x8 block
     "3:                                        \n"
+    MEMACCESS(1)
     "vld1.8      {d0[0]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[1]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[2]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[3]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[4]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[5]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[6]}, [%1], %2             \n"
+    MEMACCESS(1)
     "vld1.8      {d0[7]}, [%1]                 \n"
 
+    MEMACCESS(3)
     "vst1.64     {d0}, [%3]                    \n"
 
     "4:                                        \n"
@@ -206,13 +258,21 @@
     "1:                                        \n"
       "mov         %0, %1                      \n"
 
+      MEMACCESS(0)
       "vld2.8      {d0,  d1},  [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d2,  d3},  [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d4,  d5},  [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d6,  d7},  [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d16, d17}, [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d18, d19}, [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d20, d21}, [%0], %2        \n"
+      MEMACCESS(0)
       "vld2.8      {d22, d23}, [%0]            \n"
 
       "vtrn.8      q1, q0                      \n"
@@ -241,24 +301,40 @@
 
       "mov         %0, %3                      \n"
 
+    MEMACCESS(0)
       "vst1.8      {d2},  [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d0},  [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d6},  [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d4},  [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d18}, [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d16}, [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d22}, [%0], %4             \n"
+    MEMACCESS(0)
       "vst1.8      {d20}, [%0]                 \n"
 
       "mov         %0, %5                      \n"
 
+    MEMACCESS(0)
       "vst1.8      {d3},  [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d1},  [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d7},  [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d5},  [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d19}, [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d17}, [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d23}, [%0], %6             \n"
+    MEMACCESS(0)
       "vst1.8      {d21}, [%0]                 \n"
 
       "add         %1, #8*2                    \n"  // src   += 8*2
@@ -279,18 +355,27 @@
     "cmp         %7, #4                        \n"
     "blt         2f                            \n"
 
-    //TODO(frkoenig): Clean this up
+    // TODO(frkoenig): Clean this up
     // 4x8 block
     "mov         %0, %1                        \n"
+    MEMACCESS(0)
     "vld1.64     {d0}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d1}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d2}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d3}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d4}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d5}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d6}, [%0], %2                \n"
+    MEMACCESS(0)
     "vld1.64     {d7}, [%0]                    \n"
 
+    MEMACCESS(8)
     "vld1.8      {q15}, [%8]                   \n"
 
     "vtrn.8      q0, q1                        \n"
@@ -307,28 +392,44 @@
 
     "mov         %0, %3                        \n"
 
+    MEMACCESS(0)
     "vst1.32     {d16[0]},  [%0], %4           \n"
+    MEMACCESS(0)
     "vst1.32     {d16[1]},  [%0], %4           \n"
+    MEMACCESS(0)
     "vst1.32     {d17[0]},  [%0], %4           \n"
+    MEMACCESS(0)
     "vst1.32     {d17[1]},  [%0], %4           \n"
 
     "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
     "vst1.32     {d20[0]}, [%0], %4            \n"
+    MEMACCESS(0)
     "vst1.32     {d20[1]}, [%0], %4            \n"
+    MEMACCESS(0)
     "vst1.32     {d21[0]}, [%0], %4            \n"
+    MEMACCESS(0)
     "vst1.32     {d21[1]}, [%0]                \n"
 
     "mov         %0, %5                        \n"
 
+    MEMACCESS(0)
     "vst1.32     {d18[0]}, [%0], %6            \n"
+    MEMACCESS(0)
     "vst1.32     {d18[1]}, [%0], %6            \n"
+    MEMACCESS(0)
     "vst1.32     {d19[0]}, [%0], %6            \n"
+    MEMACCESS(0)
     "vst1.32     {d19[1]}, [%0], %6            \n"
 
     "add         %0, %5, #4                    \n"
+    MEMACCESS(0)
     "vst1.32     {d22[0]},  [%0], %6           \n"
+    MEMACCESS(0)
     "vst1.32     {d22[1]},  [%0], %6           \n"
+    MEMACCESS(0)
     "vst1.32     {d23[0]},  [%0], %6           \n"
+    MEMACCESS(0)
     "vst1.32     {d23[1]},  [%0]               \n"
 
     "add         %1, #4*2                      \n"  // src   += 4 * 2
@@ -345,13 +446,21 @@
     // 2x8 block
     "2:                                        \n"
     "mov         %0, %1                        \n"
+    MEMACCESS(0)
     "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+    MEMACCESS(0)
     "vld2.16     {d1[3], d3[3]}, [%0]          \n"
 
     "vtrn.8      d0, d1                        \n"
@@ -359,12 +468,16 @@
 
     "mov         %0, %3                        \n"
 
+    MEMACCESS(0)
     "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
     "vst1.64     {d2}, [%0]                    \n"
 
     "mov         %0, %5                        \n"
 
+    MEMACCESS(0)
     "vst1.64     {d1}, [%0], %6                \n"
+    MEMACCESS(0)
     "vst1.64     {d3}, [%0]                    \n"
 
     "add         %1, #2*2                      \n"  // src   += 2 * 2
@@ -375,16 +488,26 @@
 
     // 1x8 block
     "3:                                        \n"
+    MEMACCESS(1)
     "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+    MEMACCESS(1)
     "vld2.8      {d0[7], d1[7]}, [%1]          \n"
 
+    MEMACCESS(3)
     "vst1.64     {d0}, [%3]                    \n"
+    MEMACCESS(5)
     "vst1.64     {d1}, [%5]                    \n"
 
     "4:                                        \n"
diff --git a/source/row_any.cc b/source/row_any.cc
index 90c6a3f..97ef844 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -35,10 +35,12 @@
     }
 
 #ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
-     0, 4, 7)
 YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
      1, 4, 7)
+#endif  // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I444TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+     0, 4, 7)
 YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
      2, 4, 7)
 YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
@@ -59,7 +61,7 @@
 YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
 YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
 YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
-#endif  // HAS_I422TOARGBROW_SSSE3
+#endif  // HAS_I444TOARGBROW_SSSE3
 #ifdef HAS_I422TOARGBROW_AVX2
 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
 #endif  // HAS_I422TOARGBROW_AVX2
diff --git a/source/row_mips.cc b/source/row_mips.cc
index 4435c55..da7183b 100644
--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@@ -16,7 +16,8 @@
 #endif
 
 // The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
 
 #ifdef HAS_COPYROW_MIPS
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@@ -376,7 +377,9 @@
 
 // MIPS DSPR2 functions
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2)
+    (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
 void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int width) {
   __asm__ __volatile__ (
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 4f5158f..a84e3e4 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -16,39 +16,50 @@
 #endif
 
 // This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-  !defined(__native_client__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
 
 // Read 8 Y, 4 U and 4 V from 422
 #define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
     "vld1.32    {d2[1]}, [%2]!                 \n"
 
 // Read 8 Y, 2 U and 2 V from 422
 #define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
     "vld1.16    {d2[1]}, [%2]!                 \n"                             \
     "vmov.u8    d3, d2                         \n"                             \
     "vzip.u8    d2, d3                         \n"
 
 // Read 8 Y, 8 U and 8 V from 444
 #define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
+    MEMACCESS(2)                                                               \
     "vld1.8     {d3}, [%2]!                    \n"                             \
     "vpaddl.u8  q1, q1                         \n"                             \
     "vrshrn.u16 d2, q1, #1                     \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
 #define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
     "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
     "vuzp.u8    d2, d3                         \n"                             \
@@ -56,7 +67,9 @@
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
     "vuzp.u8    d3, d2                         \n"                             \
@@ -64,6 +77,7 @@
 
 // Read 8 YUY2
 #define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
     "vld2.8     {d0, d2}, [%0]!                \n"                             \
     "vmov.u8    d3, d2                         \n"                             \
     "vuzp.u8    d2, d3                         \n"                             \
@@ -71,6 +85,7 @@
 
 // Read 8 UYVY
 #define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
     "vld2.8     {d2, d3}, [%0]!                \n"                             \
     "vmov.u8    d0, d3                         \n"                             \
     "vmov.u8    d3, d2                         \n"                             \
@@ -114,7 +129,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -125,6 +142,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -145,7 +163,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -156,6 +176,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -176,7 +197,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -187,6 +210,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -207,7 +231,9 @@
                         uint8* dst_bgra,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -219,6 +245,7 @@
     "subs       %4, %4, #8                     \n"
     "vswp.u8    d20, d22                       \n"
     "vmov.u8    d19, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -239,7 +266,9 @@
                         uint8* dst_abgr,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -251,6 +280,7 @@
     "subs       %4, %4, #8                     \n"
     "vswp.u8    d20, d22                       \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -271,7 +301,9 @@
                         uint8* dst_rgba,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -282,6 +314,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d19, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -302,7 +335,9 @@
                          uint8* dst_rgb24,
                          int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -312,6 +347,7 @@
     READYUV422
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
+    MEMACCESS(3)
     "vst3.8     {d20, d21, d22}, [%3]!         \n"
     "bgt        1b                             \n"
     : "+r"(src_y),      // %0
@@ -332,7 +368,9 @@
                        uint8* dst_raw,
                        int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -343,6 +381,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vswp.u8    d20, d22                       \n"
+    MEMACCESS(3)
     "vst3.8     {d20, d21, d22}, [%3]!         \n"
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -375,7 +414,9 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -386,6 +427,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     ARGBTORGB565
+    MEMACCESS(3)
     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -421,7 +463,9 @@
                             uint8* dst_argb1555,
                             int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -433,6 +477,7 @@
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
     ARGBTOARGB1555
+    MEMACCESS(3)
     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -462,7 +507,9 @@
                             uint8* dst_argb4444,
                             int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -475,6 +522,7 @@
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
     ARGBTOARGB4444
+    MEMACCESS(3)
     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -493,7 +541,10 @@
                      uint8* dst_argb,
                      int width) {
   asm volatile (
+    MEMACCESS(3)
+    MEMACCESS(3)
     "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
     "vld1.8     {d25}, [%4]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -504,6 +555,7 @@
     YUV422TORGB
     "subs       %2, %2, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -523,10 +575,12 @@
     ".p2align   2                              \n"
     "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d20}, [%0]!                   \n"
     "vmov       d21, d20                       \n"
     "vmov       d22, d20                       \n"
     "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -542,7 +596,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -553,6 +609,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -571,7 +628,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -582,6 +641,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -600,7 +660,9 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -611,6 +673,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     ARGBTORGB565
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -629,7 +692,9 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -640,6 +705,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     ARGBTORGB565
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -657,7 +723,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
     "vld1.8     {d25}, [%4]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -668,6 +736,7 @@
     YUV422TORGB
     "subs       %2, %2, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_yuy2),  // %0
@@ -684,7 +753,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
     "vld1.8     {d25}, [%4]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -695,6 +766,7 @@
     YUV422TORGB
     "subs       %2, %2, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_uyvy),  // %0
@@ -713,9 +785,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store U
+    MEMACCESS(2)
     "vst1.8     {q1}, [%2]!                    \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
@@ -733,9 +808,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load U
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"  // load V
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
     "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
     "bgt        1b                             \n"
     :
@@ -753,8 +831,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
     "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    MEMACCESS(1)
     "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
     "bgt        1b                             \n"
   : "+r"(src),   // %0
@@ -771,6 +851,7 @@
     "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    MEMACCESS(0)
     "vst1.8    {q0}, [%0]!                     \n"  // store
     "bgt       1b                              \n"
   : "+r"(dst),   // %0
@@ -799,10 +880,13 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
     "subs       %2, #16                        \n"  // 16 pixels per loop.
     "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
@@ -823,10 +907,13 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
     "subs       %3, #8                         \n"  // 8 pixels per loop.
     "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),  // %0
@@ -847,10 +934,13 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
     "subs       %2, #4                         \n"  // 4 pixels per loop.
     "vrev64.32  q0, q0                         \n"
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
@@ -866,8 +956,10 @@
     "vmov.u8    d4, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
     "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -883,9 +975,11 @@
     "vmov.u8    d4, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
     "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_raw),   // %0
@@ -913,9 +1007,11 @@
     "vmov.u8    d3, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     RGB565TOARGB
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb565),  // %0
@@ -959,9 +1055,11 @@
     "vmov.u8    d3, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB1555TOARGB
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb1555),  // %0
@@ -988,9 +1086,11 @@
     "vmov.u8    d3, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB4444TOARGB
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb4444),  // %0
@@ -1005,8 +1105,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
     "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -1021,9 +1123,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
     "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1038,8 +1142,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
@@ -1054,8 +1160,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
@@ -1071,9 +1179,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
     "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
@@ -1090,9 +1201,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
     "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
@@ -1110,12 +1224,16 @@
     "add        %1, %0, %1                     \n"  // stride + src_yuy2
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
     "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
     "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
     "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),     // %0
@@ -1134,12 +1252,16 @@
     "add        %1, %0, %1                     \n"  // stride + src_uyvy
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
     "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
     "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
     "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),     // %0
@@ -1158,10 +1280,13 @@
     // change the stride to row 2 pointer
     "add        %1, %0                         \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
     "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),         // %0
@@ -1179,11 +1304,13 @@
   asm volatile (
     "vmov.u32   d6[0], %3                      \n"  // selector
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
     "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
     "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
     "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
+    MEMACCESS(1)
     "vst1.8     {d4}, [%1]!                    \n"  // store 8.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -1199,8 +1326,10 @@
                            uint32 /*selector*/, int pix) {
   asm volatile (
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -1215,12 +1344,15 @@
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {q2}, [%3]                     \n"  // shuffler
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
     "subs       %2, %2, #4                     \n"  // 4 processed per loop
     "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
     "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store 4.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1238,10 +1370,14 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
     "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
     "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
@@ -1261,10 +1397,14 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
     "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
     "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
     "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
@@ -1281,9 +1421,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGBTORGB565
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1299,9 +1441,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGBTOARGB1555
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1318,9 +1462,11 @@
     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGBTOARGB4444
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
     "bgt        1b                             \n"
   : "+r"(src_argb),      // %0
@@ -1339,6 +1485,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -1346,6 +1493,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1363,12 +1511,14 @@
     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
     "vmlal.u8   q2, d1, d25                    \n"  // G
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1391,6 +1541,7 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -1406,7 +1557,9 @@
     "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
 
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1430,7 +1583,9 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
 
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
@@ -1451,7 +1606,9 @@
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
 
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1476,12 +1633,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
     "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
     "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1509,7 +1670,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1548,12 +1711,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1565,7 +1732,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1592,12 +1761,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1609,7 +1782,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1635,12 +1810,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
     "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
     "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1652,7 +1831,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q3, q2, q1)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -1678,12 +1859,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1695,7 +1880,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -1721,12 +1908,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
     "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
     "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1738,7 +1929,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -1764,12 +1957,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
+    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1781,7 +1978,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -1807,12 +2006,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
+    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1824,7 +2027,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_raw),  // %0
@@ -1851,22 +2056,26 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
     RGB565TOARGB
     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
     RGB565TOARGB
     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
     RGB565TOARGB
     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
     RGB565TOARGB
     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1888,7 +2097,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgb565),  // %0
@@ -1915,22 +2126,26 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1952,7 +2167,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb1555),  // %0
@@ -1979,22 +2196,26 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -2016,7 +2237,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb4444),  // %0
@@ -2038,6 +2261,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     RGB565TOARGB
@@ -2046,6 +2270,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_rgb565),  // %0
@@ -2064,6 +2289,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB1555TOARGB
@@ -2072,6 +2298,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb1555),  // %0
@@ -2090,6 +2317,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB4444TOARGB
@@ -2098,6 +2326,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb4444),  // %0
@@ -2116,6 +2345,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d1, d4                     \n"  // R
@@ -2123,6 +2353,7 @@
     "vmlal.u8   q8, d3, d6                     \n"  // B
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -2141,6 +2372,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d0, d4                     \n"  // R
@@ -2148,6 +2380,7 @@
     "vmlal.u8   q8, d2, d6                     \n"  // B
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -2166,6 +2399,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d1, d4                     \n"  // B
@@ -2173,6 +2407,7 @@
     "vmlal.u8   q8, d3, d6                     \n"  // R
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -2191,6 +2426,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d0, d4                     \n"  // B
@@ -2198,6 +2434,7 @@
     "vmlal.u8   q8, d2, d6                     \n"  // R
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -2216,6 +2453,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d0, d4                     \n"  // B
@@ -2223,6 +2461,7 @@
     "vmlal.u8   q8, d2, d6                     \n"  // R
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_raw),  // %0
@@ -2253,7 +2492,9 @@
     "vdup.8     d4, %4                         \n"
     // General purpose row blend.
   "1:                                          \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q1}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vmull.u8   q13, d0, d4                    \n"
@@ -2262,46 +2503,58 @@
     "vmlal.u8   q14, d3, d5                    \n"
     "vrshrn.u16 d0, q13, #8                    \n"
     "vrshrn.u16 d1, q14, #8                    \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        1b                             \n"
     "b          99f                            \n"
 
     // Blend 25 / 75.
   "25:                                         \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q1}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vrhadd.u8  q0, q1                         \n"
     "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        25b                            \n"
     "b          99f                            \n"
 
     // Blend 50 / 50.
   "50:                                         \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q1}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        50b                            \n"
     "b          99f                            \n"
 
     // Blend 75 / 25.
   "75:                                         \n"
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q0}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vrhadd.u8  q0, q1                         \n"
     "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        75b                            \n"
     "b          99f                            \n"
 
     // Blend 100 / 0 - Copy row unchanged.
   "100:                                        \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
     "subs       %3, %3, #16                    \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        100b                           \n"
 
@@ -2324,7 +2577,9 @@
     "blt        89f                            \n"
     // Blend 8 pixels.
   "8:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q10, d4, d3                    \n"  // db * a
@@ -2338,6 +2593,7 @@
     "vqadd.u8   q0, q0, q2                     \n"  // + sbg
     "vqadd.u8   d2, d2, d6                     \n"  // + sr
     "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
     "bge        8b                             \n"
 
@@ -2347,7 +2603,9 @@
 
     // Blend 1 pixels.
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
     "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
     "subs       %3, %3, #1                     \n"  // 1 processed per loop.
     "vmull.u8   q10, d4, d3                    \n"  // db * a
@@ -2361,6 +2619,7 @@
     "vqadd.u8   q0, q0, q2                     \n"  // + sbg
     "vqadd.u8   d2, d2, d6                     \n"  // + sr
     "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
     "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
     "bge        1b                             \n"
 
@@ -2380,6 +2639,7 @@
   asm volatile (
     // Attenuate 8 pixels.
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q10, d0, d3                    \n"  // b * a
@@ -2388,6 +2648,7 @@
     "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
     "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
     "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -2411,6 +2672,7 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
     "subs       %1, %1, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
@@ -2428,6 +2690,7 @@
     "vqmovn.u16 d0, q0                         \n"
     "vqmovn.u16 d2, q1                         \n"
     "vqmovn.u16 d4, q2                         \n"
+    MEMACCESS(0)
     "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(dst_argb),       // %0
@@ -2452,6 +2715,7 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
@@ -2466,6 +2730,7 @@
     "vqmovn.u16 d22, q11                       \n"
     "vqmovn.u16 d24, q12                       \n"
     "vqmovn.u16 d26, q13                       \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb),       // %0
@@ -2486,6 +2751,7 @@
     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -2494,6 +2760,7 @@
     "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
     "vmov       d1, d0                         \n"  // G
     "vmov       d2, d0                         \n"  // R
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -2521,6 +2788,7 @@
     "vmov.u8    d30, #50                       \n"  // BR coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
     "subs       %1, %1, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
@@ -2535,6 +2803,7 @@
     "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
     "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
     "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
     "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(dst_argb),  // %0
@@ -2551,12 +2820,14 @@
 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
     "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
     "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
@@ -2595,6 +2866,7 @@
     "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
     "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
     "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
     "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -2615,7 +2887,9 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q0, d0, d1                     \n"  // multiply B
@@ -2626,6 +2900,7 @@
     "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
     "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
     "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
@@ -2646,11 +2921,14 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqadd.u8   q0, q0, q2                     \n"  // add B, G
     "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
@@ -2670,11 +2948,14 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
     "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
@@ -2699,12 +2980,15 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqadd.u8   d0, d0, d1                     \n"  // add
     "vmov.u8    d1, d0                         \n"
     "vmov.u8    d2, d0                         \n"
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
@@ -2723,10 +3007,13 @@
     // 16 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop.
     "vqadd.u8   q0, q0, q1                     \n"  // add
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
@@ -2750,10 +3037,13 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
     "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqadd.u8   d1, d0, d2                     \n"  // add
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
@@ -2774,21 +3064,28 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0],%5                  \n"  // top
+    MEMACCESS(0)
     "vld1.8     {d1}, [%0],%6                  \n"
     "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(1)
     "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1],%6                  \n"
     "vsubl.u8   q1, d2, d3                     \n"
     "vadd.s16   q0, q0, q1                     \n"
     "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(2)
     "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    MEMACCESS(2)
     "vld1.8     {d3}, [%2],%6                  \n"
     "subs       %4, %4, #8                     \n"  // 8 pixels
     "vsubl.u8   q1, d2, d3                     \n"
     "vadd.s16   q0, q0, q1                     \n"
     "vabs.s16   q0, q0                         \n"
     "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(3)
     "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
@@ -2811,21 +3108,28 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0],%4                  \n"  // left
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1],%4                  \n"
     "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1],%4                  \n"
     "vsubl.u8   q1, d2, d3                     \n"
     "vadd.s16   q0, q0, q1                     \n"
     "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0],%5                  \n"  // right
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1],%5                  \n"
     "subs       %3, %3, #8                     \n"  // 8 pixels
     "vsubl.u8   q1, d2, d3                     \n"
     "vadd.s16   q0, q0, q1                     \n"
     "vabs.s16   q0, q0                         \n"
     "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
diff --git a/source/row_win.cc b/source/row_win.cc
index f13e4d7..8eb8889 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -10,13 +10,177 @@
 
 #include "libyuv/row.h"
 
+#if defined (_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+
+#define YG 74  /* Y gain: (int8)(1.164 * 64 + 0.5); 6 bit fraction */
+
+#define UB 127  /* U to B: min(127,(int8)(2.018 * 64)) */
+#define UG -25  /* U to G: (int8)(-0.391 * 64 - 0.5) */
+#define UR 0  /* U to R: no contribution */
+
+#define VB 0  /* V to B: no contribution */
+#define VG -52  /* V to G: (int8)(-0.813 * 64 - 0.5) */
+#define VR 102  /* V to R: (int8)(1.596 * 64 + 0.5) */
+
+// Bias: value of the U/V weighted sum at U = V = 128; subtracted to center UV.
+#define BB UB * 128 + VB * 128  /* NOTE: expands unparenthesized */
+#define BG UG * 128 + VG * 128  /* use only as a whole initializer */
+#define BR UR * 128 + VR * 128  /* element, never inside an expression */
+
+static const vec8 kUVToB = {  // Blue weights, U,V byte order.
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {  // Red weights, U,V byte order.
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {  // Green weights, U,V byte order.
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {  // Blue weights, V,U byte order.
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {  // Red weights, V,U byte order.
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {  // Green weights, V,U byte order.
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };  // Y gain.
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };  // Y offset.
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };  // B bias.
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };  // G bias.
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };  // R bias.
+
+// 64 bit
+#if defined(_M_X64)
+
+// I422 to ARGB, 8 pixels per loop. Aligned 16 byte stores to dst_argb.
+__declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+  const __m128i xmm5 = _mm_set1_epi8(-1);  // All bytes 0xff, used as alpha.
+  const __m128i xmm4 = _mm_setzero_si128();  // Zero, used to widen Y.
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;  // V from u_buf.
+
+  while (width > 0) {  // Each pass reads 8 Y, 4 U, 4 V and writes 32 bytes.
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);  // 4 U bytes.
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));  // 4 V bytes.
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);  // Interleave into U,V pairs.
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);  // Duplicate each U,V pair.
+    xmm1 = _mm_load_si128(&xmm0);  // Copy of UV for G.
+    xmm2 = _mm_load_si128(&xmm0);  // Copy of UV for R.
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);  // U * UB + V * VB.
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);  // U * UG + V * VG.
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);  // U * UR + V * VR.
+    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);  // Remove 128 bias.
+    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);  // 8 Y bytes.
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);  // Widen Y to 16 bit.
+    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);  // Y - 16.
+    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);  // (Y - 16) * YG.
+    xmm0 = _mm_adds_epi16(xmm0, xmm3);  // B = UV term + Y term.
+    xmm1 = _mm_adds_epi16(xmm1, xmm3);  // G.
+    xmm2 = _mm_adds_epi16(xmm2, xmm3);  // R.
+    xmm0 = _mm_srai_epi16(xmm0, 6);  // Drop the 6 fraction bits.
+    xmm1 = _mm_srai_epi16(xmm1, 6);
+    xmm2 = _mm_srai_epi16(xmm2, 6);
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);  // Saturate B to 8 bit.
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);  // Saturate G to 8 bit.
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);  // Saturate R to 8 bit.
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);  // B,G byte pairs.
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);  // R,A byte pairs, A = 255.
+    xmm1 = _mm_load_si128(&xmm0);  // Copy of BG.
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);  // B,G,R,A of pixels 0..3.
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);  // B,G,R,A of pixels 4..7.
+
+    _mm_store_si128((__m128i *)dst_argb, xmm0);  // Aligned store.
+    _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
+
+    y_buf += 8;
+    u_buf += 4;  // v_buf is not advanced; V is read at u_buf + offset.
+    dst_argb += 32;
+    width -= 8;
+  }
+}
+
+// I422 to ARGB, 8 pixels per loop. Unaligned 16 byte stores to dst_argb.
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+  const __m128i xmm5 = _mm_set1_epi8(-1);  // All bytes 0xff, used as alpha.
+  const __m128i xmm4 = _mm_setzero_si128();  // Zero, used to widen Y.
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;  // V from u_buf.
+
+  while (width > 0) {  // Each pass reads 8 Y, 4 U, 4 V and writes 32 bytes.
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);  // 4 U bytes.
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));  // 4 V bytes.
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);  // Interleave into U,V pairs.
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);  // Duplicate each U,V pair.
+    xmm1 = _mm_load_si128(&xmm0);  // Copy of UV for G.
+    xmm2 = _mm_load_si128(&xmm0);  // Copy of UV for R.
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);  // U * UB + V * VB.
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);  // U * UG + V * VG.
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);  // U * UR + V * VR.
+    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);  // Remove 128 bias.
+    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);  // 8 Y bytes.
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);  // Widen Y to 16 bit.
+    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);  // Y - 16.
+    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);  // (Y - 16) * YG.
+    xmm0 = _mm_adds_epi16(xmm0, xmm3);  // B = UV term + Y term.
+    xmm1 = _mm_adds_epi16(xmm1, xmm3);  // G.
+    xmm2 = _mm_adds_epi16(xmm2, xmm3);  // R.
+    xmm0 = _mm_srai_epi16(xmm0, 6);  // Drop the 6 fraction bits.
+    xmm1 = _mm_srai_epi16(xmm1, 6);
+    xmm2 = _mm_srai_epi16(xmm2, 6);
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);  // Saturate B to 8 bit.
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);  // Saturate G to 8 bit.
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);  // Saturate R to 8 bit.
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);  // B,G byte pairs.
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);  // R,A byte pairs, A = 255.
+    xmm1 = _mm_load_si128(&xmm0);  // Copy of BG.
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);  // B,G,R,A of pixels 0..3.
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);  // B,G,R,A of pixels 4..7.
+
+    _mm_storeu_si128((__m128i *)dst_argb, xmm0);  // Unaligned store.
+    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
+
+    y_buf += 8;
+    u_buf += 4;  // v_buf is not advanced; V is read at u_buf + offset.
+    dst_argb += 32;
+    width -= 8;
+  }
+}
+// 32 bit
+#else  // defined(_M_X64)
 
 #ifdef HAS_ARGBTOYROW_SSSE3
 
@@ -2030,21 +2194,6 @@
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
 #ifdef HAS_I422TOARGBROW_AVX2
 
 static const lvec8 kUVToB_AVX = {
@@ -2079,10 +2228,10 @@
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
 __declspec(naked) __declspec(align(16))
 void I422ToARGBRow_AVX2(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
   __asm {
     push       esi
     push       edi
@@ -2150,36 +2299,6 @@
 
 #ifdef HAS_I422TOARGBROW_SSSE3
 
-static const vec8 kUVToB = {
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
-  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
-
-static const vec8 kVUToR = {
-  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
-  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 
 // Read 8 UV from 444.
@@ -7276,7 +7395,8 @@
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // defined(_M_X64)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/source/scale_mips.cc b/source/scale_mips.cc
index 4572f45..3eb4f27 100644
--- a/source/scale_mips.cc
+++ b/source/scale_mips.cc
@@ -18,7 +18,8 @@
 
 // This module is for GCC MIPS DSPR2
 #if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
 
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 410364a..1b8a5ba 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -16,8 +16,7 @@
 #endif
 
 // This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-  !defined(__native_client__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
 
 // NEON downscalers with interpolation.
 // Provided by Fritz Koenig
@@ -29,8 +28,10 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     // load even pixels into q0, odd into q1
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -49,7 +50,9 @@
     "add        %1, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
     "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
@@ -58,6 +61,7 @@
     "vpadal.u8  q1, q3                         \n"
     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
     "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -74,8 +78,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
     "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
     "vst1.8     {d2}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -88,16 +94,20 @@
 
 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "add        r4, %0, %3                     \n"
-    "add        r5, r4, %3                     \n"
-    "add        %3, r5, %3                     \n"
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    "vld1.8     {q1}, [r4]!                    \n"
-    "vld1.8     {q2}, [r5]!                    \n"
-    "vld1.8     {q3}, [%3]!                    \n"
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
     "subs       %2, %2, #4                     \n"
     "vpaddl.u8  q0, q0                         \n"
     "vpadal.u8  q0, q1                         \n"
@@ -106,13 +116,17 @@
     "vpaddl.u16 q0, q0                         \n"
     "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
     "vmovn.u16  d0, q0                         \n"
+    MEMACCESS(1)
     "vst1.32    {d0[0]}, [%1]!                 \n"
     "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(src_stride)         // %3
-  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
   );
 }
 
@@ -125,9 +139,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
     "subs       %2, %2, #24                  \n"
     "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
     "vst3.8     {d0, d1, d2}, [%1]!          \n"
     "bgt        1b                           \n"
   : "+r"(src_ptr),          // %0
@@ -146,7 +162,9 @@
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
     "subs         %2, %2, #24                  \n"
 
@@ -183,6 +201,7 @@
     "vmlal.u8     q8, d3, d24                  \n"
     "vqrshrn.u16  d2, q8, #2                   \n"
 
+    MEMACCESS(1)
     "vst3.8       {d0, d1, d2}, [%1]!          \n"
 
     "bgt          1b                           \n"
@@ -203,7 +222,9 @@
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
     "subs         %2, %2, #24                  \n"
     // average src line 0 with src line 1
@@ -223,6 +244,7 @@
     "vmlal.u8     q3, d3, d24                  \n"
     "vqrshrn.u16  d2, q3, #2                   \n"
 
+    MEMACCESS(1)
     "vst3.8       {d0, d1, d2}, [%1]!          \n"
     "bgt          1b                           \n"
   : "+r"(src_ptr),          // %0
@@ -251,14 +273,18 @@
                          ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {q3}, [%3]                     \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
     "subs       %2, %2, #12                    \n"
     "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
     "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    MEMACCESS(1)
     "vst1.8     {d4}, [%1]!                    \n"
+    MEMACCESS(1)
     "vst1.32    {d5[0]}, [%1]!                 \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -273,11 +299,15 @@
 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
   asm volatile (
-    "vld1.16    {q13}, [%4]                    \n"
-    "vld1.8     {q14}, [%5]                    \n"
-    "vld1.8     {q15}, [%6]                    \n"
-    "add        r4, %0, %3, lsl #1             \n"
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -286,9 +316,12 @@
     // d1 = 10 50 11 51 12 52 13 53
     // d2 = 20 60 21 61 22 62 23 63
     // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "vld4.8       {d16, d17, d18, d19}, [r4]!  \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
     "subs         %2, %2, #12                  \n"
 
     // Shuffle the input data around to get align the data
@@ -365,18 +398,20 @@
     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
+    MEMACCESS(1)
     "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
     "vst1.32      {d4[0]}, [%1]!               \n"
     "bgt          1b                           \n"
   : "+r"(src_ptr),          // %0
     "+r"(dst_ptr),          // %1
     "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  : "r"(&kMult38_Div6),     // %4
-    "r"(&kShuf38_2),        // %5
-    "r"(&kMult38_Div9)      // %6
-  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
-    "q13", "q14", "q15", "memory", "cc"
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
   );
 }
 
@@ -385,7 +420,9 @@
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.16    {q13}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {q14}, [%5]                    \n"
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
@@ -395,7 +432,9 @@
     // d1 = 10 50 11 51 12 52 13 53
     // d2 = 20 60 21 61 22 62 23 63
     // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
     "subs         %2, %2, #12                  \n"
 
@@ -462,7 +501,9 @@
     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
+    MEMACCESS(1)
     "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
     "vst1.32      {d4[0]}, [%1]!               \n"
     "bgt          1b                           \n"
   : "+r"(src_ptr),       // %0
@@ -495,7 +536,9 @@
     "vdup.8       d4, %4                       \n"
     // General purpose row blend.
   "1:                                          \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q1}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vmull.u8     q13, d0, d4                  \n"
@@ -504,50 +547,63 @@
     "vmlal.u8     q14, d3, d5                  \n"
     "vrshrn.u16   d0, q13, #8                  \n"
     "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          1b                           \n"
     "b            99f                          \n"
 
     // Blend 25 / 75.
   "25:                                         \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q1}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
     "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          25b                          \n"
     "b            99f                          \n"
 
     // Blend 50 / 50.
   "50:                                         \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q1}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          50b                          \n"
     "b            99f                          \n"
 
     // Blend 75 / 25.
   "75:                                         \n"
+    MEMACCESS(1)
     "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q0}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
     "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          75b                          \n"
     "b            99f                          \n"
 
     // Blend 100 / 0 - Copy row unchanged.
   "100:                                        \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
     "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          100b                         \n"
 
   "99:                                         \n"
+    MEMACCESS(0)
     "vst1.8       {d1[7]}, [%0]                \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
@@ -565,10 +621,14 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     // load even pixels into q0, odd into q1
+    MEMACCESS(0)
     "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
     "vld2.32    {q2, q3}, [%0]!                \n"
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
     "vst1.8     {q3}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -586,14 +646,18 @@
     "add        %1, %1, %0                     \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
     "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
@@ -603,6 +667,7 @@
     "vrshrn.u16 d1, q1, #2                     \n"
     "vrshrn.u16 d2, q2, #2                     \n"
     "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -622,11 +687,16 @@
     "mov        r12, %3, lsl #2                \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.32    {d0[0]}, [%0], r12             \n"
+    MEMACCESS(0)
     "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
     "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
     "vld1.32    {d1[1]}, [%0], r12             \n"
     "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_argb),    // %0
@@ -647,13 +717,21 @@
     "add        %1, %1, %0                     \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
     "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
     "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
     "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
     "vld1.8     {d7}, [%1], r12                \n"
     "vaddl.u8   q0, d0, d1                     \n"
     "vaddl.u8   q1, d2, d3                     \n"
@@ -666,6 +744,7 @@
     "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
     "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
     "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_argb),    // %0
diff --git a/tools/sanitizer_options.gyp b/tools/sanitizer_options.gyp
new file mode 100644
index 0000000..ea453d3
--- /dev/null
+++ b/tools/sanitizer_options.gyp
@@ -0,0 +1,59 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a similar target to the one in Chromium's base.gyp. It's needed to get
+# the same sanitizer settings as Chromium uses (i.e. ASan, LSan, TSan...).
+{
+  'targets': [
+    {
+      'target_name': 'sanitizer_options',
+      'type': 'static_library',
+      'toolsets': ['host', 'target'],
+      'variables': {
+         # Every target is going to depend on sanitizer_options, so allow
+         # this one to depend on itself.
+         'prune_self_dependency': 1,
+         # Do not let 'none' targets depend on this one, they don't need to.
+         'link_dependency': 1,
+       },
+      'sources': [
+        'sanitizer_options/sanitizer_options.cc',
+      ],
+      'include_dirs': [
+        '<(DEPTH)',
+      ],
+      # Some targets may want to opt out of ASan, TSan and MSan and link
+      # without the corresponding runtime libraries. We drop the libc++
+      # dependency and omit the compiler flags to avoid bringing instrumented
+      # code to those targets.
+      'conditions': [
+        ['use_custom_libcxx==1', {
+          'dependencies!': [
+            '<(DEPTH)/third_party/libc++/libc++.gyp:libcxx_proxy',
+          ],
+        }],
+        ['tsan==1', {
+          'sources': [
+            'tsan_suppressions/tsan_suppressions.cc',
+          ],
+        }],
+      ],
+      'cflags!': [
+        '-fsanitize=address',
+        '-fsanitize=thread',
+        '-fsanitize=memory',
+        '-fsanitize-memory-track-origins',
+      ],
+      'direct_dependent_settings': {
+        'ldflags': [
+          '-Wl,-u_sanitizer_options_link_helper',
+        ],
+      },
+    },
+  ],  # targets
+}
diff --git a/tools/supplement.gypi b/tools/supplement.gypi
new file mode 100644
index 0000000..c1cff3f
--- /dev/null
+++ b/tools/supplement.gypi
@@ -0,0 +1,31 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Supplement file for libyuv. To be processed, this file needs to be located in
+# the first level of directories below the gyp_libyuv file.
+
+# This is needed to work around the otherwise failing include of base.gyp in
+# Chromium's common.gypi when a sanitizer tool is used.
+{
+  'variables': {
+    'use_sanitizer_options': 0,
+  },
+  'target_defaults': {
+    'conditions': [
+      # Add default sanitizer options similar to Chromium. This is needed to get
+      # the sanitizer options that have LeakSanitizer disabled by default.
+      # Otherwise yasm will throw leak errors during compile when
+      # GYP_DEFINES="asan=1".
+      ['OS=="linux" and (chromeos==0 or target_arch!="ia32")', {
+        'dependencies': [
+          '<(DEPTH)/tools/sanitizer_options.gyp:sanitizer_options',
+        ],
+      }],
+    ],
+  },
+}
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 0a9d95f..9948074 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -24,6 +24,7 @@
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
+#include "libyuv/video_common.h"
 #include "../unit_test/unit_test.h"
 
 #if defined(_MSC_VER)
@@ -117,7 +118,7 @@
       }                                                                        \
     }                                                                          \
   }                                                                            \
-  EXPECT_LE(max_diff, 0);                                                      \
+  EXPECT_EQ(0, max_diff);                                                      \
   for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
     for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
       int abs_diff =                                                           \
@@ -811,13 +812,13 @@
 
 #define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)        \
     TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
-                   benchmark_width_ - 4, _Any, +, 0)                           \
+                     benchmark_width_ - 4, _Any, +, 0)                         \
     TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
-                   benchmark_width_, _Unaligned, +, 1)                         \
+                     benchmark_width_, _Unaligned, +, 1)                       \
     TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
-                   benchmark_width_, _Invert, -, 0)                            \
+                     benchmark_width_, _Invert, -, 0)                          \
     TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
-                   benchmark_width_, _Opt, +, 0)
+                     benchmark_width_, _Opt, +, 0)
 
 TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
 TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
@@ -963,6 +964,63 @@
 TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
 
+#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                          \
+                 W1280, N, NEG, OFF)                                           \
+TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) {                                   \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  align_buffer_64(src_argb, kStrideA * kHeightA + OFF);                        \
+  align_buffer_64(dst_argb_c, kStrideA * kHeightA);                            \
+  align_buffer_64(dst_argb_opt, kStrideA * kHeightA);                          \
+  srandom(time(NULL));                                                         \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    src_argb[i + OFF] = (random() & 0xff);                                     \
+  }                                                                            \
+  memset(dst_argb_c, 1, kStrideA * kHeightA);                                  \
+  memset(dst_argb_opt, 101, kStrideA * kHeightA);                              \
+  MaskCpuFlags(0);                                                             \
+  FMT_ATOB(src_argb + OFF, kStrideA,                                           \
+           dst_argb_c, kStrideA,                                               \
+           kWidth, NEG kHeight);                                               \
+  MaskCpuFlags(-1);                                                            \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_ATOB(src_argb + OFF, kStrideA,                                         \
+             dst_argb_opt, kStrideA,                                           \
+             kWidth, NEG kHeight);                                             \
+  }                                                                            \
+  MaskCpuFlags(0);                                                             \
+  FMT_ATOB(dst_argb_c, kStrideA,                                               \
+           dst_argb_c, kStrideA,                                               \
+           kWidth, NEG kHeight);                                               \
+  MaskCpuFlags(-1);                                                            \
+  FMT_ATOB(dst_argb_opt, kStrideA,                                             \
+           dst_argb_opt, kStrideA,                                             \
+           kWidth, NEG kHeight);                                               \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                             \
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                                 \
+  }                                                                            \
+  free_aligned_buffer_64(src_argb);                                            \
+  free_aligned_buffer_64(dst_argb_c);                                          \
+  free_aligned_buffer_64(dst_argb_opt);                                        \
+}
+
+#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A)                           \
+    TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                              \
+             benchmark_width_ - 4, _Any, +, 0)                                 \
+    TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                              \
+             benchmark_width_, _Unaligned, +, 1)                               \
+    TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                              \
+             benchmark_width_, _Opt, +, 0)
+
+TESTSYM(ARGBToARGB, 4, 4, 1)
+TESTSYM(ARGBToBGRA, 4, 4, 1)
+TESTSYM(ARGBToABGR, 4, 4, 1)
+TESTSYM(BGRAToARGB, 4, 4, 1)
+TESTSYM(ABGRToARGB, 4, 4, 1)
+
 TEST_F(libyuvTest, Test565) {
   SIMD_ALIGNED(uint8 orig_pixels[256][4]);
   SIMD_ALIGNED(uint8 pixels565[256][2]);
@@ -1099,4 +1157,108 @@
 
 #endif  // HAVE_JPEG
 
+// Checks that cropping NV12 via plane offsets passed to NV12ToI420 matches
+// ConvertToI420 given the same vertical crop (crop_y rows off top and bottom).
+TEST_F(libyuvTest, CropNV12) {
+  const int SUBSAMP_X = 2;
+  const int SUBSAMP_Y = 2;
+  const int kWidth = benchmark_width_;
+  const int kHeight = benchmark_height_;
+  const int crop_y =
+    (benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2;
+  const int kDestWidth = benchmark_width_;
+  const int kDestHeight = benchmark_height_ - crop_y * 2;
+  const int kHalfWidth = SUBSAMPLE(kWidth, SUBSAMP_X);
+  const int kHalfHeight = SUBSAMPLE(kHeight, SUBSAMP_Y);
+  const int kDestHalfWidth = SUBSAMPLE(kDestWidth, SUBSAMP_X);
+  const int kDestHalfHeight = SUBSAMPLE(kDestHeight, SUBSAMP_Y);
+  const int kDestYSize = kDestWidth * kDestHeight;
+  const int kDestUVSize = kDestHalfWidth * kDestHalfHeight;
+  // Source is one buffer: Y plane, then interleaved UV (2 bytes per sample).
+  const int sample_size = kWidth * kHeight + kHalfWidth * kHalfHeight * 2;
+  align_buffer_64(src_y, sample_size);
+  uint8* src_uv = src_y + kWidth * kHeight;
+
+  align_buffer_64(dst_y, kDestYSize);
+  align_buffer_64(dst_u, kDestUVSize);
+  align_buffer_64(dst_v, kDestUVSize);
+
+  align_buffer_64(dst_y_2, kDestYSize);
+  align_buffer_64(dst_u_2, kDestUVSize);
+  align_buffer_64(dst_v_2, kDestUVSize);
+
+  srandom(time(NULL));
+  for (int i = 0; i < kHeight; ++i)
+    for (int j = 0; j < kWidth; ++j)
+      src_y[(i * kWidth) + j] = (random() & 0xff);
+  // Each UV row holds kHalfWidth interleaved U,V pairs, so the row stride is
+  // kHalfWidth * 2 bytes; a stride of kHalfWidth would leave rows overlapping
+  // and the tail of the plane unwritten.
+  for (int i = 0; i < kHalfHeight; ++i) {
+    for (int j = 0; j < kHalfWidth; ++j) {
+      src_uv[(i * kHalfWidth * 2) + j * 2 + 0] = (random() & 0xff);
+      src_uv[(i * kHalfWidth * 2) + j * 2 + 1] = (random() & 0xff);
+    }
+  }
+  // Pre-fill the destination planes with a distinct constant per plane.
+  memset(dst_y, 1, kDestYSize);
+  memset(dst_u, 2, kDestUVSize);
+  memset(dst_v, 3, kDestUVSize);
+  memset(dst_y_2, 1, kDestYSize);
+  memset(dst_u_2, 2, kDestUVSize);
+  memset(dst_v_2, 3, kDestUVSize);
+
+  // Reference: crop by hand, advancing the Y pointer by crop_y rows and the
+  // UV pointer by crop_y / 2 rows before converting.
+  NV12ToI420(src_y + crop_y * kWidth, kWidth,
+             src_uv + (crop_y / 2) * kWidth, kWidth,
+             dst_y, kDestWidth,
+             dst_u, kDestHalfWidth,
+             dst_v, kDestHalfWidth,
+             kDestWidth, kDestHeight);
+
+  // Under test: hand ConvertToI420 the whole sample plus 0 and crop_y.
+  ConvertToI420(src_y, sample_size,
+                dst_y_2, kDestWidth,
+                dst_u_2, kDestHalfWidth,
+                dst_v_2, kDestHalfWidth,
+                0, crop_y,
+                kWidth, kHeight,
+                kDestWidth, kDestHeight,
+                libyuv::kRotate0, libyuv::FOURCC_NV12);
+
+  for (int i = 0; i < kDestHeight; ++i) {
+    for (int j = 0; j < kDestWidth; ++j) {
+      EXPECT_EQ(dst_y[i * kDestWidth + j], dst_y_2[i * kDestWidth + j]);
+    }
+  }
+  for (int i = 0; i < kDestHalfHeight; ++i) {
+    for (int j = 0; j < kDestHalfWidth; ++j) {
+      EXPECT_EQ(dst_u[i * kDestHalfWidth + j],
+                dst_u_2[i * kDestHalfWidth + j]);
+    }
+  }
+  for (int i = 0; i < kDestHalfHeight; ++i) {
+    for (int j = 0; j < kDestHalfWidth; ++j) {
+      EXPECT_EQ(dst_v[i * kDestHalfWidth + j],
+                dst_v_2[i * kDestHalfWidth + j]);
+    }
+  }
+  free_aligned_buffer_64(dst_y);
+  free_aligned_buffer_64(dst_u);
+  free_aligned_buffer_64(dst_v);
+  free_aligned_buffer_64(dst_y_2);
+  free_aligned_buffer_64(dst_u_2);
+  free_aligned_buffer_64(dst_v_2);
+  free_aligned_buffer_64(src_y);
+}
+
+TEST_F(libyuvTest, HaveJPEG) {  // Reports whether HAVE_JPEG was defined.
+#ifdef HAVE_JPEG
+  printf("JPEG enabled.\n");
+#else
+  printf("JPEG disabled.\n");
+#endif
+}
+
 }  // namespace libyuv
diff --git a/util/Makefile b/util/Makefile
index be6de35..6044d2a 100644
--- a/util/Makefile
+++ b/util/Makefile
@@ -1,6 +1,6 @@
-psnr: psnr.cc ssim.cc psnr_main.cc

-ifeq ($(CXX),icl)

-	$(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc

-else

-	$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all

-endif

+psnr: psnr.cc ssim.cc psnr_main.cc
+ifeq ($(CXX),icl)
+	$(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+else
+	$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
+endif
diff --git a/util/psnr.cc b/util/psnr.cc
index e8fd16a..f3cc0cf 100644
--- a/util/psnr.cc
+++ b/util/psnr.cc
@@ -10,8 +10,6 @@
 
 #include "./psnr.h"  // NOLINT
 
-#include <math.h>
-
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -34,13 +32,8 @@
 #endif  // __LP64__
 #endif  // _MSC_VER
 
-// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
-double ComputePSNR(double sse, double size) {
-  const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
-  if (sse <= kMINSSE)
-    sse = kMINSSE;  // Produces max PSNR of 128
-  return 10.0 * log10(65025.0 * size / sse);
-}
+// libyuv provides this function when the library is linked for JPEG support.
+#if !defined(HAVE_JPEG)
 
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
 #define HAS_SUMSQUAREERROR_NEON
@@ -241,6 +234,16 @@
   }
   return static_cast<double>(sse);
 }
+#endif
+
+// PSNR in dB: 10 * log10(peak^2 * size / sse), with peak = 255.
+// An sse at or below the floor (e.g. 0, a perfect match) yields kMaxPSNR.
+double ComputePSNR(double sse, double size) {
+  const double peak_energy = 255.0 * 255.0 * size;
+  const double min_sse = peak_energy / pow(10.0, kMaxPSNR / 10.0);
+  const double clamped_sse = (sse <= min_sse) ? min_sse : sse;
+  return 10.0 * log10(peak_energy / clamped_sse);
+}
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/util/psnr.h b/util/psnr.h
index 370337a..0816b97 100644
--- a/util/psnr.h
+++ b/util/psnr.h
@@ -13,6 +13,8 @@
 #ifndef UTIL_PSNR_H_  // NOLINT
 #define UTIL_PSNR_H_
 
+#include <math.h>  // For log10()
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -24,13 +26,17 @@
 
 static const double kMaxPSNR = 128.0;
 
-// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse).
-// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
-double ComputePSNR(double sse, double size);
-
+// libyuv provides this function when the library is linked for JPEG support.
+// TODO(fbarchard): make psnr lib compatible subset of libyuv.
+#if !defined(HAVE_JPEG)
 // Computer Sum of Squared Error (SSE).
 // Pass this to ComputePSNR for final result.
 double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/util/psnr_main.cc b/util/psnr_main.cc
index 9cee5f8..0518ab8 100644
--- a/util/psnr_main.cc
+++ b/util/psnr_main.cc
@@ -32,6 +32,10 @@
 
 #include "./psnr.h"
 #include "./ssim.h"
+#ifdef HAVE_JPEG
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#endif
 
 struct metric {
   double y, u, v, all;
@@ -75,6 +79,29 @@
       }
     }
   }
+
+#ifdef HAVE_JPEG
+  // Try parsing file as a jpeg; a failed ftell is treated as an empty file.
+  FILE* const file_org = fopen(name, "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", name);
+    return false;
+  }
+  fseek(file_org, 0, SEEK_END);
+  const long file_size = ftell(file_org);  // NOLINT
+  fseek(file_org, 0, SEEK_SET);
+  const size_t total_size = file_size > 0 ? static_cast<size_t>(file_size) : 0;
+  uint8* const ch_org = new uint8[total_size];
+  memset(ch_org, 0, total_size);
+  size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+  fclose(file_org);
+  if (bytes_org == total_size &&
+      0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
+    delete[] ch_org;
+    return true;
+  }
+  delete[] ch_org;
+#endif  // HAVE_JPEG
   return false;
 }
 
@@ -94,6 +121,9 @@
 
 void PrintHelp(const char * program) {
   printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
+#ifdef HAVE_JPEG
+  printf("jpeg or raw YUV 420 supported.\n");
+#endif
   printf("options:\n");
   printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
          "sequences have the\n");
@@ -215,9 +245,18 @@
   const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
   const uint8* const v_rec = ch_rec + y_size + uv_size;
   if (do_psnr) {
+#ifdef HAVE_JPEG
+    double y_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
+    double u_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
+    double v_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
+#else
     double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
     double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
     double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
+#endif
     const double total_err = y_err + u_err + v_err;
     cur_distortion_psnr->global_y += y_err;
     cur_distortion_psnr->global_u += u_err;
@@ -230,10 +269,10 @@
                                        static_cast<double>(total_size));
   } else {
     distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
-    distorted_frame->u = CalcSSIM(u_org, u_rec, image_width / 2,
-                                 image_height / 2);
-    distorted_frame->v = CalcSSIM(v_org, v_rec, image_width / 2,
-                                 image_height / 2);
+    distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
+                                 (image_height + 1) / 2);
+    distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
+                                 (image_height + 1) / 2);
     distorted_frame->all =
       (distorted_frame->y + distorted_frame->u + distorted_frame->v)
         / total_size;
@@ -386,14 +425,62 @@
       break;
 
     size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
-    if (bytes_org < total_size)
+    if (bytes_org < total_size) {
+#ifdef HAVE_JPEG
+      // Try parsing file as a jpeg.
+      uint8* const ch_jpeg = new uint8[bytes_org];
+      memcpy(ch_jpeg, ch_org, bytes_org);
+      memset(ch_org, 0, total_size);
+
+      if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
+                                  ch_org,
+                                  image_width,
+                                  ch_org + y_size,
+                                  (image_width + 1) / 2,
+                                  ch_org + y_size + uv_size,
+                                  (image_width + 1) / 2,
+                                  image_width,
+                                  image_height,
+                                  image_width,
+                                  image_height)) {
+        delete[] ch_jpeg;
+        break;
+      }
+      delete[] ch_jpeg;
+#else
       break;
+#endif  // HAVE_JPEG
+    }
 
     for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
       size_t bytes_rec = fread(ch_rec, sizeof(uint8),
                                total_size, file_rec[cur_rec]);
-      if (bytes_rec < total_size)
+      if (bytes_rec < total_size) {
+#ifdef HAVE_JPEG
+        // Try parsing file as a jpeg.
+        uint8* const ch_jpeg = new uint8[bytes_rec];
+        memcpy(ch_jpeg, ch_rec, bytes_rec);
+        memset(ch_rec, 0, total_size);
+
+        if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
+                                    ch_rec,
+                                    image_width,
+                                    ch_rec + y_size,
+                                    (image_width + 1) / 2,
+                                    ch_rec + y_size + uv_size,
+                                    (image_width + 1) / 2,
+                                    image_width,
+                                    image_height,
+                                    image_width,
+                                    image_height)) {
+          delete[] ch_jpeg;
+          break;
+        }
+        delete[] ch_jpeg;
+#else
         break;
+#endif  // HAVE_JPEG
+      }
 
       if (verbose) {
         printf("%5d", number_of_frames);
diff --git a/util/ssim.cc b/util/ssim.cc
index d07889a..5a6399b 100644
--- a/util/ssim.cc
+++ b/util/ssim.cc
@@ -10,7 +10,6 @@
 
 #include "../util/ssim.h"  // NOLINT
 
-#include <math.h>
 #include <string.h>
 
 #ifdef __cplusplus
diff --git a/util/ssim.h b/util/ssim.h
index 40120b4..430eb71 100644
--- a/util/ssim.h
+++ b/util/ssim.h
@@ -13,6 +13,8 @@
 #ifndef UTIL_SSIM_H_  // NOLINT
 #define UTIL_SSIM_H_
 
+#include <math.h>  // For log10()
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -25,7 +27,6 @@
 double CalcSSIM(const uint8* org, const uint8* rec,
                 const int image_width, const int image_height);
 
-// does -10.0 * log10(1.0 - ssim)
 double CalcLSSIM(double ssim);
 
 #ifdef __cplusplus