Initial population of libyuv

Change-Id: I46a6a1525aebaba979b0f2ca5b58be2004901410
diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h
new file mode 100644
index 0000000..9a32864
--- /dev/null
+++ b/files/source/conversion_tables.h
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/**************************************************************
+*  conversion_tables.h
+*
+*    Pre-compiled definitions of the conversion equations: YUV -> RGB.
+*
+***************************************************************/
+
+#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_
+#define LIBYUV_SOURCE_CONVERSION_TABLES_H_
+
+namespace libyuv {
+
+/******************************************************************************
+* YUV TO RGB approximation
+*
+*  R = clip( (298 * (Y - 16)                   + 409 * (V - 128) + 128 ) >> 8 )
+*  G = clip( (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128 ) >> 8 )
+*  B = clip( (298 * (Y - 16) + 516 * (U - 128)                   + 128 ) >> 8 )
+*******************************************************************************/
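+// Worked example: a mid-gray input (Y, U, V) = (128, 128, 128) gives
+// R = G = B = (298 * (128 - 16) + 128) >> 8 = 130, since the U and V
+// terms vanish at 128.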
+
+    #define Yc(i)  static_cast<int>( 298  * ((i) - 16))   // Y contribution
+    #define Ucg(i) static_cast<int>( -100 * ((i) - 128))  // U contribution to G
+    #define Ucb(i) static_cast<int>( 516  * ((i) - 128))  // U contribution to B
+    #define Vcr(i) static_cast<int>( 409  * ((i) - 128))  // V contribution to R
+    #define Vcg(i) static_cast<int>( -208 * ((i) - 128))  // V contribution to G
+
+    static const int mapYc[256] = {
+        Yc(0),Yc(1),Yc(2),Yc(3),Yc(4),Yc(5),Yc(6),Yc(7),Yc(8),Yc(9),
+        Yc(10),Yc(11),Yc(12),Yc(13),Yc(14),Yc(15),Yc(16),Yc(17),Yc(18),Yc(19),
+        Yc(20),Yc(21),Yc(22),Yc(23),Yc(24),Yc(25),Yc(26),Yc(27),Yc(28),Yc(29),
+        Yc(30),Yc(31),Yc(32),Yc(33),Yc(34),Yc(35),Yc(36),Yc(37),Yc(38),Yc(39),
+        Yc(40),Yc(41),Yc(42),Yc(43),Yc(44),Yc(45),Yc(46),Yc(47),Yc(48),Yc(49),
+        Yc(50),Yc(51),Yc(52),Yc(53),Yc(54),Yc(55),Yc(56),Yc(57),Yc(58),Yc(59),
+        Yc(60),Yc(61),Yc(62),Yc(63),Yc(64),Yc(65),Yc(66),Yc(67),Yc(68),Yc(69),
+        Yc(70),Yc(71),Yc(72),Yc(73),Yc(74),Yc(75),Yc(76),Yc(77),Yc(78),Yc(79),
+        Yc(80),Yc(81),Yc(82),Yc(83),Yc(84),Yc(85),Yc(86),Yc(87),Yc(88),Yc(89),
+        Yc(90),Yc(91),Yc(92),Yc(93),Yc(94),Yc(95),Yc(96),Yc(97),Yc(98),Yc(99),
+        Yc(100),Yc(101),Yc(102),Yc(103),Yc(104),Yc(105),Yc(106),Yc(107),Yc(108),
+        Yc(109),Yc(110),Yc(111),Yc(112),Yc(113),Yc(114),Yc(115),Yc(116),Yc(117),
+        Yc(118),Yc(119),Yc(120),Yc(121),Yc(122),Yc(123),Yc(124),Yc(125),Yc(126),
+        Yc(127),Yc(128),Yc(129),Yc(130),Yc(131),Yc(132),Yc(133),Yc(134),Yc(135),
+        Yc(136),Yc(137),Yc(138),Yc(139),Yc(140),Yc(141),Yc(142),Yc(143),Yc(144),
+        Yc(145),Yc(146),Yc(147),Yc(148),Yc(149),Yc(150),Yc(151),Yc(152),Yc(153),
+        Yc(154),Yc(155),Yc(156),Yc(157),Yc(158),Yc(159),Yc(160),Yc(161),Yc(162),
+        Yc(163),Yc(164),Yc(165),Yc(166),Yc(167),Yc(168),Yc(169),Yc(170),Yc(171),
+        Yc(172),Yc(173),Yc(174),Yc(175),Yc(176),Yc(177),Yc(178),Yc(179),Yc(180),
+        Yc(181),Yc(182),Yc(183),Yc(184),Yc(185),Yc(186),Yc(187),Yc(188),Yc(189),
+        Yc(190),Yc(191),Yc(192),Yc(193),Yc(194),Yc(195),Yc(196),Yc(197),Yc(198),
+        Yc(199),Yc(200),Yc(201),Yc(202),Yc(203),Yc(204),Yc(205),Yc(206),Yc(207),
+        Yc(208),Yc(209),Yc(210),Yc(211),Yc(212),Yc(213),Yc(214),Yc(215),Yc(216),
+        Yc(217),Yc(218),Yc(219),Yc(220),Yc(221),Yc(222),Yc(223),Yc(224),Yc(225),
+        Yc(226),Yc(227),Yc(228),Yc(229),Yc(230),Yc(231),Yc(232),Yc(233),Yc(234),
+        Yc(235),Yc(236),Yc(237),Yc(238),Yc(239),Yc(240),Yc(241),Yc(242),Yc(243),
+        Yc(244),Yc(245),Yc(246),Yc(247),Yc(248),Yc(249),Yc(250),Yc(251),Yc(252),
+        Yc(253),Yc(254),Yc(255)};
+
+    static const int mapUcg[256] = {
+        Ucg(0),Ucg(1),Ucg(2),Ucg(3),Ucg(4),Ucg(5),Ucg(6),Ucg(7),Ucg(8),Ucg(9),
+        Ucg(10),Ucg(11),Ucg(12),Ucg(13),Ucg(14),Ucg(15),Ucg(16),Ucg(17),Ucg(18),
+        Ucg(19),Ucg(20),Ucg(21),Ucg(22),Ucg(23),Ucg(24),Ucg(25),Ucg(26),Ucg(27),
+        Ucg(28),Ucg(29),Ucg(30),Ucg(31),Ucg(32),Ucg(33),Ucg(34),Ucg(35),Ucg(36),
+        Ucg(37),Ucg(38),Ucg(39),Ucg(40),Ucg(41),Ucg(42),Ucg(43),Ucg(44),Ucg(45),
+        Ucg(46),Ucg(47),Ucg(48),Ucg(49),Ucg(50),Ucg(51),Ucg(52),Ucg(53),Ucg(54),
+        Ucg(55),Ucg(56),Ucg(57),Ucg(58),Ucg(59),Ucg(60),Ucg(61),Ucg(62),Ucg(63),
+        Ucg(64),Ucg(65),Ucg(66),Ucg(67),Ucg(68),Ucg(69),Ucg(70),Ucg(71),Ucg(72),
+        Ucg(73),Ucg(74),Ucg(75),Ucg(76),Ucg(77),Ucg(78),Ucg(79),Ucg(80),Ucg(81),
+        Ucg(82),Ucg(83),Ucg(84),Ucg(85),Ucg(86),Ucg(87),Ucg(88),Ucg(89),Ucg(90),
+        Ucg(91),Ucg(92),Ucg(93),Ucg(94),Ucg(95),Ucg(96),Ucg(97),Ucg(98),Ucg(99),
+        Ucg(100),Ucg(101),Ucg(102),Ucg(103),Ucg(104),Ucg(105),Ucg(106),Ucg(107),
+        Ucg(108),Ucg(109),Ucg(110),Ucg(111),Ucg(112),Ucg(113),Ucg(114),Ucg(115),
+        Ucg(116),Ucg(117),Ucg(118),Ucg(119),Ucg(120),Ucg(121),Ucg(122),Ucg(123),
+        Ucg(124),Ucg(125),Ucg(126),Ucg(127),Ucg(128),Ucg(129),Ucg(130),Ucg(131),
+        Ucg(132),Ucg(133),Ucg(134),Ucg(135),Ucg(136),Ucg(137),Ucg(138),Ucg(139),
+        Ucg(140),Ucg(141),Ucg(142),Ucg(143),Ucg(144),Ucg(145),Ucg(146),Ucg(147),
+        Ucg(148),Ucg(149),Ucg(150),Ucg(151),Ucg(152),Ucg(153),Ucg(154),Ucg(155),
+        Ucg(156),Ucg(157),Ucg(158),Ucg(159),Ucg(160),Ucg(161),Ucg(162),Ucg(163),
+        Ucg(164),Ucg(165),Ucg(166),Ucg(167),Ucg(168),Ucg(169),Ucg(170),Ucg(171),
+        Ucg(172),Ucg(173),Ucg(174),Ucg(175),Ucg(176),Ucg(177),Ucg(178),Ucg(179),
+        Ucg(180),Ucg(181),Ucg(182),Ucg(183),Ucg(184),Ucg(185),Ucg(186),Ucg(187),
+        Ucg(188),Ucg(189),Ucg(190),Ucg(191),Ucg(192),Ucg(193),Ucg(194),Ucg(195),
+        Ucg(196),Ucg(197),Ucg(198),Ucg(199),Ucg(200),Ucg(201),Ucg(202),Ucg(203),
+        Ucg(204),Ucg(205),Ucg(206),Ucg(207),Ucg(208),Ucg(209),Ucg(210),Ucg(211),
+        Ucg(212),Ucg(213),Ucg(214),Ucg(215),Ucg(216),Ucg(217),Ucg(218),Ucg(219),
+        Ucg(220),Ucg(221),Ucg(222),Ucg(223),Ucg(224),Ucg(225),Ucg(226),Ucg(227),
+        Ucg(228),Ucg(229),Ucg(230),Ucg(231),Ucg(232),Ucg(233),Ucg(234),Ucg(235),
+        Ucg(236),Ucg(237),Ucg(238),Ucg(239),Ucg(240),Ucg(241),Ucg(242),Ucg(243),
+        Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251),
+        Ucg(252),Ucg(253),Ucg(254),Ucg(255)};
+
+    static const int mapUcb[256] = {
+        Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9),
+        Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18),
+        Ucb(19),Ucb(20),Ucb(21),Ucb(22),Ucb(23),Ucb(24),Ucb(25),Ucb(26),Ucb(27),
+        Ucb(28),Ucb(29),Ucb(30),Ucb(31),Ucb(32),Ucb(33),Ucb(34),Ucb(35),Ucb(36),
+        Ucb(37),Ucb(38),Ucb(39),Ucb(40),Ucb(41),Ucb(42),Ucb(43),Ucb(44),Ucb(45),
+        Ucb(46),Ucb(47),Ucb(48),Ucb(49),Ucb(50),Ucb(51),Ucb(52),Ucb(53),Ucb(54),
+        Ucb(55),Ucb(56),Ucb(57),Ucb(58),Ucb(59),Ucb(60),Ucb(61),Ucb(62),Ucb(63),
+        Ucb(64),Ucb(65),Ucb(66),Ucb(67),Ucb(68),Ucb(69),Ucb(70),Ucb(71),Ucb(72),
+        Ucb(73),Ucb(74),Ucb(75),Ucb(76),Ucb(77),Ucb(78),Ucb(79),Ucb(80),Ucb(81),
+        Ucb(82),Ucb(83),Ucb(84),Ucb(85),Ucb(86),Ucb(87),Ucb(88),Ucb(89),Ucb(90),
+        Ucb(91),Ucb(92),Ucb(93),Ucb(94),Ucb(95),Ucb(96),Ucb(97),Ucb(98),Ucb(99),
+        Ucb(100),Ucb(101),Ucb(102),Ucb(103),Ucb(104),Ucb(105),Ucb(106),Ucb(107),
+        Ucb(108),Ucb(109),Ucb(110),Ucb(111),Ucb(112),Ucb(113),Ucb(114),Ucb(115),
+        Ucb(116),Ucb(117),Ucb(118),Ucb(119),Ucb(120),Ucb(121),Ucb(122),Ucb(123),
+        Ucb(124),Ucb(125),Ucb(126),Ucb(127),Ucb(128),Ucb(129),Ucb(130),Ucb(131),
+        Ucb(132),Ucb(133),Ucb(134),Ucb(135),Ucb(136),Ucb(137),Ucb(138),Ucb(139),
+        Ucb(140),Ucb(141),Ucb(142),Ucb(143),Ucb(144),Ucb(145),Ucb(146),Ucb(147),
+        Ucb(148),Ucb(149),Ucb(150),Ucb(151),Ucb(152),Ucb(153),Ucb(154),Ucb(155),
+        Ucb(156),Ucb(157),Ucb(158),Ucb(159),Ucb(160),Ucb(161),Ucb(162),Ucb(163),
+        Ucb(164),Ucb(165),Ucb(166),Ucb(167),Ucb(168),Ucb(169),Ucb(170),Ucb(171),
+        Ucb(172),Ucb(173),Ucb(174),Ucb(175),Ucb(176),Ucb(177),Ucb(178),Ucb(179),
+        Ucb(180),Ucb(181),Ucb(182),Ucb(183),Ucb(184),Ucb(185),Ucb(186),Ucb(187),
+        Ucb(188),Ucb(189),Ucb(190),Ucb(191),Ucb(192),Ucb(193),Ucb(194),Ucb(195),
+        Ucb(196),Ucb(197),Ucb(198),Ucb(199),Ucb(200),Ucb(201),Ucb(202),Ucb(203),
+        Ucb(204),Ucb(205),Ucb(206),Ucb(207),Ucb(208),Ucb(209),Ucb(210),Ucb(211),
+        Ucb(212),Ucb(213),Ucb(214),Ucb(215),Ucb(216),Ucb(217),Ucb(218),Ucb(219),
+        Ucb(220),Ucb(221),Ucb(222),Ucb(223),Ucb(224),Ucb(225),Ucb(226),Ucb(227),
+        Ucb(228),Ucb(229),Ucb(230),Ucb(231),Ucb(232),Ucb(233),Ucb(234),Ucb(235),
+        Ucb(236),Ucb(237),Ucb(238),Ucb(239),Ucb(240),Ucb(241),Ucb(242),Ucb(243),
+        Ucb(244),Ucb(245),Ucb(246),Ucb(247),Ucb(248),Ucb(249),Ucb(250),Ucb(251),
+        Ucb(252),Ucb(253),Ucb(254),Ucb(255)};
+
+    static const int mapVcr[256] = {
+        Vcr(0),Vcr(1),Vcr(2),Vcr(3),Vcr(4),Vcr(5),Vcr(6),Vcr(7),Vcr(8),Vcr(9),
+        Vcr(10),Vcr(11),Vcr(12),Vcr(13),Vcr(14),Vcr(15),Vcr(16),Vcr(17),Vcr(18),
+        Vcr(19),Vcr(20),Vcr(21),Vcr(22),Vcr(23),Vcr(24),Vcr(25),Vcr(26),Vcr(27),
+        Vcr(28),Vcr(29),Vcr(30),Vcr(31),Vcr(32),Vcr(33),Vcr(34),Vcr(35),Vcr(36),
+        Vcr(37),Vcr(38),Vcr(39),Vcr(40),Vcr(41),Vcr(42),Vcr(43),Vcr(44),Vcr(45),
+        Vcr(46),Vcr(47),Vcr(48),Vcr(49),Vcr(50),Vcr(51),Vcr(52),Vcr(53),Vcr(54),
+        Vcr(55),Vcr(56),Vcr(57),Vcr(58),Vcr(59),Vcr(60),Vcr(61),Vcr(62),Vcr(63),
+        Vcr(64),Vcr(65),Vcr(66),Vcr(67),Vcr(68),Vcr(69),Vcr(70),Vcr(71),Vcr(72),
+        Vcr(73),Vcr(74),Vcr(75),Vcr(76),Vcr(77),Vcr(78),Vcr(79),Vcr(80),Vcr(81),
+        Vcr(82),Vcr(83),Vcr(84),Vcr(85),Vcr(86),Vcr(87),Vcr(88),Vcr(89),Vcr(90),
+        Vcr(91),Vcr(92),Vcr(93),Vcr(94),Vcr(95),Vcr(96),Vcr(97),Vcr(98),Vcr(99),
+        Vcr(100),Vcr(101),Vcr(102),Vcr(103),Vcr(104),Vcr(105),Vcr(106),Vcr(107),
+        Vcr(108),Vcr(109),Vcr(110),Vcr(111),Vcr(112),Vcr(113),Vcr(114),Vcr(115),
+        Vcr(116),Vcr(117),Vcr(118),Vcr(119),Vcr(120),Vcr(121),Vcr(122),Vcr(123),
+        Vcr(124),Vcr(125),Vcr(126),Vcr(127),Vcr(128),Vcr(129),Vcr(130),Vcr(131),
+        Vcr(132),Vcr(133),Vcr(134),Vcr(135),Vcr(136),Vcr(137),Vcr(138),Vcr(139),
+        Vcr(140),Vcr(141),Vcr(142),Vcr(143),Vcr(144),Vcr(145),Vcr(146),Vcr(147),
+        Vcr(148),Vcr(149),Vcr(150),Vcr(151),Vcr(152),Vcr(153),Vcr(154),Vcr(155),
+        Vcr(156),Vcr(157),Vcr(158),Vcr(159),Vcr(160),Vcr(161),Vcr(162),Vcr(163),
+        Vcr(164),Vcr(165),Vcr(166),Vcr(167),Vcr(168),Vcr(169),Vcr(170),Vcr(171),
+        Vcr(172),Vcr(173),Vcr(174),Vcr(175),Vcr(176),Vcr(177),Vcr(178),Vcr(179),
+        Vcr(180),Vcr(181),Vcr(182),Vcr(183),Vcr(184),Vcr(185),Vcr(186),Vcr(187),
+        Vcr(188),Vcr(189),Vcr(190),Vcr(191),Vcr(192),Vcr(193),Vcr(194),Vcr(195),
+        Vcr(196),Vcr(197),Vcr(198),Vcr(199),Vcr(200),Vcr(201),Vcr(202),Vcr(203),
+        Vcr(204),Vcr(205),Vcr(206),Vcr(207),Vcr(208),Vcr(209),Vcr(210),Vcr(211),
+        Vcr(212),Vcr(213),Vcr(214),Vcr(215),Vcr(216),Vcr(217),Vcr(218),Vcr(219),
+        Vcr(220),Vcr(221),Vcr(222),Vcr(223),Vcr(224),Vcr(225),Vcr(226),Vcr(227),
+        Vcr(228),Vcr(229),Vcr(230),Vcr(231),Vcr(232),Vcr(233),Vcr(234),Vcr(235),
+        Vcr(236),Vcr(237),Vcr(238),Vcr(239),Vcr(240),Vcr(241),Vcr(242),Vcr(243),
+        Vcr(244),Vcr(245),Vcr(246),Vcr(247),Vcr(248),Vcr(249),Vcr(250),Vcr(251),
+        Vcr(252),Vcr(253),Vcr(254),Vcr(255)};
+
+
+    static const int mapVcg[256] = {
+        Vcg(0),Vcg(1),Vcg(2),Vcg(3),Vcg(4),Vcg(5),Vcg(6),Vcg(7),Vcg(8),Vcg(9),
+        Vcg(10),Vcg(11),Vcg(12),Vcg(13),Vcg(14),Vcg(15),Vcg(16),Vcg(17),Vcg(18),
+        Vcg(19),Vcg(20),Vcg(21),Vcg(22),Vcg(23),Vcg(24),Vcg(25),Vcg(26),Vcg(27),
+        Vcg(28),Vcg(29),Vcg(30),Vcg(31),Vcg(32),Vcg(33),Vcg(34),Vcg(35),Vcg(36),
+        Vcg(37),Vcg(38),Vcg(39),Vcg(40),Vcg(41),Vcg(42),Vcg(43),Vcg(44),Vcg(45),
+        Vcg(46),Vcg(47),Vcg(48),Vcg(49),Vcg(50),Vcg(51),Vcg(52),Vcg(53),Vcg(54),
+        Vcg(55),Vcg(56),Vcg(57),Vcg(58),Vcg(59),Vcg(60),Vcg(61),Vcg(62),Vcg(63),
+        Vcg(64),Vcg(65),Vcg(66),Vcg(67),Vcg(68),Vcg(69),Vcg(70),Vcg(71),Vcg(72),
+        Vcg(73),Vcg(74),Vcg(75),Vcg(76),Vcg(77),Vcg(78),Vcg(79),Vcg(80),Vcg(81),
+        Vcg(82),Vcg(83),Vcg(84),Vcg(85),Vcg(86),Vcg(87),Vcg(88),Vcg(89),Vcg(90),
+        Vcg(91),Vcg(92),Vcg(93),Vcg(94),Vcg(95),Vcg(96),Vcg(97),Vcg(98),Vcg(99),
+        Vcg(100),Vcg(101),Vcg(102),Vcg(103),Vcg(104),Vcg(105),Vcg(106),Vcg(107),
+        Vcg(108),Vcg(109),Vcg(110),Vcg(111),Vcg(112),Vcg(113),Vcg(114),Vcg(115),
+        Vcg(116),Vcg(117),Vcg(118),Vcg(119),Vcg(120),Vcg(121),Vcg(122),Vcg(123),
+        Vcg(124),Vcg(125),Vcg(126),Vcg(127),Vcg(128),Vcg(129),Vcg(130),Vcg(131),
+        Vcg(132),Vcg(133),Vcg(134),Vcg(135),Vcg(136),Vcg(137),Vcg(138),Vcg(139),
+        Vcg(140),Vcg(141),Vcg(142),Vcg(143),Vcg(144),Vcg(145),Vcg(146),Vcg(147),
+        Vcg(148),Vcg(149),Vcg(150),Vcg(151),Vcg(152),Vcg(153),Vcg(154),Vcg(155),
+        Vcg(156),Vcg(157),Vcg(158),Vcg(159),Vcg(160),Vcg(161),Vcg(162),Vcg(163),
+        Vcg(164),Vcg(165),Vcg(166),Vcg(167),Vcg(168),Vcg(169),Vcg(170),Vcg(171),
+        Vcg(172),Vcg(173),Vcg(174),Vcg(175),Vcg(176),Vcg(177),Vcg(178),Vcg(179),
+        Vcg(180),Vcg(181),Vcg(182),Vcg(183),Vcg(184),Vcg(185),Vcg(186),Vcg(187),
+        Vcg(188),Vcg(189),Vcg(190),Vcg(191),Vcg(192),Vcg(193),Vcg(194),Vcg(195),
+        Vcg(196),Vcg(197),Vcg(198),Vcg(199),Vcg(200),Vcg(201),Vcg(202),Vcg(203),
+        Vcg(204),Vcg(205),Vcg(206),Vcg(207),Vcg(208),Vcg(209),Vcg(210),Vcg(211),
+        Vcg(212),Vcg(213),Vcg(214),Vcg(215),Vcg(216),Vcg(217),Vcg(218),Vcg(219),
+        Vcg(220),Vcg(221),Vcg(222),Vcg(223),Vcg(224),Vcg(225),Vcg(226),Vcg(227),
+        Vcg(228),Vcg(229),Vcg(230),Vcg(231),Vcg(232),Vcg(233),Vcg(234),Vcg(235),
+        Vcg(236),Vcg(237),Vcg(238),Vcg(239),Vcg(240),Vcg(241),Vcg(242),Vcg(243),
+        Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251),
+        Vcg(252),Vcg(253),Vcg(254),Vcg(255)};
+
+} // namespace libyuv
+
+#endif  // LIBYUV_SOURCE_CONVERSION_TABLES_H_
+
diff --git a/files/source/convert.cc b/files/source/convert.cc
new file mode 100644
index 0000000..8154dcb
--- /dev/null
+++ b/files/source/convert.cc
@@ -0,0 +1,904 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "conversion_tables.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "row.h"
+
+// #define SCALEOPT  // Currently for Windows only. June 2010.
+
+#ifdef SCALEOPT
+#include <emmintrin.h>
+#endif
+
+namespace libyuv {
+
+static inline uint8 Clip(int32 val) {
+  if (val < 0) {
+    return (uint8) 0;
+  } else if (val > 255) {
+    return (uint8) 255;
+  }
+  return (uint8) val;
+}
+
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_frame, int dst_stride_frame,
+                int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  // RGB orientation - bottom up
+  // TODO(fbarchard): support inversion
+  uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame;
+  uint8* out2 = out - dst_stride_frame;
+  int h, w;
+  int tmp_r, tmp_g, tmp_b;
+  const uint8 *y1, *y2, *u, *v;
+  y1 = src_y;
+  y2 = y1 + src_stride_y;
+  u = src_u;
+  v = src_v;
+  for (h = ((height + 1) >> 1); h > 0; h--){
+    // 2 rows at a time, 2 y's at a time
+    for (w = 0; w < ((width + 1) >> 1); w++){
+      // Vertical and horizontal sub-sampling
+      tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
+      out[0] = Clip(tmp_b);
+      out[1] = Clip(tmp_g);
+      out[2] = Clip(tmp_r);
+
+      tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
+      out[3] = Clip(tmp_b);
+      out[4] = Clip(tmp_g);
+      out[5] = Clip(tmp_r);
+
+      tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
+      out2[0] = Clip(tmp_b);
+      out2[1] = Clip(tmp_g);
+      out2[2] = Clip(tmp_r);
+
+      tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
+      out2[3] = Clip(tmp_b);
+      out2[4] = Clip(tmp_g);
+      out2[5] = Clip(tmp_r);
+
+      out += 6;
+      out2 += 6;
+      y1 += 2;
+      y2 += 2;
+      u++;
+      v++;
+    }
+    y1 += 2 * src_stride_y - width;
+    y2 += 2 * src_stride_y - width;
+    u += src_stride_u - ((width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
+    out -= dst_stride_frame * 3;
+    out2 -= dst_stride_frame * 3;
+  } // end height for
+  return 0;
+}
+
+// Little Endian...
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  // RGB orientation - bottom up
+  uint8* out = dst_frame + dst_stride_frame * (height - 1);
+  uint8* out2 = out - dst_stride_frame;
+  int tmp_r, tmp_g, tmp_b;
+  const uint8 *y1, *y2, *u, *v;
+  y1 = src_y;
+  y2 = y1 + src_stride_y;
+  u = src_u;
+  v = src_v;
+  int h, w;
+
+  for (h = ((height + 1) >> 1); h > 0; h--) {
+    // 2 rows at a time, 2 y's at a time
+    for (w = 0; w < ((width + 1) >> 1); w++) {
+        // Vertical and horizontal sub-sampling
+        // Convert to RGB888 and re-scale to 4 bits
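+        // Each output pixel is two bytes of little-endian ARGB4444:
+        // byte 0 holds G in its high nibble and B in its low nibble,
+        // byte 1 holds alpha (forced to 0xF) and R.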
+        tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
+        tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+        tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
+        out[0] = (uint8)((Clip(tmp_g) & 0xf0) + (Clip(tmp_b) >> 4));
+        out[1] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
+
+        tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
+        tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+        tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
+        out[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
+        out[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
+
+        tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
+        tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+        tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
+        out2[0] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
+        out2[1] = (uint8) (0xf0 + (Clip(tmp_r) >> 4));
+
+        tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
+        tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+        tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
+        out2[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
+        out2[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
+
+        out += 4;
+        out2 += 4;
+        y1 += 2;
+        y2 += 2;
+        u++;
+        v++;
+    }
+    y1 += 2 * src_stride_y - width;
+    y2 += 2 * src_stride_y - width;
+    u += src_stride_u - ((width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
+    out -= (dst_stride_frame + width) * 2;
+    out2 -= (dst_stride_frame + width) * 2;
+  } // end height for
+  return 0;
+}
+
+
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_frame, int dst_stride_frame,
+                 int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
+  uint16* out2 = out - dst_stride_frame;
+
+  int tmp_r, tmp_g, tmp_b;
+  const uint8 *y1, *y2, *u, *v;
+  y1 = src_y;
+  y2 = y1 + src_stride_y;
+  u = src_u;
+  v = src_v;
+  int h, w;
+
+  for (h = ((height + 1) >> 1); h > 0; h--){
+    // 2 rows at a time, 2 y's at a time
+    for (w = 0; w < ((width + 1) >> 1); w++){
+      // Vertical and horizontal sub-sampling
+      // 1. Convert to RGB888
+      // 2. Shift to adequate location (in the 16 bit word) - RGB 565
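+      // RGB565 packing: the top 5 bits of R go to bits 15-11, the top
+      // 6 bits of G to bits 10-5 and the top 5 bits of B to bits 4-0.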
+
+      tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
+      out[0]  = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                          & 0xfc) << 3) + (Clip(tmp_b) >> 3);
+
+      tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
+      out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                         & 0xfc) << 3) + (Clip(tmp_b ) >> 3);
+
+      tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
+      out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                          & 0xfc) << 3) + (Clip(tmp_b) >> 3);
+
+      tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
+      out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                          & 0xfc) << 3) + (Clip(tmp_b) >> 3);
+
+      y1 += 2;
+      y2 += 2;
+      out += 2;
+      out2 += 2;
+      u++;
+      v++;
+    }
+    y1 += 2 * src_stride_y - width;
+    y2 += 2 * src_stride_y - width;
+    u += src_stride_u - ((width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
+    out -= 2 * dst_stride_frame + width;
+    out2 -=  2 * dst_stride_frame + width;
+  }
+  return 0;
+}
+
+
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+  uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
+  uint16* out2 = out - dst_stride_frame;
+  int32 tmp_r, tmp_g, tmp_b;
+  const uint8 *y1, *y2, *u, *v;
+  int h, w;
+
+  y1 = src_y;
+  y2 = y1 + src_stride_y;
+  u = src_u;
+  v = src_v;
+
+  for (h = ((height + 1) >> 1); h > 0; h--){
+    // 2 rows at a time, 2 y's at a time
+    for (w = 0; w < ((width + 1) >> 1); w++){
+      // Vertical and horizontal sub-sampling
+      // 1. Convert to RGB888
+      // 2. Shift to adequate location (in the 16 bit word) - RGB 555
+      // 3. Add 1 for alpha value
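+      // ARGB1555 packing: alpha in bit 15 (0x8000), the top 5 bits of R
+      // in bits 14-10, of G in bits 9-5 and of B in bits 4-0.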
+      tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
+      out[0]  = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 7) +
+                ((Clip(tmp_g) & 0xf8) << 2) + (Clip(tmp_b) >> 3));
+
+      tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]  + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
+      out[1]  = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 7) +
+                ((Clip(tmp_g) & 0xf8) << 2) + (Clip(tmp_b) >> 3));
+
+      tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
+      out2[0]  = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 7) +
+                 ((Clip(tmp_g) & 0xf8) << 2) + (Clip(tmp_b) >> 3));
+
+      tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
+      out2[1]  = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 7) +
+                 ((Clip(tmp_g) & 0xf8) << 2) + (Clip(tmp_b) >> 3));
+
+      y1 += 2;
+      y2 += 2;
+      out += 2;
+      out2 += 2;
+      u++;
+      v++;
+    }
+    y1 += 2 * src_stride_y - width;
+    y2 += 2 * src_stride_y - width;
+    u += src_stride_u - ((width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
+    out -= 2 * dst_stride_frame + width;
+    out2 -=  2 * dst_stride_frame + width;
+  }
+  return 0;
+}
+
+
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  const uint8* in1 = src_y;
+  const uint8* in2 = src_y + src_stride_y;
+
+  uint8* out1 = dst_frame;
+  uint8* out2 = dst_frame + dst_stride_frame;
+
+  // YUY2 - Macro-pixel = 2 image pixels
+  // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+#ifndef SCALEOPT
+  for (int i = 0; i < ((height + 1) >> 1); i++){
+    for (int j = 0; j < ((width + 1) >> 1); j++){
+      out1[0] = in1[0];
+      out1[1] = *src_u;
+      out1[2] = in1[1];
+      out1[3] = *src_v;
+
+      out2[0] = in2[0];
+      out2[1] = *src_u;
+      out2[2] = in2[1];
+      out2[3] = *src_v;
+      out1 += 4;
+      out2 += 4;
+      src_u++;
+      src_v++;
+      in1 += 2;
+      in2 += 2;
+    }
+    in1 += 2 * src_stride_y - width;
+    in2 += 2 * src_stride_y - width;
+    src_u += src_stride_u - ((width + 1) >> 1);
+    src_v += src_stride_v - ((width + 1) >> 1);
+    out1 += dst_stride_frame + dst_stride_frame - 2 * width;
+    out2 += dst_stride_frame + dst_stride_frame - 2 * width;
+  }
+#else
+  for (int i = 0; i < ((height + 1) >> 1); i++) {
+    int32 width__ = (width >> 4);
+    _asm
+    {
+      ;pusha
+      mov       eax, DWORD PTR [in1]                       ;1939.33
+      mov       ecx, DWORD PTR [in2]                       ;1939.33
+      mov       ebx, DWORD PTR [src_u]                       ;1939.33
+      mov       edx, DWORD PTR [src_v]                       ;1939.33
+      loop0:
+      movq      xmm6, QWORD PTR [ebx]          ;src_u
+      movq      xmm0, QWORD PTR [edx]          ;src_v
+      punpcklbw xmm6, xmm0                     ;src_u, src_v mix
+      ;movdqa    xmm1, xmm6
+      ;movdqa    xmm2, xmm6
+      ;movdqa    xmm4, xmm6
+
+      movdqu    xmm3, XMMWORD PTR [eax]        ;in1
+      movdqa    xmm1, xmm3
+      punpcklbw xmm1, xmm6                     ;in1, src_u, in1, src_v
+      mov       esi, DWORD PTR [out1]
+      movdqu    XMMWORD PTR [esi], xmm1        ;write to out1
+
+      movdqu    xmm5, XMMWORD PTR [ecx]        ;in2
+      movdqa    xmm2, xmm5
+      punpcklbw xmm2, xmm6                     ;in2, src_u, in2, src_v
+      mov       edi, DWORD PTR [out2]
+      movdqu    XMMWORD PTR [edi], xmm2        ;write to out2
+
+      punpckhbw xmm3, xmm6                     ;in1, src_u, in1, src_v again
+      movdqu    XMMWORD PTR [esi+16], xmm3     ;write to out1 again
+      add       esi, 32
+      mov       DWORD PTR [out1], esi
+
+      punpckhbw xmm5, xmm6                     ;src_u, in2, src_v again
+      movdqu    XMMWORD PTR [edi+16], xmm5     ;write to out2 again
+      add       edi, 32
+      mov       DWORD PTR [out2], edi
+
+      add       ebx, 8
+      add       edx, 8
+      add       eax, 16
+      add       ecx, 16
+
+      mov       esi, DWORD PTR [width__]
+      sub       esi, 1
+      mov       DWORD PTR [width__], esi
+      jg        loop0
+
+      mov       DWORD PTR [in1], eax                       ;1939.33
+      mov       DWORD PTR [in2], ecx                       ;1939.33
+      mov       DWORD PTR [src_u], ebx                       ;1939.33
+      mov       DWORD PTR [src_v], edx                       ;1939.33
+
+      ;popa
+      emms
+    }
+    in1 += 2 * src_stride_y - width;
+    in2 += 2 * src_stride_y - width;
+    out1 += dst_stride_frame + dst_stride_frame - 2 * width;
+    out2 += dst_stride_frame + dst_stride_frame - 2 * width;
+  }
+#endif
+  return 0;
+}
+
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  int i = 0;
+  const uint8* y1 = src_y;
+  const uint8* y2 = y1 + src_stride_y;
+  const uint8* u = src_u;
+  const uint8* v = src_v;
+
+  uint8* out1 = dst_frame;
+  uint8* out2 = dst_frame + dst_stride_frame;
+
+  // Macro-pixel = 2 image pixels
+  // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5.....
+
+#ifndef SCALEOPT
+  for (; i < ((height + 1) >> 1); i++) {
+    for (int j = 0; j < ((width + 1) >> 1); j++) {
+      out1[0] = *u;
+      out1[1] = y1[0];
+      out1[2] = *v;
+      out1[3] = y1[1];
+
+      out2[0] = *u;
+      out2[1] = y2[0];
+      out2[2] = *v;
+      out2[3] = y2[1];
+      out1 += 4;
+      out2 += 4;
+      u++;
+      v++;
+      y1 += 2;
+      y2 += 2;
+    }
+    y1 += 2 * src_stride_y - width;
+    y2 += 2 * src_stride_y - width;
+    u += src_stride_u - ((width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
+    out1 += 2 * (dst_stride_frame - width);
+    out2 += 2 * (dst_stride_frame - width);
+  }
+#else
+  for (; i < (height >> 1);i++) {
+    int32 width__ = (width >> 4);
+    _asm
+    {
+      ;pusha
+      mov       eax, DWORD PTR [y1]                        ;1939.33
+      mov       ecx, DWORD PTR [y2]                        ;1939.33
+      mov       ebx, DWORD PTR [src_u]                       ;1939.33
+      mov       edx, DWORD PTR [src_v]                       ;1939.33
+loop0:
+      movq      xmm6, QWORD PTR [ebx]          ;src_u
+      movq      xmm0, QWORD PTR [edx]          ;src_v
+      punpcklbw xmm6, xmm0                     ;src_u, src_v mix
+      movdqa    xmm1, xmm6
+      movdqa    xmm2, xmm6
+      movdqa    xmm4, xmm6
+
+      movdqu    xmm3, XMMWORD PTR [eax]        ;in1
+      punpcklbw xmm1, xmm3                     ;src_u, in1, src_v
+      mov       esi, DWORD PTR [out1]
+      movdqu    XMMWORD PTR [esi], xmm1        ;write to out1
+
+      movdqu    xmm5, XMMWORD PTR [ecx]        ;in2
+      punpcklbw xmm2, xmm5                     ;src_u, in2, src_v
+      mov       edi, DWORD PTR [out2]
+      movdqu    XMMWORD PTR [edi], xmm2        ;write to out2
+
+      punpckhbw xmm4, xmm3                     ;src_u, in1, src_v again
+      movdqu    XMMWORD PTR [esi+16], xmm4     ;write to out1 again
+      add       esi, 32
+      mov       DWORD PTR [out1], esi
+
+      punpckhbw xmm6, xmm5                     ;src_u, in2, src_v again
+      movdqu    XMMWORD PTR [edi+16], xmm6     ;write to out2 again
+      add       edi, 32
+      mov       DWORD PTR [out2], edi
+
+      add       ebx, 8
+      add       edx, 8
+      add       eax, 16
+      add       ecx, 16
+
+      mov       esi, DWORD PTR [width__]
+      sub       esi, 1
+      mov       DWORD PTR [width__], esi
+      jg        loop0
+
+      mov       DWORD PTR [y1], eax                        ;1939.33
+      mov       DWORD PTR [y2], ecx                        ;1939.33
+      mov       DWORD PTR [src_u], ebx                       ;1939.33
+      mov       DWORD PTR [src_v], edx                       ;1939.33
+
+      ;popa
+      emms
+    }
+    y1 += width;
+    y2 += width;
+    out1 += 2 * (dst_stride_frame - width);
+    out2 += 2 * (dst_stride_frame - width);
+  }
+#endif
+  return 0;
+}
+
+
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_frame, int dst_stride_frame,
+                 int width, int height) {
+  if (src_y == NULL || src_uv == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  // Bi-planar: Y plane followed by an interleaved U/V plane (NV12).
+  const uint8* interlacedSrc = src_uv;
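+  // For every 2x2 block of Y samples, interlacedSrc[0] holds the U sample
+  // and interlacedSrc[1] the V sample.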
+  uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
+  uint16* out2 = out - dst_stride_frame;
+  int32 tmp_r, tmp_g, tmp_b;
+  const uint8 *y1, *y2;
+  y1 = src_y;
+  y2 = y1 + src_stride_y;
+  int h, w;
+
+  for (h = ((height + 1) >> 1); h > 0; h--) {
+    // 2 rows at a time, 2 y's at a time
+    for (w = 0; w < ((width + 1) >> 1); w++) {
+      // Vertical and horizontal sub-sampling
+      // 1. Convert to RGB888
+      // 2. Shift to adequate location (in the 16 bit word) - RGB 565
+
+      tmp_r = (int32)((mapYc[y1[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[0]] + mapUcg[interlacedSrc[0]]
+                      + mapVcg[interlacedSrc[1]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
+      out[0]  = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                          & 0xfc) << 3) + (Clip(tmp_b) >> 3);
+
+      tmp_r = (int32)((mapYc[y1[1]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y1[1]] + mapUcg[interlacedSrc[0]]
+                      + mapVcg[interlacedSrc[1]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y1[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
+      out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                         & 0xfc) << 3) + (Clip(tmp_b ) >> 3);
+
+      tmp_r = (int32)((mapYc[y2[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[0]] + mapUcg[interlacedSrc[0]]
+                      + mapVcg[interlacedSrc[1]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
+      out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                          & 0xfc) << 3) + (Clip(tmp_b) >> 3);
+
+      tmp_r = (int32)((mapYc[y2[1]] + mapVcr[interlacedSrc[1]]
+                      + 128) >> 8);
+      tmp_g = (int32)((mapYc[y2[1]] + mapUcg[interlacedSrc[0]]
+                      + mapVcg[interlacedSrc[1]] + 128) >> 8);
+      tmp_b = (int32)((mapYc[y2[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
+      out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
+                          & 0xfc) << 3) + (Clip(tmp_b) >> 3);
+
+      y1 += 2;
+      y2 += 2;
+      out += 2;
+      out2 += 2;
+      interlacedSrc += 2;
+    }
+    y1 += 2 * src_stride_y - width;
+    y2 += 2 * src_stride_y - width;
+    interlacedSrc += src_stride_uv - 2 * ((width + 1) >> 1);
+    out -= 2 * dst_stride_frame + width;
+    out2 -= 2 * dst_stride_frame + width;
+  }
+  return 0;
+}
+
+// TODO(fbarchard): Deprecated - this is the same as BG24ToARGB with -height.
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_frame, int dst_stride_frame,
+                int width, int height) {
+  if (src_frame == NULL || dst_frame == NULL) {
+    return -1;
+  }
+
+  int i, j, offset;
+  uint8* outFrame = dst_frame;
+  const uint8* inFrame = src_frame;
+
+  outFrame += dst_stride_frame * (height - 1) * 4;
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      offset = j * 4;
+      outFrame[0 + offset] = inFrame[0];
+      outFrame[1 + offset] = inFrame[1];
+      outFrame[2 + offset] = inFrame[2];
+      outFrame[3 + offset] = 0xff;
+      inFrame += 3;
+    }
+    outFrame -= 4 * (dst_stride_frame - width);
+    inFrame += src_stride_frame - width;
+  }
+  return 0;
+}
+
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_frame = src_frame + (height - 1) * src_stride_frame;
+    src_stride_frame = -src_stride_frame;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = ARGBToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = ARGBToYRow_C;
+  }
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = ARGBToUVRow_C;
+  }
+
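+  // Process two source rows per pass: Y is computed for every pixel, while
+  // U and V are sub-sampled 2x2, so one UV output row covers two input rows.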
+  for (int y = 0; y < (height - 1); y += 2) {
+    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+    src_frame += src_stride_frame * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+  }
+  return 0;
+}
+
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_frame = src_frame + (height - 1) * src_stride_frame;
+    src_stride_frame = -src_stride_frame;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_BGRATOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = BGRAToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = BGRAToYRow_C;
+  }
+#if defined(HAS_BGRATOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    ARGBToUVRow = BGRAToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = BGRAToUVRow_C;
+  }
+
+  for (int y = 0; y < (height - 1); y += 2) {
+    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+    src_frame += src_stride_frame * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+  }
+  return 0;
+}
+
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_frame = src_frame + (height - 1) * src_stride_frame;
+    src_stride_frame = -src_stride_frame;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = ABGRToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = ABGRToYRow_C;
+  }
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    ARGBToUVRow = ABGRToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = ABGRToUVRow_C;
+  }
+
+  for (int y = 0; y < (height - 1); y += 2) {
+    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+    src_frame += src_stride_frame * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+  }
+  return 0;
+}
+
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_frame = src_frame + (height - 1) * src_stride_frame;
+    src_stride_frame = -src_stride_frame;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_RGB24TOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = RGB24ToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = RGB24ToYRow_C;
+  }
+#if defined(HAS_RGB24TOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    ARGBToUVRow = RGB24ToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = RGB24ToUVRow_C;
+  }
+
+  for (int y = 0; y < (height - 1); y += 2) {
+    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+    src_frame += src_stride_frame * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+  }
+  return 0;
+}
+
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_frame = src_frame + (height - 1) * src_stride_frame;
+    src_stride_frame = -src_stride_frame;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_RAWTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = RAWToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = RAWToYRow_C;
+  }
+#if defined(HAS_RAWTOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    ARGBToUVRow = RAWToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = RAWToUVRow_C;
+  }
+
+  for (int y = 0; y < (height - 1); y += 2) {
+    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+    src_frame += src_stride_frame * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_frame, dst_y, width);
+  }
+  return 0;
+}
+
+} // namespace libyuv
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
new file mode 100644
index 0000000..cc44e21
--- /dev/null
+++ b/files/source/cpu_id.cc
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/basic_types.h"  // for CPU_X86
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static inline void __cpuid(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type)
+  );
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void __cpuid(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type)
+  );
+}
+#endif
+
+namespace libyuv {
+
+// CPU detect function for SIMD instruction sets.
+static int cpu_info_ = 0;
+
+// TODO(fbarchard): (cpu_info[2] & 0x10000000 ? kCpuHasAVX : 0)
+static void InitCpuFlags() {
+#ifdef CPU_X86
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
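+  // CPUID leaf 1: EDX bit 26 (0x04000000) reports SSE2, ECX bit 9
+  // (0x00000200) reports SSSE3.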
+  cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
+              (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
+              kCpuInitialized;
+#elif defined(__ARM_NEON__)
+  // gcc -mfpu=neon defines __ARM_NEON__
+  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
+  // to disable Neon on devices that do not have it.
+  cpu_info_ = kCpuHasNEON | kCpuInitialized;
+#else
+  cpu_info_ = kCpuInitialized;
+#endif
+}
+
+void MaskCpuFlags(int enable_flags) {
+  InitCpuFlags();
+  cpu_info_ &= enable_flags;
+}
+
+bool TestCpuFlag(int flag) {
+  if (0 == cpu_info_) {
+    InitCpuFlags();
+  }
+  return cpu_info_ & flag ? true : false;
+}
+
+}  // namespace libyuv
diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc
new file mode 100644
index 0000000..958f44c
--- /dev/null
+++ b/files/source/format_conversion.cc
@@ -0,0 +1,423 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/cpu_id.h"
+#include "video_common.h"
+#include "row.h"
+
+#define kMaxStride (2048 * 4)
+
+namespace libyuv {
+
+// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
+// and vst would select which 2 components to write.  The low level would need
+// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_ARGBTOBAYERROW_SSSE3
+__declspec(naked)
+static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
+                                 uint8* dst_bayer, uint32 selector, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_bayer
+    movd       xmm7, [esp + 12]  // selector
+    mov        ecx, [esp + 16]   // pix
+    pshufd     xmm7, xmm7, 0
+
+  wloop:
+    movdqa     xmm0, [eax]
+    lea        eax, [eax + 16]
+    pshufb     xmm0, xmm7
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 4
+    ja         wloop
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
+#define HAS_ARGBTOBAYERROW_SSSE3
+static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                                 uint32 selector, int pix) {
+  asm volatile(
+    "movd   %3,%%xmm7\n"
+    "pshufd $0x0,%%xmm7,%%xmm7\n"
+"1:"
+    "movdqa (%0),%%xmm0\n"
+    "lea    0x10(%0),%0\n"
+    "pshufb %%xmm7,%%xmm0\n"
+    "movd   %%xmm0,(%1)\n"
+    "lea    0x4(%1),%1\n"
+    "sub    $0x4,%2\n"
+    "ja     1b\n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_bayer), // %1
+    "+r"(pix)        // %2
+  : "r"(selector)    // %3
+  : "memory"
+);
+}
+#endif
+
+static void ARGBToBayerRow_C(const uint8* src_argb,
+                             uint8* dst_bayer, uint32 selector, int pix) {
+  int index0 = selector & 0xff;
+  int index1 = (selector >> 8) & 0xff;
+  // Copy a row of Bayer.
+  for (int x = 0; x < (pix - 1); x += 2) {
+    dst_bayer[0] = src_argb[index0];
+    dst_bayer[1] = src_argb[index1];
+    src_argb += 8;
+    dst_bayer += 2;
+  }
+  if (pix & 1) {
+    dst_bayer[0] = src_argb[index0];
+  }
+}
+
+// Generate a selector mask useful for pshufb.
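+// For example, GenerateSelector(0, 1) yields 0x0d080500, so a broadcast
+// pshufb picks bytes 0, 5, 8 and 13 of each 16-byte group: channel 0 of
+// pixel 0, channel 1 of pixel 1, channel 0 of pixel 2, channel 1 of pixel 3.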
+static uint32 GenerateSelector(int select0, int select1) {
+  return static_cast<uint32>(select0) |
+         static_cast<uint32>((select1 + 4) << 8) |
+         static_cast<uint32>((select0 + 8) << 16) |
+         static_cast<uint32>((select1 + 12) << 24);
+}
+
+// Converts 32 bit ARGB to any Bayer RGB format.
+int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
+                   uint8* dst_bayer, int dst_stride_bayer,
+                   uint32 dst_fourcc_bayer,
+                   int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_rgb = src_rgb + (height - 1) * src_stride_rgb;
+    src_stride_rgb = -src_stride_rgb;
+  }
+  void (*ARGBToBayerRow)(const uint8* src_argb,
+                         uint8* dst_bayer, uint32 selector, int pix);
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 4 == 0) &&
+      IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) &&
+      IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) {
+    ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToBayerRow = ARGBToBayerRow_C;
+  }
+
+  int blue_index = 0;
+  int green_index = 1;
+  int red_index = 2;
+
+  // Now build a lookup table containing the indices for the four pixels in each
+  // 2x2 Bayer grid.
+  uint32 index_map[2];
+  switch (dst_fourcc_bayer) {
+    default:
+      assert(false);
+    case FOURCC_RGGB:
+      index_map[0] = GenerateSelector(red_index, green_index);
+      index_map[1] = GenerateSelector(green_index, blue_index);
+      break;
+    case FOURCC_BGGR:
+      index_map[0] = GenerateSelector(blue_index, green_index);
+      index_map[1] = GenerateSelector(green_index, red_index);
+      break;
+    case FOURCC_GRBG:
+      index_map[0] = GenerateSelector(green_index, red_index);
+      index_map[1] = GenerateSelector(blue_index, green_index);
+      break;
+    case FOURCC_GBRG:
+      index_map[0] = GenerateSelector(green_index, blue_index);
+      index_map[1] = GenerateSelector(red_index, green_index);
+      break;
+  }
+
+  // Now convert.
+  for (int y = 0; y < height; ++y) {
+    ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width);
+    src_rgb += src_stride_rgb;
+    dst_bayer += dst_stride_bayer;
+  }
+  return 0;
+}
+
+#define AVG(a,b) (((a) + (b)) >> 1)
+
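+// Each BayerRowXX routine expands one row of a Bayer row pair into ARGB,
+// reconstructing missing colour samples by averaging the nearest samples of
+// that colour; the second source row (src_bayer1) supplies the third colour.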
+static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 g = src_bayer0[1];
+  uint8 r = src_bayer1[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = src_bayer0[0];
+    dst_rgb[1] = AVG(g, src_bayer0[1]);
+    dst_rgb[2] = AVG(r, src_bayer1[1]);
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[5] = src_bayer0[1];
+    dst_rgb[6] = src_bayer1[1];
+    dst_rgb[7] = 255U;
+    g = src_bayer0[1];
+    r = src_bayer1[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = src_bayer0[0];
+  dst_rgb[1] = AVG(g, src_bayer0[1]);
+  dst_rgb[2] = AVG(r, src_bayer1[1]);
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer0[0];
+  dst_rgb[5] = src_bayer0[1];
+  dst_rgb[6] = src_bayer1[1];
+  dst_rgb[7] = 255U;
+}
+
+static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 g = src_bayer0[1];
+  uint8 b = src_bayer1[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = AVG(b, src_bayer1[1]);
+    dst_rgb[1] = AVG(g, src_bayer0[1]);
+    dst_rgb[2] = src_bayer0[0];
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = src_bayer1[1];
+    dst_rgb[5] = src_bayer0[1];
+    dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[7] = 255U;
+    g = src_bayer0[1];
+    b = src_bayer1[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = AVG(b, src_bayer1[1]);
+  dst_rgb[1] = AVG(g, src_bayer0[1]);
+  dst_rgb[2] = src_bayer0[0];
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer1[1];
+  dst_rgb[5] = src_bayer0[1];
+  dst_rgb[6] = src_bayer0[0];
+  dst_rgb[7] = 255U;
+}
+
+static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 b = src_bayer0[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = AVG(b, src_bayer0[1]);
+    dst_rgb[1] = src_bayer0[0];
+    dst_rgb[2] = src_bayer1[0];
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = src_bayer0[1];
+    dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+    dst_rgb[7] = 255U;
+    b = src_bayer0[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = AVG(b, src_bayer0[1]);
+  dst_rgb[1] = src_bayer0[0];
+  dst_rgb[2] = src_bayer1[0];
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer0[1];
+  dst_rgb[5] = src_bayer0[0];
+  dst_rgb[6] = src_bayer1[0];
+  dst_rgb[7] = 255U;
+}
+
+static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 r = src_bayer0[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = src_bayer1[0];
+    dst_rgb[1] = src_bayer0[0];
+    dst_rgb[2] = AVG(r, src_bayer0[1]);
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+    dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[6] = src_bayer0[1];
+    dst_rgb[7] = 255U;
+    r = src_bayer0[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = src_bayer1[0];
+  dst_rgb[1] = src_bayer0[0];
+  dst_rgb[2] = AVG(r, src_bayer0[1]);
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer1[0];
+  dst_rgb[5] = src_bayer0[0];
+  dst_rgb[6] = src_bayer0[1];
+  dst_rgb[7] = 255U;
+}
+
+// Converts any Bayer RGB format to ARGB.
+int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+                   uint32 src_fourcc_bayer,
+                   uint8* dst_rgb, int dst_stride_rgb,
+                   int width, int height) {
+  if (height < 0) {
+    height = -height;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+    dst_stride_rgb = -dst_stride_rgb;
+  }
+  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+
+  switch (src_fourcc_bayer) {
+    default:
+      assert(false);
+    case FOURCC_RGGB:
+      BayerRow0 = BayerRowRG;
+      BayerRow1 = BayerRowGB;
+      break;
+    case FOURCC_BGGR:
+      BayerRow0 = BayerRowBG;
+      BayerRow1 = BayerRowGR;
+      break;
+    case FOURCC_GRBG:
+      BayerRow0 = BayerRowGR;
+      BayerRow1 = BayerRowBG;
+      break;
+    case FOURCC_GBRG:
+      BayerRow0 = BayerRowGB;
+      BayerRow1 = BayerRowRG;
+      break;
+  }
+
+  for (int y = 0; y < (height - 1); y += 2) {
+    BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
+    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+        dst_rgb + dst_stride_rgb, width);
+    src_bayer += src_stride_bayer * 2;
+    dst_rgb += dst_stride_rgb * 2;
+  }
+  if (height & 1) {
+    BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width);
+  }
+  return 0;
+}
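+
+// Usage sketch (illustrative comment only, not compiled): expanding a BGGR
+// sensor frame to ARGB. Buffer names and dimensions are hypothetical; Bayer
+// data is 1 byte per pixel, ARGB is 4 bytes per pixel.
+//
+//   libyuv::BayerRGBToARGB(bayer, width, FOURCC_BGGR,
+//                          argb, width * 4, width, height);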
+
+// Converts any Bayer RGB format to I420.
+int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
+                   uint32 src_fourcc_bayer,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  if (width * 4 > kMaxStride) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = ARGBToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = ARGBToYRow_C;
+  }
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = ARGBToUVRow_C;
+  }
+
+  switch (src_fourcc_bayer) {
+    default:
+      assert(false);
+    case FOURCC_RGGB:
+      BayerRow0 = BayerRowRG;
+      BayerRow1 = BayerRowGB;
+      break;
+    case FOURCC_BGGR:
+      BayerRow0 = BayerRowBG;
+      BayerRow1 = BayerRowGR;
+      break;
+    case FOURCC_GRBG:
+      BayerRow0 = BayerRowGR;
+      BayerRow1 = BayerRowBG;
+      break;
+    case FOURCC_GBRG:
+      BayerRow0 = BayerRowGB;
+      BayerRow1 = BayerRowRG;
+      break;
+  }
+
+  for (int y = 0; y < (height - 1); y += 2) {
+    BayerRow0(src_bayer, src_stride_bayer, row, width);
+    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+              row + kMaxStride, width);
+    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+    src_bayer += src_stride_bayer * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // TODO(fbarchard): Make sure this filters properly
+  if (height & 1) {
+    // Odd height: there is no row below, so use a negative stride to pair
+    // with the row above (matches BayerRGBToARGB above).
+    BayerRow0(src_bayer, -src_stride_bayer, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+  }
+  return 0;
+}
+
+}  // namespace libyuv
diff --git a/files/source/general.cc b/files/source/general.cc
new file mode 100644
index 0000000..9d39f9b
--- /dev/null
+++ b/files/source/general.cc
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/general.h"
+
+#include <string.h>     // memcpy(), memset()
+
+#include "libyuv/planar_functions.h"
+
+namespace libyuv {
+
+int
+I420Mirror(const uint8* src_yplane, int src_ystride,
+           const uint8* src_uplane, int src_ustride,
+           const uint8* src_vplane, int src_vstride,
+           uint8* dst_yplane, int dst_ystride,
+           uint8* dst_uplane, int dst_ustride,
+           uint8* dst_vplane, int dst_vstride,
+           int width, int height) {
+  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
+      dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) {
+    return -1;
+  }
+
+  int indO = 0;
+  int indS  = 0;
+  int wind, hind;
+  uint8 tmpVal, tmpValU, tmpValV;
+  // Will swap two values per iteration
+  const int halfWidth = (width + 1) >> 1;
+
+  // Y
+  for (wind = 0; wind < halfWidth; wind++) {
+   for (hind = 0; hind < height; hind++) {
+     indO = hind * src_ystride + wind;
+     indS = hind * dst_ystride + (width - wind - 1);
+     tmpVal = src_yplane[indO];
+     dst_yplane[indO] = src_yplane[indS];
+     dst_yplane[indS] = tmpVal;
+    }
+  }
+
+  const int halfHeight = (height + 1) >> 1;
+  const int halfuvWidth = (width + 1) >> 1;  // Chroma planes are half width.
+  // Swap two chroma values per iteration. As in the Y loop above, the same
+  // index is used for source and destination, so matching strides are
+  // assumed; U and V are also assumed to share a stride (src_ustride).
+  const int quarteruvWidth = (halfuvWidth + 1) >> 1;
+
+  for (wind = 0; wind < quarteruvWidth; wind++) {
+   for (hind = 0; hind < halfHeight; hind++) {
+     indO = hind * src_ustride + wind;
+     indS = hind * src_ustride + (halfuvWidth - wind - 1);
+     // U
+     tmpValU = src_uplane[indO];
+     dst_uplane[indO] = src_uplane[indS];
+     dst_uplane[indS] = tmpValU;
+     // V
+     tmpValV = src_vplane[indO];
+     dst_vplane[indO] = src_vplane[indS];
+     dst_vplane[indS] = tmpValV;
+   }
+  }
+  return 0;
+}
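+
+// Usage sketch (illustrative comment only): mirroring a contiguous I420
+// buffer in place. Pointer names are hypothetical, even width and height
+// are assumed, and src and dst strides must match for the swap above.
+//
+//   uint8* y = frame;
+//   uint8* u = y + width * height;
+//   uint8* v = u + (width / 2) * (height / 2);
+//   libyuv::I420Mirror(y, width, u, width / 2, v, width / 2,
+//                      y, width, u, width / 2, v, width / 2,
+//                      width, height);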
+
+// Make a center cut
+int
+I420Crop(uint8* frame,
+         int src_width, int src_height,
+         int dst_width, int dst_height)
+{
+  if (frame == NULL)
+    return -1;
+
+  if (src_width == dst_width && src_height == dst_height) {
+    // Nothing to do.
+    return 3 * dst_height * dst_width / 2;
+  }
+  if (dst_width > src_width || dst_height > src_height) {
+    // Error: cannot crop to a larger size.
+    return -1;
+  }
+  int i = 0;
+  int m = 0;
+  int loop = 0;
+  int half_dst_width = dst_width / 2;
+  int half_dst_height = dst_height / 2;
+  int half_src_width = src_width / 2;
+  int half_src_height = src_height / 2;
+  int crop_height = (src_height - dst_height) / 2;
+  int crop_width = (src_width - dst_width) / 2;
+
+  // Copy the cropped Y plane.
+  for (i = src_width * crop_height + crop_width; loop < dst_height;
+      loop++, i += src_width) {
+    memcpy(&frame[m], &frame[i], dst_width);
+    m += dst_width;
+  }
+  i = src_width * src_height;  // Start of the U plane.
+  loop = 0;
+  for (i += (half_src_width * crop_height / 2 + crop_width / 2);
+       loop < half_dst_height; loop++, i += half_src_width) {
+    memcpy(&frame[m], &frame[i], half_dst_width);
+    m += half_dst_width;
+  }
+  loop = 0;
+  // Start of the V plane: luma plus one full chroma plane.
+  i = src_width * src_height + half_src_height * half_src_width;
+  for (i += (half_src_width * crop_height / 2 + crop_width / 2);
+       loop < half_dst_height; loop++, i += half_src_width) {
+    memcpy(&frame[m], &frame[i], half_dst_width);
+    m += half_dst_width;
+  }
+  return 0;
+}
+
+
+int
+I420CropPad(const uint8* src_frame, int src_width,
+            int src_height, uint8* dst_frame,
+            int dst_width, int dst_height)
+{
+  if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) {
+    return -1;
+  }
+  if (src_width == dst_width && src_height == dst_height) {
+    memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1));
+  } else {
+    if (src_height < dst_height) {
+      // pad height
+      int pad_height = dst_height - src_height;
+      int i = 0;
+      int pad_width = 0;
+      int crop_width = 0;
+      int width = src_width;
+      if (src_width < dst_width) {
+        // pad width
+        pad_width = dst_width - src_width;
+      } else {
+        // cut width
+        crop_width = src_width - dst_width;
+        width = dst_width;
+      }
+      if (pad_height) {
+        memset(dst_frame, 0, dst_width * (pad_height >> 1));
+        dst_frame +=  dst_width * (pad_height >> 1);
+      }
+      for (i = 0; i < src_height;i++) {
+        if (pad_width) {
+            memset(dst_frame, 0, pad_width / 2);
+            dst_frame +=  pad_width / 2;
+        }
+        src_frame += crop_width >> 1; // in case we have a cut
+        memcpy(dst_frame,src_frame ,width);
+        src_frame += crop_width >> 1;
+        dst_frame += width;
+        src_frame += width;
+        if (pad_width) {
+          memset(dst_frame, 0, pad_width / 2);
+          dst_frame +=  pad_width / 2;
+        }
+      }
+      if (pad_height) {
+        memset(dst_frame, 0, dst_width * (pad_height >> 1));
+        dst_frame +=  dst_width * (pad_height >> 1);
+      }
+      if (pad_height) {
+        memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
+        dst_frame +=  (dst_width >> 2) * (pad_height >> 1);
+      }
+      for (i = 0; i < (src_height >> 1); i++) {
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+        src_frame += crop_width >> 2; // in case we have a cut
+        memcpy(dst_frame, src_frame,width >> 1);
+        src_frame += crop_width >> 2;
+        dst_frame += width >> 1;
+        src_frame += width >> 1;
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+      }
+      if (pad_height) {
+        memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1));
+        dst_frame +=  (dst_width >> 1) * (pad_height >> 1);
+      }
+      for (i = 0; i < (src_height >> 1); i++) {
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+        src_frame += crop_width >> 2; // in case we have a cut
+        memcpy(dst_frame, src_frame,width >> 1);
+        src_frame += crop_width >> 2;
+        dst_frame += width >> 1;
+        src_frame += width >> 1;
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame += pad_width >> 2;
+        }
+      }
+      if (pad_height) {
+        memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
+        dst_frame +=  (dst_width >> 2) * (pad_height >> 1);
+      }
+    } else {
+      // cut height
+      int i = 0;
+      int pad_width = 0;
+      int crop_width = 0;
+      int width = src_width;
+
+      if (src_width < dst_width) {
+        // pad width
+        pad_width = dst_width - src_width;
+      } else {
+        // cut width
+        crop_width = src_width - dst_width;
+        width = dst_width;
+      }
+      int diff_height = src_height - dst_height;
+      src_frame += src_width * (diff_height >> 1);  // skip top I
+
+      for (i = 0; i < dst_height; i++) {
+        if (pad_width) {
+          memset(dst_frame, 0, pad_width / 2);
+          dst_frame +=  pad_width / 2;
+        }
+        src_frame += crop_width >> 1; // in case we have a cut
+        memcpy(dst_frame,src_frame ,width);
+        src_frame += crop_width >> 1;
+        dst_frame += width;
+        src_frame += width;
+        if (pad_width) {
+          memset(dst_frame, 0, pad_width / 2);
+          dst_frame +=  pad_width / 2;
+        }
+      }
+      src_frame += src_width * (diff_height >> 1);  // skip end I
+      src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr
+      for (i = 0; i < (dst_height >> 1); i++) {
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+        src_frame += crop_width >> 2; // in case we have a cut
+        memcpy(dst_frame, src_frame,width >> 1);
+        src_frame += crop_width >> 2;
+        dst_frame += width >> 1;
+        src_frame += width >> 1;
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+      }
+      src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr
+      src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb
+      for (i = 0; i < (dst_height >> 1); i++) {
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+        src_frame += crop_width >> 2; // in case we have a cut
+        memcpy(dst_frame, src_frame, width >> 1);
+        src_frame += crop_width >> 2;
+        dst_frame += width >> 1;
+        src_frame += width >> 1;
+        if (pad_width) {
+          memset(dst_frame, 127, pad_width >> 2);
+          dst_frame +=  pad_width >> 2;
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+}  // namespace libyuv
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
new file mode 100644
index 0000000..a7e3e38
--- /dev/null
+++ b/files/source/planar_functions.cc
@@ -0,0 +1,1575 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "row.h"
+
+namespace libyuv {
+
+#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
+#define HAS_SPLITUV_NEON
+// Reads 16 pairs of UV and writes even bytes to dst_u and odd bytes to dst_v.
+// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
+static void SplitUV_NEON(const uint8* src_uv,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm__ volatile
+  (
+    "1:\n"
+    "vld2.u8    {q0,q1}, [%0]!    \n"  // load 16 pairs of UV
+    "vst1.u8    {q0}, [%1]!       \n"  // store U
+    "vst1.u8    {q1}, [%2]!       \n"  // Store V
+    "subs       %3, %3, #16       \n"  // 16 processed per loop
+    "bhi        1b                \n"
+    : "+r"(src_uv),
+      "+r"(dst_u),
+      "+r"(dst_v),
+      "+r"(pix)             // Output registers
+    :                       // Input registers
+    : "q0", "q1"            // Clobber List
+  );
+}
+
+#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#if defined(_MSC_VER)
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+#else
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#endif
+
+// Shuffle table for converting ABGR to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting BGRA to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_SPLITUV_SSE2
+__declspec(naked)
+static void SplitUV_SSE2(const uint8* src_uv,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    movdqa     xmm3, xmm1
+    pand       xmm0, xmm7   // even bytes
+    pand       xmm1, xmm7
+    packuswb   xmm0, xmm1
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqa     [edi], xmm2
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         wloop
+    pop        edi
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_SPLITUV_SSE2
+static void SplitUV_SSE2(const uint8* src_uv,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "psrlw      $0x8,%%xmm2\n"
+  "psrlw      $0x8,%%xmm3\n"
+  "packuswb   %%xmm3,%%xmm2\n"
+  "movdqa     %%xmm2,(%2)\n"
+  "lea        0x10(%2),%2\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(pix)         // %3
+  :
+  : "memory"
+);
+}
+#endif
+#endif
+
+static void SplitUV_C(const uint8* src_uv,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  // Deinterleave a row of UV into separate U and V rows.
+  for (int x = 0; x < pix; ++x) {
+    dst_u[0] = src_uv[0];
+    dst_v[0] = src_uv[1];
+    src_uv += 2;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
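+
+// For reference, the SplitUV_* routines above deinterleave a packed UV row
+// into planar U and V. For pix = 4 (hypothetical values):
+//   src_uv: U0 V0 U1 V1 U2 V2 U3 V3
+//   dst_u:  U0 U1 U2 U3
+//   dst_v:  V0 V1 V2 V3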
+
+static void I420CopyPlane(const uint8* src_y, int src_stride_y,
+                          uint8* dst_y, int dst_stride_y,
+                          int width, int height) {
+  // Copy plane
+  for (int y = 0; y < height; ++y) {
+    memcpy(dst_y, src_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copy I420 with optional flipping
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
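+
+// Usage sketch (illustrative comment only): the negative height convention
+// flips the image vertically while copying. Pointer names are hypothetical;
+// contiguous planes with stride == width (and width / 2 for chroma) are
+// assumed.
+//
+//   libyuv::I420Copy(src_y, width, src_u, width / 2, src_v, width / 2,
+//                    dst_y, width, dst_u, width / 2, dst_v, width / 2,
+//                    width, -height);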
+
+// SetRow32 writes 'count' bytes using a repeated 32 bit value.
+
+#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
+#define HAS_SETROW_NEON
+static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
+  __asm__ volatile
+  (
+    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
+    "1:\n"
+    "vst1.u32   {q0}, [%0]!       \n"  // store
+    "subs       %1, %1, #16       \n"  // 16 processed per loop
+    "bhi        1b                \n"
+  : "+r"(dst),  // %0
+    "+r"(count) // %1
+  : "r"(v32)    // %2
+  : "q0", "memory"
+  );
+}
+
+#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_SETROW_SSE2
+__declspec(naked)
+static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // dst
+    movd       xmm7, [esp + 8]   // v32
+    mov        ecx, [esp + 12]   // count
+    pshufd     xmm7, xmm7, 0
+
+  wloop:
+    movdqa     [eax], xmm7
+    lea        eax, [eax + 16]
+    sub        ecx, 16
+    ja         wloop
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
+#define HAS_SETROW_SSE2
+static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
+  asm volatile(
+  "movd       %2, %%xmm7\n"
+  "pshufd     $0x0,%%xmm7,%%xmm7\n"
+"1:"
+  "movdqa     %%xmm7,(%0)\n"
+  "lea        0x10(%0),%0\n"
+  "sub        $0x10,%1\n"
+  "ja         1b\n"
+  : "+r"(dst),  // %0
+    "+r"(count) // %1
+  : "r"(v32)    // %2
+  : "memory"
+);
+}
+#endif
+
+static void SetRow8_C(uint8* dst, uint32 v8, int count) {
+  memset(dst, v8, count);
+}
+
+static void I420SetPlane(uint8* dst_y, int dst_stride_y,
+                         int width, int height,
+                         int value) {
+  void (*SetRow)(uint8* dst, uint32 value, int pix);
+#if defined(HAS_SETROW_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    SetRow = SetRow32_NEON;
+  } else
+#elif defined(HAS_SETROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    SetRow = SetRow32_SSE2;
+  } else
+#endif
+  {
+    SetRow = SetRow8_C;
+  }
+
+  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
+  // Set plane
+  for (int y = 0; y < height; ++y) {
+    SetRow(dst_y, v32, width);
+    dst_y += dst_stride_y;
+  }
+}
+
+// Draw a rectangle into I420
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v) {
+  if (!dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      x < 0 || y < 0 ||
+      value_y < 0 || value_y > 255 ||
+      value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  uint8* start_y = dst_y + y * dst_stride_y + x;
+  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+  I420SetPlane(start_y, dst_stride_y, width, height, value_y);
+  I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
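+
+// Usage sketch (illustrative comment only): filling a 64x64 black square at
+// (16, 16). In YUV, black is Y = 16 with neutral chroma U = V = 128; the
+// buffer and stride names are hypothetical.
+//
+//   libyuv::I420Rect(y, y_stride, u, u_stride, v, v_stride,
+//                    16, 16, 64, 64, 16, 128, 128);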
+
+// Convert I422 to I420 by vertically averaging pairs of chroma rows; the Y
+// plane is copied as-is.  Used by our jpeg conversion callbacks to
+// incrementally fill a yuv image.
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Copy Y plane
+  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+  // SubSample UV planes.
+  int x, y;
+  int halfwidth = (width + 1) >> 1;
+  for (y = 0; y < height; y += 2) {
+    const uint8* u0 = src_u;
+    const uint8* u1 = src_u + src_stride_u;
+    if ((y + 1) >= height) {
+      u1 = u0;
+    }
+    for (x = 0; x < halfwidth; ++x) {
+      dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
+    }
+    src_u += src_stride_u * 2;
+    dst_u += dst_stride_u;
+  }
+  for (y = 0; y < height; y += 2) {
+    const uint8* v0 = src_v;
+    const uint8* v1 = src_v + src_stride_v;
+    if ((y + 1) >= height) {
+      v1 = v0;
+    }
+    for (x = 0; x < halfwidth; ++x) {
+      dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
+    }
+    src_v += src_stride_v * 2;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+                           uint8* dst, int dst_stride_frame,
+                           int width, int height) {
+  // Copy a plane 2 rows at a time, alternating the two source strides.
+  for (int y = 0; y < height; y += 2) {
+    memcpy(dst, src, width);
+    src += src_stride_0;
+    dst += dst_stride_frame;
+    memcpy(dst, src, width);
+    src += src_stride_1;
+    dst += dst_stride_frame;
+  }
+}
+
+// Support converting from FOURCC_M420
+// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is the stride of a single row; normally this is the width
+//   in pixels. The UV rows are half width but hold 2 values per pixel pair,
+//   so the same src_stride_m420 applies to them as to the Y rows.
+static int X420ToI420(const uint8* src_y,
+                      int src_stride_y0, int src_stride_y1,
+                      const uint8* src_uv, int src_stride_uv,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+#if defined(HAS_SPLITUV_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (halfwidth % 16 == 0) &&
+      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
+      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
+      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
+    SplitUV = SplitUV_NEON;
+  } else
+#elif defined(HAS_SPLITUV_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (halfwidth % 16 == 0) &&
+      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
+      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
+      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
+    SplitUV = SplitUV_SSE2;
+  } else
+#endif
+  {
+    SplitUV = SplitUV_C;
+  }
+
+  I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+                 width, height);
+
+  int halfheight = (height + 1) >> 1;
+  for (int y = 0; y < halfheight; ++y) {
+    // Split a row of UV into U and V.
+    SplitUV(src_uv, dst_u, dst_v, halfwidth);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+  return 0;
+}
+
+// Convert M420 to I420.
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
+}
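+
+// For reference, a 4x4 M420 frame with stride 4 is laid out as repeating
+// groups of two Y rows followed by one interleaved chroma row:
+//   Y row 0, Y row 1, chroma row 0, Y row 2, Y row 3, chroma row 1
+// which is why X420ToI420 is passed alternating Y row advances of stride and
+// stride * 2, and a chroma stride of stride * 3.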
+
+// Convert NV12 to I420.
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_uv, src_stride_uv,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
+}
+
+// Convert NV12 to I420.  Deprecated.
+int NV12ToI420(const uint8* src_y,
+               const uint8* src_uv,
+               int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride_frame, src_stride_frame,
+                    src_uv, src_stride_frame,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
+}
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_SPLITYUY2_SSE2
+__declspec(naked)
+static void SplitYUY2_SSE2(const uint8* src_yuy2,
+                           uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        edx, [esp + 8 + 8]    // dst_y
+    mov        esi, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    movdqa     xmm3, xmm1
+    pand       xmm2, xmm7   // even bytes are Y
+    pand       xmm3, xmm7
+    packuswb   xmm2, xmm3
+    movdqa     [edx], xmm2
+    lea        edx, [edx + 16]
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm7  // U
+    packuswb   xmm0, xmm0
+    movq       qword ptr [esi], xmm0
+    lea        esi, [esi + 8]
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edi], xmm1
+    lea        edi, [edi + 8]
+    sub        ecx, 16
+    ja         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_SPLITYUY2_SSE2
+static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
+                           uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "pand       %%xmm7,%%xmm2\n"
+  "pand       %%xmm7,%%xmm3\n"
+  "packuswb   %%xmm3,%%xmm2\n"
+  "movdqa     %%xmm2,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm1\n"
+  "movq       %%xmm1,(%3)\n"
+  "lea        0x8(%3),%3\n"
+  "sub        $0x10,%4\n"
+  "ja         1b\n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_y),       // %1
+    "+r"(dst_u),       // %2
+    "+r"(dst_v),       // %3
+    "+r"(pix)          // %4
+  :
+  : "memory"
+);
+}
+#endif
+
+static void SplitYUY2_C(const uint8* src_yuy2,
+                        uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
+  // Split a row of YUY2 into Y, U and V.
+  for (int x = 0; x < pix; x += 2) {
+    dst_y[0] = src_yuy2[0];
+    dst_y[1] = src_yuy2[2];
+    dst_u[0] = src_yuy2[1];
+    dst_v[0] = src_yuy2[3];
+    src_yuy2 += 4;
+    dst_y += 2;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Convert Q420 to I420.
+// Format is rows of YY/YUYV
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  void (*SplitYUY2)(const uint8* src_yuy2,
+                    uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
+#if defined(HAS_SPLITYUY2_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    SplitYUY2 = SplitYUY2_SSE2;
+  } else
+#endif
+  {
+    SplitYUY2 = SplitYUY2_C;
+  }
+  for (int y = 0; y < height; y += 2) {
+    memcpy(dst_y, src_y, width);
+    dst_y += dst_stride_y;
+    src_y += src_stride_y;
+
+    // Split a row of YUY2 into Y, U and V.
+    SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_yuy2 += src_stride_yuy2;
+  }
+  return 0;
+}
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_YUY2TOI420ROW_SSE2
+__declspec(naked)
+void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
+                         uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+    pcmpeqb    xmm7, xmm7        // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm7   // even bytes are Y
+    pand       xmm1, xmm7
+    packuswb   xmm0, xmm1
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         wloop
+    ret
+  }
+}
+
+__declspec(naked)
+void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm7  // U
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edi], xmm1
+    lea        edi, [edi + 8]
+    sub        ecx, 16
+    ja         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#define HAS_UYVYTOI420ROW_SSE2
+__declspec(naked)
+void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
+                         uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         wloop
+    ret
+  }
+}
+
+__declspec(naked)
+void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    pand       xmm0, xmm7   // UYVY -> UVUV
+    pand       xmm1, xmm7
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm7  // U
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edi], xmm1
+    lea        edi, [edi + 8]
+    sub        ecx, 16
+    ja         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
+#define HAS_YUY2TOI420ROW_SSE2
+static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
+                                uint8* dst_y, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory"
+);
+}
+
+static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                                 uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%4,1),%%xmm2\n"
+  "movdqa     0x10(%0,%4,1),%%xmm3\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm1\n"
+  "movq       %%xmm1,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
+  : "memory"
+);
+}
+#define HAS_UYVYTOI420ROW_SSE2
+static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
+                                uint8* dst_y, int pix) {
+  asm volatile(
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory"
+);
+}
+
+static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                                 uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%4,1),%%xmm2\n"
+  "movdqa     0x10(%0,%4,1),%%xmm3\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm1\n"
+  "movq       %%xmm1,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
+  : "memory"
+);
+}
+#endif
+
+// Filter 2 rows of YUY2 UVs (422) into U and V (420).
+void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  // Output a row of UV values, filtering 2 rows of YUY2
+  for (int x = 0; x < pix; x += 2) {
+    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+    src_yuy2 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void YUY2ToI420RowY_C(const uint8* src_yuy2,
+                      uint8* dst_y, int pix) {
+  // Copy a row of yuy2 Y values
+  for (int x = 0; x < pix; ++x) {
+    dst_y[0] = src_yuy2[0];
+    src_yuy2 += 2;
+    dst_y += 1;
+  }
+}
+
+void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  // Output a row of UV values, filtering 2 rows of UYVY
+  for (int x = 0; x < pix; x += 2) {
+    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+    src_uyvy += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void UYVYToI420RowY_C(const uint8* src_uyvy,
+                      uint8* dst_y, int pix) {
+  // Copy a row of uyvy Y values
+  for (int x = 0; x < pix; ++x) {
+    dst_y[0] = src_uyvy[1];
+    src_uyvy += 2;
+    dst_y += 1;
+  }
+}
+
+// Convert YUY2 to I420.
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+  void (*YUY2ToI420RowY)(const uint8* src_yuy2,
+                         uint8* dst_y, int pix);
+#if defined(HAS_YUY2TOI420ROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
+    YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
+  } else
+#endif
+  {
+    YUY2ToI420RowY = YUY2ToI420RowY_C;
+    YUY2ToI420RowUV = YUY2ToI420RowUV_C;
+  }
+  for (int y = 0; y < height; ++y) {
+    if ((y & 1) == 0) {
+      if (y >= (height - 1)) {  // Odd height: clamp to the last row.
+        src_stride_yuy2 = 0;
+      }
+      YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    YUY2ToI420RowY(src_yuy2, dst_y, width);
+    dst_y += dst_stride_y;
+    src_yuy2 += src_stride_yuy2;
+  }
+  return 0;
+}
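+
+// Usage sketch (illustrative comment only): YUY2 packs 2 pixels into 4 bytes,
+// so its stride is normally width * 2. Buffer names are hypothetical and even
+// width and height are assumed.
+//
+//   libyuv::YUY2ToI420(yuy2, width * 2,
+//                      y, width, u, width / 2, v, width / 2,
+//                      width, height);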
+
+// Convert UYVY to I420.
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+  void (*UYVYToI420RowY)(const uint8* src_uyvy,
+                         uint8* dst_y, int pix);
+#if defined(HAS_UYVYTOI420ROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    UYVYToI420RowY = UYVYToI420RowY_SSE2;
+    UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
+  } else
+#endif
+  {
+    UYVYToI420RowY = UYVYToI420RowY_C;
+    UYVYToI420RowUV = UYVYToI420RowUV_C;
+  }
+  for (int y = 0; y < height; ++y) {
+    if ((y & 1) == 0) {
+      if (y >= (height - 1)) {  // Odd height: clamp to the last row.
+        src_stride_uyvy = 0;
+      }
+      UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    UYVYToI420RowY(src_uyvy, dst_y, width);
+    dst_y += dst_stride_y;
+    src_uyvy += src_stride_uyvy;
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB.
+// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
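+
+// Usage sketch (illustrative comment only): converting a contiguous I420
+// frame to a packed ARGB buffer of width * 4 bytes per row. Buffer names are
+// hypothetical.
+//
+//   libyuv::I420ToARGB(y, width, u, width / 2, v, width / 2,
+//                      argb, width * 4, width, height);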
+
+// Convert I420 to BGRA.
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  EMMS();
+  return 0;
+}
+
+// Convert I420 to ABGR.
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  EMMS();
+  return 0;
+}
+
+// Convert I422 to ARGB.
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
+
+// Convert I444 to ARGB.
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  // MMX used for FastConvertYUV444ToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
+
+// Convert I400 to ARGB.
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYToRGB32Row(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  // MMX used for FastConvertYToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
+
+// TODO(fbarchard): 64 bit version
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+
+#define HAS_I400TOARGBROW_SSE2
+__declspec(naked)
+static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]        // src_y
+    mov        edx, [esp + 8]        // dst_argb
+    mov        ecx, [esp + 12]       // pix
+    pcmpeqb    xmm7, xmm7            // generate mask 0xff000000
+    pslld      xmm7, 24
+
+  wloop:
+    movq       xmm0, qword ptr [eax]
+    lea        eax,  [eax + 8]
+    punpcklbw  xmm0, xmm0
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0
+    punpckhwd  xmm1, xmm1
+    por        xmm0, xmm7
+    por        xmm1, xmm7
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    ja         wloop
+    ret
+  }
+}
+
+#define HAS_ABGRTOARGBROW_SSSE3
+__declspec(naked)
+static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
+                                int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_abgr
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm7, _kShuffleMaskABGRToARGB
+
+  convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax + 16]
+    pshufb    xmm0, xmm7
+    movdqa    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 4
+    ja        convertloop
+    ret
+  }
+}
+
+#define HAS_BGRATOARGBROW_SSSE3
+__declspec(naked)
+static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
+                                int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_bgra
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm7, _kShuffleMaskBGRAToARGB
+
+  convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax + 16]
+    pshufb    xmm0, xmm7
+    movdqa    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 4
+    ja        convertloop
+    ret
+  }
+}
+
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
+// TODO(yuche): consider moving ARGB related code to a separate file.
+#define HAS_I400TOARGBROW_SSE2
+static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "pslld      $0x18,%%xmm7\n"
+"1:"
+  "movq       (%0),%%xmm0\n"
+  "lea        0x8(%0),%0\n"
+  "punpcklbw  %%xmm0,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "punpcklwd  %%xmm0,%%xmm0\n"
+  "punpckhwd  %%xmm1,%%xmm1\n"
+  "por        %%xmm7,%%xmm0\n"
+  "por        %%xmm7,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "lea        0x20(%1),%1\n"
+  "sub        $0x8,%2\n"
+  "ja         1b\n"
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory"
+);
+}
+
+#define HAS_ABGRTOARGBROW_SSSE3
+static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
+                                int pix) {
+  asm volatile(
+  "movdqa     (%3),%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "lea        0x10(%0),%0\n"
+  "pshufb     %%xmm7,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x4,%2\n"
+  "ja         1b\n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(kShuffleMaskABGRToARGB)  // %3
+  : "memory"
+);
+}
+
+#define HAS_BGRATOARGBROW_SSSE3
+static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
+                                int pix) {
+  asm volatile(
+  "movdqa     (%3),%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "lea        0x10(%0),%0\n"
+  "pshufb     %%xmm7,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x4,%2\n"
+  "ja         1b\n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(kShuffleMaskBGRAToARGB)  // %3
+  : "memory"
+);
+}
+
+#endif
+
+static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
+  // Copy a Y to RGB.
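+  // e.g. Y = 0x80 yields the bytes 80 80 80 FF (0xFF808080 when read as a
+  // little endian uint32); no YUV range expansion is applied here.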
+  for (int x = 0; x < pix; ++x) {
+    uint8 y = src_y[0];
+    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    ++src_y;
+  }
+}
+
+// Convert I400 to ARGB.
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
+#if defined(HAS_I400TOARGBROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    I400ToARGBRow = I400ToARGBRow_SSE2;
+  } else
+#endif
+  {
+    I400ToARGBRow = I400ToARGBRow_C;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    I400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
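+
+// A minimal usage sketch (the buffer names are illustrative, not part of the
+// API): expand an 8x8 grayscale plane with stride == width into ARGB.
+//   uint8 gray[8 * 8];        // Y plane.
+//   uint8 argb[8 * 8 * 4];    // 4 bytes per ARGB pixel.
+//   libyuv::I400ToARGB(gray, 8, argb, 8 * 4, 8, 8);
+// The SSE2 row function is only selected when the width is a multiple of 8
+// and the pointer and stride alignment checks above pass; otherwise the C
+// row function is used, so unaligned buffers still work.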
+
+static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    // To support in-place conversion.
+    uint8 r = src_abgr[0];
+    uint8 g = src_abgr[1];
+    uint8 b = src_abgr[2];
+    uint8 a = src_abgr[3];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
+    dst_argb += 4;
+    src_abgr += 4;
+  }
+}
+
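+// Convert ABGR to ARGB.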
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
+#if defined(HAS_ABGRTOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 4 == 0) &&
+      IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    ABGRToARGBRow = ABGRToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    ABGRToARGBRow = ABGRToARGBRow_C;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    ABGRToARGBRow(src_abgr, dst_argb, width);
+    src_abgr += src_stride_abgr;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    // To support in-place conversion.
+    uint8 a = src_bgra[0];
+    uint8 r = src_bgra[1];
+    uint8 g = src_bgra[2];
+    uint8 b = src_bgra[3];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
+    dst_argb += 4;
+    src_bgra += 4;
+  }
+}
+
+// Convert BGRA to ARGB.
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
+#if defined(HAS_BGRATOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 4 == 0) &&
+      IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    BGRAToARGBRow = BGRAToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    BGRAToARGBRow = BGRAToARGBRow_C;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    BGRAToARGBRow(src_bgra, dst_argb, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 4 == 0) &&
+      IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    ARGBToYRow = ARGBToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = ARGBToYRow_C;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Convert RAW to ARGB.
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    RAWToARGBRow = RAWToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    RAWToARGBRow = RAWToARGBRow_C;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    RAWToARGBRow(src_raw, dst_argb, width);
+    src_raw += src_stride_raw;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert BG24 to ARGB.
+int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
+    src_stride_bg24 = -src_stride_bg24;
+  }
+  void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
+#if defined(HAS_BG24TOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    BG24ToARGBRow = BG24ToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    BG24ToARGBRow = BG24ToARGBRow_C;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    BG24ToARGBRow(src_bg24, dst_argb, width);
+    src_bg24 += src_stride_bg24;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+}  // namespace libyuv
+
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
new file mode 100644
index 0000000..7d3a332
--- /dev/null
+++ b/files/source/rotate.cc
@@ -0,0 +1,1310 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "rotate_priv.h"
+
+#include "libyuv/cpu_id.h"
+
+namespace libyuv {
+
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#if defined(_MSC_VER)
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+#else
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#endif
+// Shuffle table for reversing the bytes.
+extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
+  { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
+// Shuffle table for reversing the bytes of UV channels.
+extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
+  { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
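+// With pshufb, output byte i is taken from input byte mask[i], so
+// kShuffleReverse maps input bytes 15..0 to output bytes 0..15, reversing a
+// 16 byte block.  kShuffleReverseUV also de-interleaves: the even (first
+// channel) bytes land reversed in the low 8 bytes and the odd (second
+// channel) bytes land reversed in the high 8 bytes, ready for the
+// movlpd/movhpd stores below.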
+#endif
+
+typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
+typedef void (*reverse_func)(const uint8*, uint8*, int);
+typedef void (*rotate_uv_wx8_func)(const uint8*, int,
+                                   uint8*, int,
+                                   uint8*, int, int);
+typedef void (*rotate_uv_wxh_func)(const uint8*, int,
+                                   uint8*, int,
+                                   uint8*, int, int, int);
+typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
+typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
+
+#ifdef __ARM_NEON__
+extern "C" {
+void RestoreRegisters_NEON(unsigned long long *restore);
+void SaveRegisters_NEON(unsigned long long *store);
+#define HAS_REVERSE_LINE_NEON
+void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
+#define HAS_REVERSE_LINE_UV_NEON
+void ReverseLineUV_NEON(const uint8* src,
+                        uint8* dst_a, uint8* dst_b,
+                        int width);
+#define HAS_TRANSPOSE_WX8_NEON
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWX8_NEON
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width);
+}  // extern "C"
+#endif
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_TRANSPOSE_WX8_SSSE3
+__declspec(naked)
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                               uint8* dst, int dst_stride, int width) {
+__asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    movq      qword ptr [edx + esi], xmm7
+    lea       edx, [edx + 2 * esi]
+    sub       ecx, 8
+    ja        convertloop
+
+    pop       ebp
+    pop       esi
+    pop       edi
+    ret
+  }
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+__declspec(naked)
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                uint8* dst_a, int dst_stride_a,
+                                uint8* dst_b, int dst_stride_b,
+                                int w) {
+__asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqa    xmm2, [eax]
+    movdqa    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqa    xmm4, [eax]
+    movdqa    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqa    xmm6, [eax]
+    movdqa    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]
+    neg       edi
+    // Second round of bit swap.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqa    xmm5, [esp]  // restore xmm5
+    movdqa    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqa    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    sub       ecx, 8
+    ja        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+#elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_TRANSPOSE_WX8_SSSE3
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                               uint8* dst, int dst_stride, int width) {
+  asm volatile(
+"1:"
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  "movq       (%0),%%xmm0\n"
+  "movq       (%0,%3),%%xmm1\n"
+  "lea        (%0,%3,2),%0\n"
+  "punpcklbw  %%xmm1,%%xmm0\n"
+  "movq       (%0),%%xmm2\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "palignr    $0x8,%%xmm1,%%xmm1\n"
+  "movq       (%0,%3),%%xmm3\n"
+  "lea        (%0,%3,2),%0\n"
+  "punpcklbw  %%xmm3,%%xmm2\n"
+  "movdqa     %%xmm2,%%xmm3\n"
+  "movq       (%0),%%xmm4\n"
+  "palignr    $0x8,%%xmm3,%%xmm3\n"
+  "movq       (%0,%3),%%xmm5\n"
+  "lea        (%0,%3,2),%0\n"
+  "punpcklbw  %%xmm5,%%xmm4\n"
+  "movdqa     %%xmm4,%%xmm5\n"
+  "movq       (%0),%%xmm6\n"
+  "palignr    $0x8,%%xmm5,%%xmm5\n"
+  "movq       (%0,%3),%%xmm7\n"
+  "lea        (%0,%3,2),%0\n"
+  "punpcklbw  %%xmm7,%%xmm6\n"
+  "neg        %3\n"
+  "movdqa     %%xmm6,%%xmm7\n"
+  "lea        0x8(%0,%3,8),%0\n"
+  "palignr    $0x8,%%xmm7,%%xmm7\n"
+  "neg        %3\n"
+   // Second round of bit swap.
+  "punpcklwd  %%xmm2,%%xmm0\n"
+  "punpcklwd  %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "palignr    $0x8,%%xmm2,%%xmm2\n"
+  "palignr    $0x8,%%xmm3,%%xmm3\n"
+  "punpcklwd  %%xmm6,%%xmm4\n"
+  "punpcklwd  %%xmm7,%%xmm5\n"
+  "movdqa     %%xmm4,%%xmm6\n"
+  "movdqa     %%xmm5,%%xmm7\n"
+  "palignr    $0x8,%%xmm6,%%xmm6\n"
+  "palignr    $0x8,%%xmm7,%%xmm7\n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "punpckldq  %%xmm4,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "movdqa     %%xmm0,%%xmm4\n"
+  "palignr    $0x8,%%xmm4,%%xmm4\n"
+  "movq       %%xmm4,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm6,%%xmm2\n"
+  "movdqa     %%xmm2,%%xmm6\n"
+  "movq       %%xmm2,(%1)\n"
+  "palignr    $0x8,%%xmm6,%%xmm6\n"
+  "punpckldq  %%xmm5,%%xmm1\n"
+  "movq       %%xmm6,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "movdqa     %%xmm1,%%xmm5\n"
+  "movq       %%xmm1,(%1)\n"
+  "palignr    $0x8,%%xmm5,%%xmm5\n"
+  "movq       %%xmm5,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm7,%%xmm3\n"
+  "movq       %%xmm3,(%1)\n"
+  "movdqa     %%xmm3,%%xmm7\n"
+  "palignr    $0x8,%%xmm7,%%xmm7\n"
+  "movq       %%xmm7,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "sub        $0x8,%2\n"
+  "ja         1b\n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"(static_cast<intptr_t>(src_stride)),  // %3
+    "r"(static_cast<intptr_t>(dst_stride))   // %4
+  : "memory"
+);
+}
+
+#if defined (__i386__)
+#define HAS_TRANSPOSE_UVWX8_SSE2
+extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                    uint8* dst_a, int dst_stride_a,
+                                    uint8* dst_b, int dst_stride_b,
+                                    int w);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _TransposeUVWx8_SSE2\n"
+"_TransposeUVWx8_SSE2:\n"
+#else
+    ".global TransposeUVWx8_SSE2\n"
+"TransposeUVWx8_SSE2:\n"
+#endif
+    "push   %ebx\n"
+    "push   %esi\n"
+    "push   %edi\n"
+    "push   %ebp\n"
+    "mov    0x14(%esp),%eax\n"
+    "mov    0x18(%esp),%edi\n"
+    "mov    0x1c(%esp),%edx\n"
+    "mov    0x20(%esp),%esi\n"
+    "mov    0x24(%esp),%ebx\n"
+    "mov    0x28(%esp),%ebp\n"
+    "mov    %esp,%ecx\n"
+    "sub    $0x14,%esp\n"
+    "and    $0xfffffff0,%esp\n"
+    "mov    %ecx,0x10(%esp)\n"
+    "mov    0x2c(%ecx),%ecx\n"
+
+"1:"
+    "movdqa (%eax),%xmm0\n"
+    "movdqa (%eax,%edi,1),%xmm1\n"
+    "lea    (%eax,%edi,2),%eax\n"
+    "movdqa %xmm0,%xmm7\n"
+    "punpcklbw %xmm1,%xmm0\n"
+    "punpckhbw %xmm1,%xmm7\n"
+    "movdqa %xmm7,%xmm1\n"
+    "movdqa (%eax),%xmm2\n"
+    "movdqa (%eax,%edi,1),%xmm3\n"
+    "lea    (%eax,%edi,2),%eax\n"
+    "movdqa %xmm2,%xmm7\n"
+    "punpcklbw %xmm3,%xmm2\n"
+    "punpckhbw %xmm3,%xmm7\n"
+    "movdqa %xmm7,%xmm3\n"
+    "movdqa (%eax),%xmm4\n"
+    "movdqa (%eax,%edi,1),%xmm5\n"
+    "lea    (%eax,%edi,2),%eax\n"
+    "movdqa %xmm4,%xmm7\n"
+    "punpcklbw %xmm5,%xmm4\n"
+    "punpckhbw %xmm5,%xmm7\n"
+    "movdqa %xmm7,%xmm5\n"
+    "movdqa (%eax),%xmm6\n"
+    "movdqa (%eax,%edi,1),%xmm7\n"
+    "lea    (%eax,%edi,2),%eax\n"
+    "movdqa %xmm5,(%esp)\n"
+    "neg    %edi\n"
+    "movdqa %xmm6,%xmm5\n"
+    "punpcklbw %xmm7,%xmm6\n"
+    "punpckhbw %xmm7,%xmm5\n"
+    "movdqa %xmm5,%xmm7\n"
+    "lea    0x10(%eax,%edi,8),%eax\n"
+    "neg    %edi\n"
+    "movdqa %xmm0,%xmm5\n"
+    "punpcklwd %xmm2,%xmm0\n"
+    "punpckhwd %xmm2,%xmm5\n"
+    "movdqa %xmm5,%xmm2\n"
+    "movdqa %xmm1,%xmm5\n"
+    "punpcklwd %xmm3,%xmm1\n"
+    "punpckhwd %xmm3,%xmm5\n"
+    "movdqa %xmm5,%xmm3\n"
+    "movdqa %xmm4,%xmm5\n"
+    "punpcklwd %xmm6,%xmm4\n"
+    "punpckhwd %xmm6,%xmm5\n"
+    "movdqa %xmm5,%xmm6\n"
+    "movdqa (%esp),%xmm5\n"
+    "movdqa %xmm6,(%esp)\n"
+    "movdqa %xmm5,%xmm6\n"
+    "punpcklwd %xmm7,%xmm5\n"
+    "punpckhwd %xmm7,%xmm6\n"
+    "movdqa %xmm6,%xmm7\n"
+    "movdqa %xmm0,%xmm6\n"
+    "punpckldq %xmm4,%xmm0\n"
+    "punpckhdq %xmm4,%xmm6\n"
+    "movdqa %xmm6,%xmm4\n"
+    "movdqa (%esp),%xmm6\n"
+    "movlpd %xmm0,(%edx)\n"
+    "movhpd %xmm0,(%ebx)\n"
+    "movlpd %xmm4,(%edx,%esi,1)\n"
+    "lea    (%edx,%esi,2),%edx\n"
+    "movhpd %xmm4,(%ebx,%ebp,1)\n"
+    "lea    (%ebx,%ebp,2),%ebx\n"
+    "movdqa %xmm2,%xmm0\n"
+    "punpckldq %xmm6,%xmm2\n"
+    "movlpd %xmm2,(%edx)\n"
+    "movhpd %xmm2,(%ebx)\n"
+    "punpckhdq %xmm6,%xmm0\n"
+    "movlpd %xmm0,(%edx,%esi,1)\n"
+    "lea    (%edx,%esi,2),%edx\n"
+    "movhpd %xmm0,(%ebx,%ebp,1)\n"
+    "lea    (%ebx,%ebp,2),%ebx\n"
+    "movdqa %xmm1,%xmm0\n"
+    "punpckldq %xmm5,%xmm1\n"
+    "movlpd %xmm1,(%edx)\n"
+    "movhpd %xmm1,(%ebx)\n"
+    "punpckhdq %xmm5,%xmm0\n"
+    "movlpd %xmm0,(%edx,%esi,1)\n"
+    "lea    (%edx,%esi,2),%edx\n"
+    "movhpd %xmm0,(%ebx,%ebp,1)\n"
+    "lea    (%ebx,%ebp,2),%ebx\n"
+    "movdqa %xmm3,%xmm0\n"
+    "punpckldq %xmm7,%xmm3\n"
+    "movlpd %xmm3,(%edx)\n"
+    "movhpd %xmm3,(%ebx)\n"
+    "punpckhdq %xmm7,%xmm0\n"
+    "movlpd %xmm0,(%edx,%esi,1)\n"
+    "lea    (%edx,%esi,2),%edx\n"
+    "movhpd %xmm0,(%ebx,%ebp,1)\n"
+    "lea    (%ebx,%ebp,2),%ebx\n"
+    "sub    $0x8,%ecx\n"
+    "ja     1b\n"
+    "mov    0x10(%esp),%esp\n"
+    "pop    %ebp\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "pop    %ebx\n"
+    "ret\n"
+);
+#elif defined (__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
+#define HAS_TRANSPOSE_WX8_FAST_SSSE3
+static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
+                                    uint8* dst, int dst_stride, int width) {
+  asm volatile(
+"1:"
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     (%0,%3),%%xmm1\n"
+  "lea        (%0,%3,2),%0\n"
+  "movdqa     %%xmm0,%%xmm8\n"
+  "punpcklbw  %%xmm1,%%xmm0\n"
+  "punpckhbw  %%xmm1,%%xmm8\n"
+  "movdqa     (%0),%%xmm2\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "movdqa     %%xmm8,%%xmm9\n"
+  "palignr    $0x8,%%xmm1,%%xmm1\n"
+  "palignr    $0x8,%%xmm9,%%xmm9\n"
+  "movdqa     (%0,%3),%%xmm3\n"
+  "lea        (%0,%3,2),%0\n"
+  "movdqa     %%xmm2,%%xmm10\n"
+  "punpcklbw  %%xmm3,%%xmm2\n"
+  "punpckhbw  %%xmm3,%%xmm10\n"
+  "movdqa     %%xmm2,%%xmm3\n"
+  "movdqa     %%xmm10,%%xmm11\n"
+  "movdqa     (%0),%%xmm4\n"
+  "palignr    $0x8,%%xmm3,%%xmm3\n"
+  "palignr    $0x8,%%xmm11,%%xmm11\n"
+  "movdqa     (%0,%3),%%xmm5\n"
+  "lea        (%0,%3,2),%0\n"
+  "movdqa     %%xmm4,%%xmm12\n"
+  "punpcklbw  %%xmm5,%%xmm4\n"
+  "punpckhbw  %%xmm5,%%xmm12\n"
+  "movdqa     %%xmm4,%%xmm5\n"
+  "movdqa     %%xmm12,%%xmm13\n"
+  "movdqa     (%0),%%xmm6\n"
+  "palignr    $0x8,%%xmm5,%%xmm5\n"
+  "palignr    $0x8,%%xmm13,%%xmm13\n"
+  "movdqa     (%0,%3),%%xmm7\n"
+  "lea        (%0,%3,2),%0\n"
+  "movdqa     %%xmm6,%%xmm14\n"
+  "punpcklbw  %%xmm7,%%xmm6\n"
+  "punpckhbw  %%xmm7,%%xmm14\n"
+  "neg        %3\n"
+  "movdqa     %%xmm6,%%xmm7\n"
+  "movdqa     %%xmm14,%%xmm15\n"
+  "lea        0x10(%0,%3,8),%0\n"
+  "palignr    $0x8,%%xmm7,%%xmm7\n"
+  "palignr    $0x8,%%xmm15,%%xmm15\n"
+  "neg        %3\n"
+   // Second round of bit swap.
+  "punpcklwd  %%xmm2,%%xmm0\n"
+  "punpcklwd  %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "palignr    $0x8,%%xmm2,%%xmm2\n"
+  "palignr    $0x8,%%xmm3,%%xmm3\n"
+  "punpcklwd  %%xmm6,%%xmm4\n"
+  "punpcklwd  %%xmm7,%%xmm5\n"
+  "movdqa     %%xmm4,%%xmm6\n"
+  "movdqa     %%xmm5,%%xmm7\n"
+  "palignr    $0x8,%%xmm6,%%xmm6\n"
+  "palignr    $0x8,%%xmm7,%%xmm7\n"
+  "punpcklwd  %%xmm10,%%xmm8\n"
+  "punpcklwd  %%xmm11,%%xmm9\n"
+  "movdqa     %%xmm8,%%xmm10\n"
+  "movdqa     %%xmm9,%%xmm11\n"
+  "palignr    $0x8,%%xmm10,%%xmm10\n"
+  "palignr    $0x8,%%xmm11,%%xmm11\n"
+  "punpcklwd  %%xmm14,%%xmm12\n"
+  "punpcklwd  %%xmm15,%%xmm13\n"
+  "movdqa     %%xmm12,%%xmm14\n"
+  "movdqa     %%xmm13,%%xmm15\n"
+  "palignr    $0x8,%%xmm14,%%xmm14\n"
+  "palignr    $0x8,%%xmm15,%%xmm15\n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "punpckldq  %%xmm4,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "movdqa     %%xmm0,%%xmm4\n"
+  "palignr    $0x8,%%xmm4,%%xmm4\n"
+  "movq       %%xmm4,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm6,%%xmm2\n"
+  "movdqa     %%xmm2,%%xmm6\n"
+  "movq       %%xmm2,(%1)\n"
+  "palignr    $0x8,%%xmm6,%%xmm6\n"
+  "punpckldq  %%xmm5,%%xmm1\n"
+  "movq       %%xmm6,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "movdqa     %%xmm1,%%xmm5\n"
+  "movq       %%xmm1,(%1)\n"
+  "palignr    $0x8,%%xmm5,%%xmm5\n"
+  "movq       %%xmm5,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm7,%%xmm3\n"
+  "movq       %%xmm3,(%1)\n"
+  "movdqa     %%xmm3,%%xmm7\n"
+  "palignr    $0x8,%%xmm7,%%xmm7\n"
+  "movq       %%xmm7,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm12,%%xmm8\n"
+  "movq       %%xmm8,(%1)\n"
+  "movdqa     %%xmm8,%%xmm12\n"
+  "palignr    $0x8,%%xmm12,%%xmm12\n"
+  "movq       %%xmm12,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm14,%%xmm10\n"
+  "movdqa     %%xmm10,%%xmm14\n"
+  "movq       %%xmm10,(%1)\n"
+  "palignr    $0x8,%%xmm14,%%xmm14\n"
+  "punpckldq  %%xmm13,%%xmm9\n"
+  "movq       %%xmm14,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "movdqa     %%xmm9,%%xmm13\n"
+  "movq       %%xmm9,(%1)\n"
+  "palignr    $0x8,%%xmm13,%%xmm13\n"
+  "movq       %%xmm13,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "punpckldq  %%xmm15,%%xmm11\n"
+  "movq       %%xmm11,(%1)\n"
+  "movdqa     %%xmm11,%%xmm15\n"
+  "palignr    $0x8,%%xmm15,%%xmm15\n"
+  "movq       %%xmm15,(%1,%4)\n"
+  "lea        (%1,%4,2),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"(static_cast<intptr_t>(src_stride)),  // %3
+    "r"(static_cast<intptr_t>(dst_stride))   // %4
+  : "memory"
+);
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                uint8* dst_a, int dst_stride_a,
+                                uint8* dst_b, int dst_stride_b,
+                                int w) {
+  asm volatile(
+"1:"
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     (%0,%4),%%xmm1\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm0,%%xmm8\n"
+  "punpcklbw  %%xmm1,%%xmm0\n"
+  "punpckhbw  %%xmm1,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm1\n"
+  "movdqa     (%0),%%xmm2\n"
+  "movdqa     (%0,%4),%%xmm3\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm2,%%xmm8\n"
+  "punpcklbw  %%xmm3,%%xmm2\n"
+  "punpckhbw  %%xmm3,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm3\n"
+  "movdqa     (%0),%%xmm4\n"
+  "movdqa     (%0,%4),%%xmm5\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm4,%%xmm8\n"
+  "punpcklbw  %%xmm5,%%xmm4\n"
+  "punpckhbw  %%xmm5,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm5\n"
+  "movdqa     (%0),%%xmm6\n"
+  "movdqa     (%0,%4),%%xmm7\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm6,%%xmm8\n"
+  "punpcklbw  %%xmm7,%%xmm6\n"
+  "neg        %4\n"
+  "lea        0x10(%0,%4,8),%0\n"
+  "punpckhbw  %%xmm7,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm7\n"
+  "neg        %4\n"
+   // Second round of bit swap.
+  "movdqa     %%xmm0,%%xmm8\n"
+  "movdqa     %%xmm1,%%xmm9\n"
+  "punpckhwd  %%xmm2,%%xmm8\n"
+  "punpckhwd  %%xmm3,%%xmm9\n"
+  "punpcklwd  %%xmm2,%%xmm0\n"
+  "punpcklwd  %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm8,%%xmm2\n"
+  "movdqa     %%xmm9,%%xmm3\n"
+  "movdqa     %%xmm4,%%xmm8\n"
+  "movdqa     %%xmm5,%%xmm9\n"
+  "punpckhwd  %%xmm6,%%xmm8\n"
+  "punpckhwd  %%xmm7,%%xmm9\n"
+  "punpcklwd  %%xmm6,%%xmm4\n"
+  "punpcklwd  %%xmm7,%%xmm5\n"
+  "movdqa     %%xmm8,%%xmm6\n"
+  "movdqa     %%xmm9,%%xmm7\n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "movdqa     %%xmm0,%%xmm8\n"
+  "punpckldq  %%xmm4,%%xmm0\n"
+  "movlpd     %%xmm0,(%1)\n"  // Write back U channel
+  "movhpd     %%xmm0,(%2)\n"  // Write back V channel
+  "punpckhdq  %%xmm4,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "movdqa     %%xmm2,%%xmm8\n"
+  "punpckldq  %%xmm6,%%xmm2\n"
+  "movlpd     %%xmm2,(%1)\n"
+  "movhpd     %%xmm2,(%2)\n"
+  "punpckhdq  %%xmm6,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "movdqa     %%xmm1,%%xmm8\n"
+  "punpckldq  %%xmm5,%%xmm1\n"
+  "movlpd     %%xmm1,(%1)\n"
+  "movhpd     %%xmm1,(%2)\n"
+  "punpckhdq  %%xmm5,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "movdqa     %%xmm3,%%xmm8\n"
+  "punpckldq  %%xmm7,%%xmm3\n"
+  "movlpd     %%xmm3,(%1)\n"
+  "movhpd     %%xmm3,(%2)\n"
+  "punpckhdq  %%xmm7,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "sub        $0x8,%3\n"
+  "ja         1b\n"
+  : "+r"(src),    // %0
+    "+r"(dst_a),  // %1
+    "+r"(dst_b),  // %2
+    "+r"(w)   // %3
+  : "r"(static_cast<intptr_t>(src_stride)),    // %4
+    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
+    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
+  : "memory"
+);
+}
+#endif
+#endif
+
+static void TransposeWx8_C(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride,
+                           int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    dst[0] = src[0 * src_stride];
+    dst[1] = src[1 * src_stride];
+    dst[2] = src[2 * src_stride];
+    dst[3] = src[3 * src_stride];
+    dst[4] = src[4 * src_stride];
+    dst[5] = src[5 * src_stride];
+    dst[6] = src[6 * src_stride];
+    dst[7] = src[7 * src_stride];
+    ++src;
+    dst += dst_stride;
+  }
+}
+
+static void TransposeWxH_C(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride,
+                           int width, int height) {
+  int i, j;
+  for (i = 0; i < width; ++i)
+    for (j = 0; j < height; ++j)
+      dst[i * dst_stride + j] = src[j * src_stride + i];
+}
+
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i = height;
+  rotate_wx8_func TransposeWx8;
+  rotate_wxh_func TransposeWxH;
+
+#if defined(HAS_TRANSPOSE_WX8_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
+      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+    TransposeWx8 = TransposeWx8_NEON;
+    TransposeWxH = TransposeWxH_C;
+  } else
+#endif
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+    TransposeWx8 = TransposeWx8_FAST_SSSE3;
+    TransposeWxH = TransposeWxH_C;
+  } else
+#endif
+#if defined(HAS_TRANSPOSE_WX8_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
+      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+    TransposeWx8 = TransposeWx8_SSSE3;
+    TransposeWxH = TransposeWxH_C;
+  } else
+#endif
+  {
+    TransposeWx8 = TransposeWx8_C;
+    TransposeWxH = TransposeWxH_C;
+  }
+
+  // work across the source in 8x8 tiles
+  while (i >= 8) {
+    TransposeWx8(src, src_stride, dst, dst_stride, width);
+
+    src += 8 * src_stride;    // go down 8 rows
+    dst += 8;                 // move over 8 columns
+    i   -= 8;
+  }
+
+  TransposeWxH(src, src_stride, dst, dst_stride, width, i);
+}
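+
+// For example, a height of 21 is processed as two 8 row tiles by
+// TransposeWx8 followed by a 5 row residual handled by TransposeWxH.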
+
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Rotate by 90 is a transpose with the source read
+  // from bottom to top.  So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
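+
+// For example, the 2x2 source {a,b},{c,d} is first flipped vertically to
+// {c,d},{a,b} and then transposed to {c,a},{d,b}, i.e. a quarter turn
+// clockwise.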
+
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // Rotate by 270 is a transpose with the destination written
+  // from bottom to top.  So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  src += width - 1;
+  for (i = 0; i < width; ++i) {
+    dst[i] = src[0];
+    --src;
+  }
+}
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_REVERSE_LINE_SSSE3
+__declspec(naked)
+static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
+__asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm7, _kShuffleReverse
+    lea       eax, [eax + ecx - 16]
+ convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufb    xmm0, xmm7
+    movdqa    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 16
+    ja        convertloop
+    ret
+  }
+}
+
+#elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_REVERSE_LINE_SSSE3
+static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = static_cast<intptr_t>(width);
+  asm volatile(
+  "movdqa     (%3),%%xmm7\n"
+  "lea        -0x10(%0,%2,1),%0\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "lea        -0x10(%0),%0\n"
+  "pshufb     %%xmm7,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(temp_width)   // %2
+  : "r"(kShuffleReverse)   // %3
+  : "memory"
+);
+}
+#endif
+
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i;
+  reverse_func ReverseLine;
+
+#if defined(HAS_REVERSE_LINE_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
+    ReverseLine = ReverseLine_NEON;
+  } else
+#endif
+#if defined(HAS_REVERSE_LINE_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
+    ReverseLine = ReverseLine_SSSE3;
+  } else
+#endif
+  {
+    ReverseLine = ReverseLine_C;
+  }
+  // Rotate by 180 is a mirror and vertical flip
+  src += src_stride * (height - 1);
+
+  for (i = 0; i < height; ++i) {
+    ReverseLine(src, dst, width);
+    src -= src_stride;
+    dst += dst_stride;
+  }
+}
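+
+// For example, the 2x2 source {a,b},{c,d} becomes {d,c},{b,a}: each output
+// row is the mirror of a source row, taken from the bottom up.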
+
+static void TransposeUVWx8_C(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b,
+                             int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    dst_a[0] = src[0 * src_stride + 0];
+    dst_b[0] = src[0 * src_stride + 1];
+    dst_a[1] = src[1 * src_stride + 0];
+    dst_b[1] = src[1 * src_stride + 1];
+    dst_a[2] = src[2 * src_stride + 0];
+    dst_b[2] = src[2 * src_stride + 1];
+    dst_a[3] = src[3 * src_stride + 0];
+    dst_b[3] = src[3 * src_stride + 1];
+    dst_a[4] = src[4 * src_stride + 0];
+    dst_b[4] = src[4 * src_stride + 1];
+    dst_a[5] = src[5 * src_stride + 0];
+    dst_b[5] = src[5 * src_stride + 1];
+    dst_a[6] = src[6 * src_stride + 0];
+    dst_b[6] = src[6 * src_stride + 1];
+    dst_a[7] = src[7 * src_stride + 0];
+    dst_b[7] = src[7 * src_stride + 1];
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+  }
+}
+
+static void TransposeUVWxH_C(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b,
+                             int w, int h) {
+  int i, j;
+  for (i = 0; i < w * 2; i += 2)
+    for (j = 0; j < h; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+}
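+
+// In the UV row functions above, the source is interleaved as U0 V0 U1 V1
+// ..., so w counts UV pairs per row; the U bytes are transposed into dst_a
+// and the V bytes into dst_b.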
+
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i = height;
+  rotate_uv_wx8_func TransposeWx8;
+  rotate_uv_wxh_func TransposeWxH;
+
+#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+  unsigned long long store_reg[8];
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
+    SaveRegisters_NEON(store_reg);
+    TransposeWx8 = TransposeUVWx8_NEON;
+    TransposeWxH = TransposeUVWxH_C;
+  } else
+#endif
+#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
+    TransposeWx8 = TransposeUVWx8_SSE2;
+    TransposeWxH = TransposeUVWxH_C;
+  } else
+#endif
+  {
+    TransposeWx8 = TransposeUVWx8_C;
+    TransposeWxH = TransposeUVWxH_C;
+  }
+
+  // work through the source in 8x8 tiles
+  while (i >= 8) {
+    TransposeWx8(src, src_stride,
+                 dst_a, dst_stride_a,
+                 dst_b, dst_stride_b,
+                 width);
+
+    src   += 8 * src_stride;    // go down 8 rows
+    dst_a += 8;                 // move over 8 columns
+    dst_b += 8;                 // move over 8 columns
+    i     -= 8;
+  }
+
+  TransposeWxH(src, src_stride,
+               dst_a, dst_stride_a,
+               dst_b, dst_stride_b,
+               width, i);
+
+#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
+    RestoreRegisters_NEON(store_reg);
+  }
+#endif
+}
+
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height) {
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+
+  TransposeUV(src, src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  dst_a += dst_stride_a * (width - 1);
+  dst_b += dst_stride_b * (width - 1);
+  dst_stride_a = -dst_stride_a;
+  dst_stride_b = -dst_stride_b;
+
+  TransposeUV(src, src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_REVERSE_LINE_UV_SSSE3
+__declspec(naked)
+void ReverseLineUV_SSSE3(const uint8* src,
+                         uint8* dst_a, uint8* dst_b,
+                         int width) {
+__asm {
+    push      edi
+    mov       eax, [esp + 4 + 4]   // src
+    mov       edx, [esp + 4 + 8]   // dst_a
+    mov       edi, [esp + 4 + 12]  // dst_b
+    mov       ecx, [esp + 4 + 16]  // width
+    movdqa    xmm7, _kShuffleReverseUV
+    lea       eax, [eax + ecx * 2 - 16]
+
+ convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufb    xmm0, xmm7
+    movlpd    qword ptr [edx], xmm0
+    lea       edx, [edx + 8]
+    movhpd    qword ptr [edi], xmm0
+    lea       edi, [edi + 8]
+    sub       ecx, 8
+    ja        convertloop
+    pop       edi
+    ret
+  }
+}
+
+#elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_REVERSE_LINE_UV_SSSE3
+void ReverseLineUV_SSSE3(const uint8* src,
+                         uint8* dst_a, uint8* dst_b,
+                         int width) {
+  intptr_t temp_width = static_cast<intptr_t>(width);
+  asm volatile(
+  "movdqa     (%4),%%xmm7\n"
+  "lea        -0x10(%0,%3,2),%0\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "lea        -0x10(%0),%0\n"
+  "pshufb     %%xmm7,%%xmm0\n"
+  "movlpd     %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "movhpd     %%xmm0,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "sub        $0x8,%3\n"
+  "ja         1b\n"
+  : "+r"(src),      // %0
+    "+r"(dst_a),    // %1
+    "+r"(dst_b),    // %2
+    "+r"(temp_width)     // %3
+  : "r"(kShuffleReverseUV)  // %4
+  : "memory"
+);
+}
+#endif
+
+static void ReverseLineUV_C(const uint8* src,
+                            uint8* dst_a, uint8* dst_b,
+                            int width) {
+  int i;
+  src += width << 1;
+  for (i = 0; i < width; ++i) {
+    src -= 2;
+    dst_a[i] = src[0];
+    dst_b[i] = src[1];
+  }
+}
+
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i;
+  reverse_uv_func ReverseLine;
+
+#if defined(HAS_REVERSE_LINE_UV_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
+    ReverseLine = ReverseLineUV_NEON;
+  } else
+#endif
+#if defined(HAS_REVERSE_LINE_UV_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
+    ReverseLine = ReverseLineUV_SSSE3;
+  } else
+#endif
+  {
+    ReverseLine = ReverseLineUV_C;
+  }
+
+  dst_a += dst_stride_a * (height - 1);
+  dst_b += dst_stride_b * (height - 1);
+
+  for (i = 0; i < height; ++i) {
+    ReverseLine(src, dst_a, dst_b, width);
+
+    src   += src_stride;      // down one line at a time
+    dst_a -= dst_stride_a;    // nominally up one line at a time
+    dst_b -= dst_stride_b;    // nominally up one line at a time
+  }
+}
+
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height,
+               RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return I420Copy(src_y, src_stride_y,
+                      src_u, src_stride_u,
+                      src_v, src_stride_v,
+                      dst_y, dst_stride_y,
+                      dst_u, dst_stride_u,
+                      dst_v, dst_stride_v,
+                      width, height);
+    case kRotate90:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotatePlane90(src_u, src_stride_u,
+                    dst_u, dst_stride_u,
+                    halfwidth, halfheight);
+      RotatePlane90(src_v, src_stride_v,
+                    dst_v, dst_stride_v,
+                    halfwidth, halfheight);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotatePlane270(src_u, src_stride_u,
+                     dst_u, dst_stride_u,
+                     halfwidth, halfheight);
+      RotatePlane270(src_v, src_stride_v,
+                     dst_v, dst_stride_v,
+                     halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotatePlane180(src_u, src_stride_u,
+                     dst_u, dst_stride_u,
+                     halfwidth, halfheight);
+      RotatePlane180(src_v, src_stride_v,
+                     dst_v, dst_stride_v,
+                     halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
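+
+// A usage sketch (variable names are the caller's own, shown only for
+// illustration): rotating by kRotate90 or kRotate270 swaps the plane
+// dimensions, so the destination planes are height pixels wide and width
+// pixels tall and their strides must fit the rotated width, e.g.
+// dst_stride_y >= height and dst_stride_u/dst_stride_v >= (height + 1) / 2.
+//   libyuv::I420Rotate(src_y, width, src_u, halfwidth, src_v, halfwidth,
+//                      dst_y, height, dst_u, halfheight, dst_v, halfheight,
+//                      width, height, libyuv::kRotate90);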
+
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int width, int height,
+                     RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return NV12ToI420(src_y, src_uv, src_stride_y,
+                        dst_y, dst_stride_y,
+                        dst_u, dst_stride_u,
+                        dst_v, dst_stride_v,
+                        width, height);
+    case kRotate90:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotateUV90(src_uv, src_stride_uv,
+                 dst_u, dst_stride_u,
+                 dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV270(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV180(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+}  // namespace libyuv
diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
new file mode 100644
index 0000000..75ea957
--- /dev/null
+++ b/files/source/rotate_neon.s
@@ -0,0 +1,563 @@
+  .global RestoreRegisters_NEON
+  .global ReverseLine_NEON
+  .global ReverseLineUV_NEON
+  .global SaveRegisters_NEON
+  .global TransposeWx8_NEON
+  .global TransposeUVWx8_NEON
+  .type RestoreRegisters_NEON, function
+  .type ReverseLine_NEON, function
+  .type ReverseLineUV_NEON, function
+  .type SaveRegisters_NEON, function
+  .type TransposeWx8_NEON, function
+  .type TransposeUVWx8_NEON, function
+
+@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
+@ r0 const uint8* src
+@ r1 uint8* dst
+@ r2 width
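+@ handles the width in three stages: 16 byte blocks, then 2 byte pairs, then
+@ a final single byte, e.g. a width of 37 is 2*16 + 2*2 + 1.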
+ReverseLine_NEON:
+
+  @ compute where to start writing destination
+  add         r1, r2      @ dst + width
+
+  @ work on segments that are multiples of 16
+  lsrs        r3, r2, #4
+
+  @ the output is written in two blocks: 8 bytes followed by
+  @ another 8.  reading is done sequentially, from left to
+  @ right.  writing is done from right to left in 8 byte blocks.
+  @ r1, the destination pointer, is incremented after writing
+  @ the first of the two blocks, so subtract that 8 off along
+  @ with 16 to get the next write location.
+  mov         r3, #-24
+
+  beq         Lline_residuals
+
+  @ back off the destination by the size of the register that
+  @ is going to be reversed
+  sub         r1, #16
+
+  @ the loop needs to run on blocks of 16.  what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0.  if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub         r2, #16
+
+Lsegments_of_16:
+    vld1.8      {q0}, [r0]!               @ src += 16
+
+    @ reverse the bytes within each 64 bit segment; the entire
+    @ 128 bits cannot be reversed in one instruction.
+    vrev64.8    q0, q0
+
+    @ because the entire 128 bits cannot be reversed at once,
+    @ write the two 64 bit segments out in swapped order.
+    vst1.8      {d1}, [r1]!
+    vst1.8      {d0}, [r1], r3            @ dst -= 16
+
+    subs        r2, #16
+    bge         Lsegments_of_16
+
+  @ add 16 back to the counter.  if the result is 0 there are
+  @ no residuals, so return
+  adds        r2, #16
+  bxeq        lr
+
+  add         r1, #16
+
+Lline_residuals:
+
+  mov         r3, #-3
+
+  sub         r1, #2
+  subs        r2, #2
+  @ check for 16*n+1 scenarios where segments_of_2 should not
+  @ be run, but there is something left over.
+  blt         Lsegment_of_1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_2:
+    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
+
+    vst1.8      {d1[0]}, [r1]!
+    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2
+
+    subs        r2, #2
+    bge         Lsegments_of_2
+
+  adds        r2, #2
+  bxeq        lr
+
+Lsegment_of_1:
+  add         r1, #1
+  vld1.8      {d0[0]}, [r0]
+  vst1.8      {d0[0]}, [r1]
+
+  bx          lr
+
+@ void TransposeWx8_NEON (const uint8* src, int src_stride,
+@                         uint8* dst, int dst_stride,
+@                         int w)
+@ r0 const uint8* src
+@ r1 int src_stride
+@ r2 uint8* dst
+@ r3 int dst_stride
+@ stack int w
+TransposeWx8_NEON:
+  push        {r4,r8,r9,lr}
+
+  ldr         r8, [sp, #16]        @ width
+
+  @ loops are on blocks of 8.  the loop will stop when the
+  @ counter gets to or below 0.  starting the counter at w-8
+  @ allows for this
+  sub         r8, #8
+
+@ handle 8x8 blocks.  this should be the majority of the plane
+Lloop_8x8:
+    mov         r9, r0
+
+    vld1.8      {d0}, [r9], r1
+    vld1.8      {d1}, [r9], r1
+    vld1.8      {d2}, [r9], r1
+    vld1.8      {d3}, [r9], r1
+    vld1.8      {d4}, [r9], r1
+    vld1.8      {d5}, [r9], r1
+    vld1.8      {d6}, [r9], r1
+    vld1.8      {d7}, [r9]
+
+    vtrn.8      d1, d0
+    vtrn.8      d3, d2
+    vtrn.8      d5, d4
+    vtrn.8      d7, d6
+
+    vtrn.16     d1, d3
+    vtrn.16     d0, d2
+    vtrn.16     d5, d7
+    vtrn.16     d4, d6
+
+    vtrn.32     d1, d5
+    vtrn.32     d0, d4
+    vtrn.32     d3, d7
+    vtrn.32     d2, d6
+
+    vrev16.8    q0, q0
+    vrev16.8    q1, q1
+    vrev16.8    q2, q2
+    vrev16.8    q3, q3
+
+    mov         r9, r2
+
+    vst1.8      {d1}, [r9], r3
+    vst1.8      {d0}, [r9], r3
+    vst1.8      {d3}, [r9], r3
+    vst1.8      {d2}, [r9], r3
+    vst1.8      {d5}, [r9], r3
+    vst1.8      {d4}, [r9], r3
+    vst1.8      {d7}, [r9], r3
+    vst1.8      {d6}, [r9]
+
+    add         r0, #8            @ src += 8
+    add         r2, r3, lsl #3    @ dst += 8 * dst_stride
+    subs        r8,  #8           @ w   -= 8
+    bge         Lloop_8x8
+
+  @ add 8 back to counter.  if the result is 0 there are
+  @ no residuals.
+  adds        r8, #8
+  beq         Ldone
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp         r8, #2
+  blt         Lblock_1x8
+
+  cmp         r8, #4
+  blt         Lblock_2x8
+
+Lblock_4x8:
+  mov         r9, r0
+  vld1.32     {d0[0]}, [r9], r1
+  vld1.32     {d0[1]}, [r9], r1
+  vld1.32     {d1[0]}, [r9], r1
+  vld1.32     {d1[1]}, [r9], r1
+  vld1.32     {d2[0]}, [r9], r1
+  vld1.32     {d2[1]}, [r9], r1
+  vld1.32     {d3[0]}, [r9], r1
+  vld1.32     {d3[1]}, [r9]
+
+  mov         r9, r2
+
+  adr         r12, vtbl_4x4_transpose
+  vld1.8      {q3}, [r12]
+
+  vtbl.8      d4, {d0, d1}, d6
+  vtbl.8      d5, {d0, d1}, d7
+  vtbl.8      d0, {d2, d3}, d6
+  vtbl.8      d1, {d2, d3}, d7
+
+  @ TODO: rework shuffle above to write
+  @       out with 4 instead of 8 writes
+  vst1.32     {d4[0]}, [r9], r3
+  vst1.32     {d4[1]}, [r9], r3
+  vst1.32     {d5[0]}, [r9], r3
+  vst1.32     {d5[1]}, [r9]
+
+  add         r9, r2, #4
+  vst1.32     {d0[0]}, [r9], r3
+  vst1.32     {d0[1]}, [r9], r3
+  vst1.32     {d1[0]}, [r9], r3
+  vst1.32     {d1[1]}, [r9]
+
+  add         r0, #4            @ src += 4
+  add         r2, r3, lsl #2    @ dst += 4 * dst_stride
+  subs        r8,  #4           @ w   -= 4
+  beq         Ldone
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp         r8, #2
+  blt         Lblock_1x8
+
+Lblock_2x8:
+  mov         r9, r0
+  vld1.16     {d0[0]}, [r9], r1
+  vld1.16     {d1[0]}, [r9], r1
+  vld1.16     {d0[1]}, [r9], r1
+  vld1.16     {d1[1]}, [r9], r1
+  vld1.16     {d0[2]}, [r9], r1
+  vld1.16     {d1[2]}, [r9], r1
+  vld1.16     {d0[3]}, [r9], r1
+  vld1.16     {d1[3]}, [r9]
+
+  vtrn.8      d0, d1
+
+  mov         r9, r2
+
+  vst1.64     {d0}, [r9], r3
+  vst1.64     {d1}, [r9]
+
+  add         r0, #2            @ src += 2
+  add         r2, r3, lsl #1    @ dst += 2 * dst_stride
+  subs        r8,  #2           @ w   -= 2
+  beq         Ldone
+
+Lblock_1x8:
+  vld1.8      {d0[0]}, [r0], r1
+  vld1.8      {d0[1]}, [r0], r1
+  vld1.8      {d0[2]}, [r0], r1
+  vld1.8      {d0[3]}, [r0], r1
+  vld1.8      {d0[4]}, [r0], r1
+  vld1.8      {d0[5]}, [r0], r1
+  vld1.8      {d0[6]}, [r0], r1
+  vld1.8      {d0[7]}, [r0]
+
+  vst1.64     {d0}, [r2]
+
+Ldone:
+
+  pop         {r4,r8,r9,pc}
+
+vtbl_4x4_transpose:
+  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+@ void SaveRegisters_NEON (unsigned long long *store)
+@ r0 unsigned long long *store
+SaveRegisters_NEON:
+  vst1.i64    {d8, d9, d10, d11}, [r0]!
+  vst1.i64    {d12, d13, d14, d15}, [r0]!
+  bx          lr
+
+@ void RestoreRegisters_NEON (unsigned long long store)
+@ r0 unsigned long long store
+RestoreRegisters_NEON:
+  vld1.i64    {d8, d9, d10, d11}, [r0]!
+  vld1.i64    {d12, d13, d14, d15}, [r0]!
+  bx          lr
+
+@ void ReverseLineUV_NEON (const uint8* src,
+@                          uint8* dst_a,
+@                          uint8* dst_b,
+@                          int width)
+@ r0 const uint8* src
+@ r1 uint8* dst_a
+@ r2 uint8* dst_b
+@ r3 width
+ReverseLineUV_NEON:
+
+  @ compute where to start writing destination
+  add         r1, r1, r3      @ dst_a + width
+  add         r2, r2, r3      @ dst_b + width
+
+  @ work on input segments that are multiples of 16, but the
+  @ width that has been passed counts output elements, which is
+  @ half the size of the input.
+  lsrs        r12, r3, #3
+
+  beq         Lline_residuals_di
+
+  @ the output is written into two blocks.
+  mov         r12, #-8
+
+  @ back off the destination by the size of the register that is
+  @ going to be reversed
+  sub         r1, r1, #8
+  sub         r2, r2, #8
+
+  @ the loop needs to run on blocks of 8.  what will be left
+  @ over is either a negative number, the number of residuals
+  @ that still need to be done, or 0.  if this isn't subtracted
+  @ off here the loop will run one extra time.
+  sub         r3, r3, #8
+
+Lsegments_of_8_di:
+    vld2.8      {d0, d1}, [r0]!         @ src += 16
+
+    @ reverse the bytes in the 64 bit segments
+    vrev64.8    q0, q0
+
+    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
+    vst1.8      {d1}, [r2], r12         @ dst_b -= 8
+
+    subs        r3, r3, #8
+    bge         Lsegments_of_8_di
+
+  @ add 8 back to the counter.  if the result is 0 there are no
+  @ residuals, so return
+  adds        r3, r3, #8
+  bxeq        lr
+
+  add         r1, r1, #8
+  add         r2, r2, #8
+
+Lline_residuals_di:
+
+  mov         r12, #-1
+
+  sub         r1, r1, #1
+  sub         r2, r2, #1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_1:
+    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
+
+    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
+    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1
+
+    subs        r3, r3, #1
+    bgt         Lsegments_of_1
+
+  bx          lr
+
+@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
+@                           uint8* dst_a, int dst_stride_a,
+@                           uint8* dst_b, int dst_stride_b,
+@                           int width)
+@ r0 const uint8* src
+@ r1 int src_stride
+@ r2 uint8* dst_a
+@ r3 int dst_stride_a
+@ stack uint8* dst_b
+@ stack int dst_stride_b
+@ stack int width
+TransposeUVWx8_NEON:
+  push        {r4-r9,lr}
+
+  ldr         r4, [sp, #28]         @ dst_b
+  ldr         r5, [sp, #32]         @ dst_stride_b
+  ldr         r8, [sp, #36]         @ width
+  @ loops are on blocks of 8.  loop will stop when
+  @ counter gets to or below 0.  starting the counter
+  @ at w-8 allows for this
+  sub         r8, #8
+
+@ handle 8x8 blocks.  this should be the majority of the plane
+Lloop_8x8_di:
+    mov         r9, r0
+
+    vld2.8      {d0,  d1},  [r9], r1
+    vld2.8      {d2,  d3},  [r9], r1
+    vld2.8      {d4,  d5},  [r9], r1
+    vld2.8      {d6,  d7},  [r9], r1
+    vld2.8      {d8,  d9},  [r9], r1
+    vld2.8      {d10, d11}, [r9], r1
+    vld2.8      {d12, d13}, [r9], r1
+    vld2.8      {d14, d15}, [r9]
+
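+    @ vld2 above deinterleaves each row: U bytes land in the even d
+    @ registers and V bytes in the odd ones.  The vtrn/vrev16 sequence
+    @ below then transposes both 8x8 planes at once before they are
+    @ written out separately to dst_a and dst_b.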
+    vtrn.8      q1, q0
+    vtrn.8      q3, q2
+    vtrn.8      q5, q4
+    vtrn.8      q7, q6
+
+    vtrn.16     q1, q3
+    vtrn.16     q0, q2
+    vtrn.16     q5, q7
+    vtrn.16     q4, q6
+
+    vtrn.32     q1, q5
+    vtrn.32     q0, q4
+    vtrn.32     q3, q7
+    vtrn.32     q2, q6
+
+    vrev16.8    q0, q0
+    vrev16.8    q1, q1
+    vrev16.8    q2, q2
+    vrev16.8    q3, q3
+    vrev16.8    q4, q4
+    vrev16.8    q5, q5
+    vrev16.8    q6, q6
+    vrev16.8    q7, q7
+
+    mov         r9, r2
+
+    vst1.8      {d2},  [r9], r3
+    vst1.8      {d0},  [r9], r3
+    vst1.8      {d6},  [r9], r3
+    vst1.8      {d4},  [r9], r3
+    vst1.8      {d10}, [r9], r3
+    vst1.8      {d8},  [r9], r3
+    vst1.8      {d14}, [r9], r3
+    vst1.8      {d12}, [r9]
+
+    mov         r9, r4
+
+    vst1.8      {d3},  [r9], r5
+    vst1.8      {d1},  [r9], r5
+    vst1.8      {d7},  [r9], r5
+    vst1.8      {d5},  [r9], r5
+    vst1.8      {d11}, [r9], r5
+    vst1.8      {d9},  [r9], r5
+    vst1.8      {d15}, [r9], r5
+    vst1.8      {d13}, [r9]
+
+    add         r0, #8*2          @ src   += 8*2
+    add         r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
+    add         r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
+    subs        r8,  #8           @ w     -= 8
+    bge         Lloop_8x8_di
+
+  @ add 8 back to counter.  if the result is 0 there are
+  @ no residuals.
+  adds        r8, #8
+  beq         Ldone_di
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp         r8, #2
+  blt         Lblock_1x8_di
+
+  cmp         r8, #4
+  blt         Lblock_2x8_di
+
+@ TODO(frkoenig) : clean this up
+Lblock_4x8_di:
+  mov         r9, r0
+  vld1.64     {d0}, [r9], r1
+  vld1.64     {d1}, [r9], r1
+  vld1.64     {d2}, [r9], r1
+  vld1.64     {d3}, [r9], r1
+  vld1.64     {d4}, [r9], r1
+  vld1.64     {d5}, [r9], r1
+  vld1.64     {d6}, [r9], r1
+  vld1.64     {d7}, [r9]
+
+  adr         r12, vtbl_4x4_transpose_di
+  vld1.8      {q7}, [r12]
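+  @ q7 (d14/d15) holds a zip pattern (0,8,1,9,...) that interleaves the
+  @ bytes of its two table registers; together with the vtrn.8 below it
+  @ completes the transpose of the deinterleaved 4x8 blocks.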
+
+  vtrn.8      q0, q1
+  vtrn.8      q2, q3
+
+  vtbl.8      d8,  {d0, d1}, d14
+  vtbl.8      d9,  {d0, d1}, d15
+  vtbl.8      d10, {d2, d3}, d14
+  vtbl.8      d11, {d2, d3}, d15
+  vtbl.8      d12, {d4, d5}, d14
+  vtbl.8      d13, {d4, d5}, d15
+  vtbl.8      d0,  {d6, d7}, d14
+  vtbl.8      d1,  {d6, d7}, d15
+
+  mov         r9, r2
+
+  vst1.32     {d8[0]},  [r9], r3
+  vst1.32     {d8[1]},  [r9], r3
+  vst1.32     {d9[0]},  [r9], r3
+  vst1.32     {d9[1]},  [r9], r3
+
+  add         r9, r2, #4
+  vst1.32     {d12[0]}, [r9], r3
+  vst1.32     {d12[1]}, [r9], r3
+  vst1.32     {d13[0]}, [r9], r3
+  vst1.32     {d13[1]}, [r9]
+
+  mov         r9, r4
+
+  vst1.32     {d10[0]}, [r9], r5
+  vst1.32     {d10[1]}, [r9], r5
+  vst1.32     {d11[0]}, [r9], r5
+  vst1.32     {d11[1]}, [r9], r5
+
+  add         r9, r4, #4
+  vst1.32     {d0[0]},  [r9], r5
+  vst1.32     {d0[1]},  [r9], r5
+  vst1.32     {d1[0]},  [r9], r5
+  vst1.32     {d1[1]},  [r9]
+
+  add         r0, #4*2          @ src   += 4 * 2
+  add         r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
+  add         r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
+  subs        r8,  #4           @ w     -= 4
+  beq         Ldone_di
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp         r8, #2
+  blt         Lblock_1x8_di
+
+Lblock_2x8_di:
+  mov         r9, r0
+  vld2.16     {d0[0], d2[0]}, [r9], r1
+  vld2.16     {d1[0], d3[0]}, [r9], r1
+  vld2.16     {d0[1], d2[1]}, [r9], r1
+  vld2.16     {d1[1], d3[1]}, [r9], r1
+  vld2.16     {d0[2], d2[2]}, [r9], r1
+  vld2.16     {d1[2], d3[2]}, [r9], r1
+  vld2.16     {d0[3], d2[3]}, [r9], r1
+  vld2.16     {d1[3], d3[3]}, [r9]
+
+  vtrn.8      d0, d1
+  vtrn.8      d2, d3
+
+  mov         r9, r2
+
+  vst1.64     {d0}, [r9], r3
+  vst1.64     {d2}, [r9]
+
+  mov         r9, r4
+
+  vst1.64     {d1}, [r9], r5
+  vst1.64     {d3}, [r9]
+
+  add         r0, #2*2          @ src   += 2 * 2
+  add         r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
+  add         r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
+  subs        r8,  #2           @ w     -= 2
+  beq         Ldone_di
+
+Lblock_1x8_di:
+  vld2.8      {d0[0], d1[0]}, [r0], r1
+  vld2.8      {d0[1], d1[1]}, [r0], r1
+  vld2.8      {d0[2], d1[2]}, [r0], r1
+  vld2.8      {d0[3], d1[3]}, [r0], r1
+  vld2.8      {d0[4], d1[4]}, [r0], r1
+  vld2.8      {d0[5], d1[5]}, [r0], r1
+  vld2.8      {d0[6], d1[6]}, [r0], r1
+  vld2.8      {d0[7], d1[7]}, [r0]
+
+  vst1.64     {d0}, [r2]
+  vst1.64     {d1}, [r4]
+
+Ldone_di:
+  pop         {r4-r9, pc}
+
+vtbl_4x4_transpose_di:
+  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
diff --git a/files/source/rotate_priv.h b/files/source/rotate_priv.h
new file mode 100644
index 0000000..b4df149
--- /dev/null
+++ b/files/source/rotate_priv.h
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef SOURCE_ROTATE_PRIV_H_
+#define SOURCE_ROTATE_PRIV_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Rotate planes by 90, 180, 270
+void
+RotatePlane90(const uint8* src, int src_stride,
+              uint8* dst, int dst_stride,
+              int width, int height);
+
+void
+RotatePlane180(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
+               int width, int height);
+
+void
+RotatePlane270(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
+               int width, int height);
+
+void
+RotateUV90(const uint8* src, int src_stride,
+           uint8* dst_a, int dst_stride_a,
+           uint8* dst_b, int dst_stride_b,
+           int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them.
+void
+RotateUV180(const uint8* src, int src_stride,
+            uint8* dst_a, int dst_stride_a,
+            uint8* dst_b, int dst_stride_b,
+            int width, int height);
+
+void
+RotateUV270(const uint8* src, int src_stride,
+            uint8* dst_a, int dst_stride_a,
+            uint8* dst_b, int dst_stride_b,
+            int width, int height);
+
+// The 90 and 270 degree functions are based on transposes.
+// Doing a transpose while reversing the read/write order
+// results in a rotation by +/- 90 degrees (a sketch follows
+// the declarations below).
+void
+TransposePlane(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
+               int width, int height);
+
+void
+TransposeUV(const uint8* src, int src_stride,
+            uint8* dst_a, int dst_stride_a,
+            uint8* dst_b, int dst_stride_b,
+            int width, int height);
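+
+// Illustrative sketch only (not part of this header's API): one way to
+// express the relationship described above, assuming TransposePlane reads
+// the source rows top to bottom, is to feed it the source upside down via
+// a negative stride, which yields a 90 degree rotation:
+//
+//   void RotatePlane90Sketch(const uint8* src, int src_stride,
+//                            uint8* dst, int dst_stride,
+//                            int width, int height) {
+//     src += src_stride * (height - 1);   // start at the last row
+//     TransposePlane(src, -src_stride,    // walk the rows upwards
+//                    dst, dst_stride, width, height);
+//   }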
+
+}  // namespace libyuv
+
+#endif  // SOURCE_ROTATE_PRIV_H_
diff --git a/files/source/row.h b/files/source/row.h
new file mode 100644
index 0000000..85343c5
--- /dev/null
+++ b/files/source/row.h
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_ROW_H_
+#define LIBYUV_SOURCE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+// The following are available on all x86 platforms
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BG24TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOUVROW_SSSE3
+#define HAS_RAWTOUVROW_SSSE3
+#endif
+
+// The following are available only on Windows
+#if defined(WIN32) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
+#endif
+
+extern "C" {
+#ifdef HAS_ARGBTOYROW_SSSE3
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+#endif
+#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
+#define HASRGB24TOYROW_SSSE3
+#endif
+#ifdef HASRGB24TOYROW_SSSE3
+void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#endif
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                  uint8* dst_u, uint8* dst_v, int width);
+
+#ifdef HAS_BG24TOARGBROW_SSSE3
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
+#endif
+void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#endif
+
+#ifdef OSX
+extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
+extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
+extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
+#else
+extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
+extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
+extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
+#endif
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+
+void FastConvertYUVToBGRARow(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
+
+void FastConvertYUVToABGRRow(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
+
+void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+
+void FastConvertYToRGB32Row(const uint8* y_buf,
+                            uint8* rgb_buf,
+                            int width);
+
+// To force the C versions, uncomment the defines below:
+//#define USE_MMX 0
+//#define USE_SSE2 0
+
+#if !defined(USE_MMX)
+// Windows, Mac and Linux use MMX
+#if defined(__i386__) || defined(_MSC_VER)
+#define USE_MMX 1
+#else
+#define USE_MMX 0
+#endif
+#endif
+
+#if !defined(USE_SSE2)
+#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
+#define USE_SSE2 1
+#else
+#define USE_SSE2 0
+#endif
+#endif
+
+// x64 uses MMX2 (SSE) so emms is not required.
+// Warning C4799: function has no EMMS instruction.
+// EMMS() is slow and should be called by the calling function once per image.
+#if USE_MMX && !defined(ARCH_CPU_X86_64)
+#if defined(_MSC_VER)
+#define EMMS() __asm emms
+#pragma warning(disable: 4799)
+#else
+#define EMMS() asm("emms")
+#endif
+#else
+#define EMMS()
+#endif
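+
+// Usage sketch (illustrative only; the plane pointers, strides and height
+// below are assumed to be supplied by the caller): drive the row function
+// once per output row of an even-width I420 image, then flush MMX state a
+// single time.
+//
+//   for (int row = 0; row < height; ++row) {
+//     FastConvertYUVToRGB32Row(y_plane + row * y_stride,
+//                              u_plane + (row >> 1) * u_stride,
+//                              v_plane + (row >> 1) * v_stride,
+//                              argb + row * argb_stride, width);
+//   }
+//   EMMS();  // once per image, not once per row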
+
+
+}  // extern "C"
+
+#endif  // LIBYUV_SOURCE_ROW_H_
diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc
new file mode 100644
index 0000000..88ce475
--- /dev/null
+++ b/files/source/row_posix.cc
@@ -0,0 +1,659 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "row.h"
+
+extern "C" {
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constant multiplication table for converting ARGB to I400.
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
+  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+};
+
+extern "C" TALIGN16(const uint8, kAdd16[16]) = {
+  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+};
+
+// Shuffle table for converting BG24 to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile(
+  "movdqa     (%3),%%xmm7\n"
+  "movdqa     (%4),%%xmm6\n"
+  "movdqa     %%xmm6,%%xmm5\n"
+  "psllw      $0x4,%%xmm5\n"  // Generate a mask of 0x10 on each byte.
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "pmaddubsw  %%xmm7,%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "psrlw      $0x7,%%xmm0\n"
+  "pmaddubsw  %%xmm7,%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "psrlw      $0x7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "pmaddubsw  %%xmm6,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "paddb      %%xmm5,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "sub        $0x8,%2\n"
+  "ja         1b\n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_y),      // %1
+    "+r"(pix)         // %2
+  : "r"(kMultiplyMaskARGBToI400),    // %3
+    "r"(kAdd16)   // %4
+  : "memory"
+);
+}
+#endif
+
+#ifdef  HAS_BG24TOARGBROW_SSSE3
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
+  "pslld      $0x18,%%xmm7\n"
+  "movdqa     (%3),%%xmm6\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     0x20(%0),%%xmm3\n"
+  "lea        0x30(%0),%0\n"
+  "movdqa     %%xmm3,%%xmm2\n"
+  "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "pshufb     %%xmm6,%%xmm2\n"
+  "por        %%xmm7,%%xmm2\n"
+  "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "pshufb     %%xmm6,%%xmm0\n"
+  "movdqa     %%xmm2,0x20(%1)\n"
+  "por        %%xmm7,%%xmm0\n"
+  "pshufb     %%xmm6,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "por        %%xmm7,%%xmm1\n"
+  "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
+  "pshufb     %%xmm6,%%xmm3\n"
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "por        %%xmm7,%%xmm3\n"
+  "movdqa     %%xmm3,0x30(%1)\n"
+  "lea        0x40(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_bg24),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(kShuffleMaskBG24ToARGB)  // %3
+  : "memory"
+);
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
+  "pslld      $0x18,%%xmm7\n"
+  "movdqa     (%3),%%xmm6\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     0x20(%0),%%xmm3\n"
+  "lea        0x30(%0),%0\n"
+  "movdqa     %%xmm3,%%xmm2\n"
+  "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "pshufb     %%xmm6,%%xmm2\n"
+  "por        %%xmm7,%%xmm2\n"
+  "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "pshufb     %%xmm6,%%xmm0\n"
+  "movdqa     %%xmm2,0x20(%1)\n"
+  "por        %%xmm7,%%xmm0\n"
+  "pshufb     %%xmm6,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "por        %%xmm7,%%xmm1\n"
+  "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
+  "pshufb     %%xmm6,%%xmm3\n"
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "por        %%xmm7,%%xmm3\n"
+  "movdqa     %%xmm3,0x30(%1)\n"
+  "lea        0x40(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(kShuffleMaskRAWToARGB)  // %3
+  : "memory"
+);
+}
+#endif
+
+#if defined(__x86_64__)
+
+// 64-bit Linux gcc version
+
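+// The lookups below index _kCoefficientsRgbY directly: each entry is four
+// int16 values (8 bytes), Y entries start at byte offset 0, U entries at
+// 2048 (256 * 8) and V entries at 4096.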
+void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
+                              const uint8* u_buf,  // rsi
+                              const uint8* v_buf,  // rdx
+                              uint8* rgb_buf,      // rcx
+                              int width) {         // r8
+  asm volatile(
+"1:"
+  "movzb  (%1),%%r10\n"
+  "lea    1(%1),%1\n"
+  "movzb  (%2),%%r11\n"
+  "lea    1(%2),%2\n"
+  "movq   2048(%5,%%r10,8),%%xmm0\n"
+  "movzb  (%0),%%r10\n"
+  "movq   4096(%5,%%r11,8),%%xmm1\n"
+  "movzb  0x1(%0),%%r11\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "movq   (%5,%%r10,8),%%xmm2\n"
+  "lea    2(%0),%0\n"
+  "movq   (%5,%%r11,8),%%xmm3\n"
+  "paddsw %%xmm0,%%xmm2\n"
+  "paddsw %%xmm0,%%xmm3\n"
+  "shufps $0x44,%%xmm3,%%xmm2\n"
+  "psraw  $0x6,%%xmm2\n"
+  "packuswb %%xmm2,%%xmm2\n"
+  "movq   %%xmm2,0x0(%3)\n"
+  "lea    8(%3),%3\n"
+  "sub    $0x2,%4\n"
+  "ja     1b\n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+r"(width)     // %4
+  : "r" (_kCoefficientsRgbY)  // %5
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+void FastConvertYUVToBGRARow(const uint8* y_buf,  // rdi
+                             const uint8* u_buf,  // rsi
+                             const uint8* v_buf,  // rdx
+                             uint8* rgb_buf,      // rcx
+                             int width) {         // r8
+  asm volatile(
+"1:"
+  "movzb  (%1),%%r10\n"
+  "lea    1(%1),%1\n"
+  "movzb  (%2),%%r11\n"
+  "lea    1(%2),%2\n"
+  "movq   2048(%5,%%r10,8),%%xmm0\n"
+  "movzb  (%0),%%r10\n"
+  "movq   4096(%5,%%r11,8),%%xmm1\n"
+  "movzb  0x1(%0),%%r11\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "movq   (%5,%%r10,8),%%xmm2\n"
+  "lea    2(%0),%0\n"
+  "movq   (%5,%%r11,8),%%xmm3\n"
+  "paddsw %%xmm0,%%xmm2\n"
+  "paddsw %%xmm0,%%xmm3\n"
+  "shufps $0x44,%%xmm3,%%xmm2\n"
+  "psraw  $0x6,%%xmm2\n"
+  "packuswb %%xmm2,%%xmm2\n"
+  "movq   %%xmm2,0x0(%3)\n"
+  "lea    8(%3),%3\n"
+  "sub    $0x2,%4\n"
+  "ja     1b\n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+r"(width)     // %4
+  : "r" (_kCoefficientsBgraY)  // %5
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+void FastConvertYUVToABGRRow(const uint8* y_buf,  // rdi
+                             const uint8* u_buf,  // rsi
+                             const uint8* v_buf,  // rdx
+                             uint8* rgb_buf,      // rcx
+                             int width) {         // r8
+  asm volatile(
+"1:"
+  "movzb  (%1),%%r10\n"
+  "lea    1(%1),%1\n"
+  "movzb  (%2),%%r11\n"
+  "lea    1(%2),%2\n"
+  "movq   2048(%5,%%r10,8),%%xmm0\n"
+  "movzb  (%0),%%r10\n"
+  "movq   4096(%5,%%r11,8),%%xmm1\n"
+  "movzb  0x1(%0),%%r11\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "movq   (%5,%%r10,8),%%xmm2\n"
+  "lea    2(%0),%0\n"
+  "movq   (%5,%%r11,8),%%xmm3\n"
+  "paddsw %%xmm0,%%xmm2\n"
+  "paddsw %%xmm0,%%xmm3\n"
+  "shufps $0x44,%%xmm3,%%xmm2\n"
+  "psraw  $0x6,%%xmm2\n"
+  "packuswb %%xmm2,%%xmm2\n"
+  "movq   %%xmm2,0x0(%3)\n"
+  "lea    8(%3),%3\n"
+  "sub    $0x2,%4\n"
+  "ja     1b\n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+r"(width)     // %4
+  : "r" (_kCoefficientsAbgrY)  // %5
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
+                                 const uint8* u_buf,  // rsi
+                                 const uint8* v_buf,  // rdx
+                                 uint8* rgb_buf,      // rcx
+                                 int width) {         // r8
+  asm volatile(
+"1:"
+  "movzb  (%1),%%r10\n"
+  "lea    1(%1),%1\n"
+  "movzb  (%2),%%r11\n"
+  "lea    1(%2),%2\n"
+  "movq   2048(%5,%%r10,8),%%xmm0\n"
+  "movzb  (%0),%%r10\n"
+  "movq   4096(%5,%%r11,8),%%xmm1\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "movq   (%5,%%r10,8),%%xmm2\n"
+  "lea    1(%0),%0\n"
+  "paddsw %%xmm0,%%xmm2\n"
+  "shufps $0x44,%%xmm2,%%xmm2\n"
+  "psraw  $0x6,%%xmm2\n"
+  "packuswb %%xmm2,%%xmm2\n"
+  "movd   %%xmm2,0x0(%3)\n"
+  "lea    4(%3),%3\n"
+  "sub    $0x1,%4\n"
+  "ja     1b\n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+r"(width)     // %4
+  : "r" (_kCoefficientsRgbY)  // %5
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
+);
+}
+
+void FastConvertYToRGB32Row(const uint8* y_buf,  // rdi
+                            uint8* rgb_buf,      // rcx
+                            int width) {         // r8
+  asm volatile(
+"1:"
+  "movzb  (%0),%%r10\n"
+  "movzb  0x1(%0),%%r11\n"
+  "movq   (%3,%%r10,8),%%xmm2\n"
+  "lea    2(%0),%0\n"
+  "movq   (%3,%%r11,8),%%xmm3\n"
+  "shufps $0x44,%%xmm3,%%xmm2\n"
+  "psraw  $0x6,%%xmm2\n"
+  "packuswb %%xmm2,%%xmm2\n"
+  "movq   %%xmm2,0x0(%1)\n"
+  "lea    8(%1),%1\n"
+  "sub    $0x2,%2\n"
+  "ja     1b\n"
+  : "+r"(y_buf),    // %0
+    "+r"(rgb_buf),  // %1
+    "+r"(width)     // %2
+  : "r" (_kCoefficientsRgbY)  // %3
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+#elif defined(__i386__)
+// 32-bit gcc version
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUVToRGB32Row\n"
+"_FastConvertYUVToRGB32Row:\n"
+#else
+  ".global FastConvertYUVToRGB32Row\n"
+"FastConvertYUVToRGB32Row:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+  "movzbl 0x1(%edx),%ebx\n"
+  "movq   _kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "lea    2(%edx),%edx\n"
+  "movq   _kCoefficientsRgbY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movntq %mm1,0x0(%ebp)\n"
+  "lea    8(%ebp),%ebp\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYUVToBGRARow(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUVToBGRARow\n"
+"_FastConvertYUVToBGRARow:\n"
+#else
+  ".global FastConvertYUVToBGRARow\n"
+"FastConvertYUVToBGRARow:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
+  "movzbl 0x1(%edx),%ebx\n"
+  "movq   _kCoefficientsBgraY(,%eax,8),%mm1\n"
+  "lea    2(%edx),%edx\n"
+  "movq   _kCoefficientsBgraY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movntq %mm1,0x0(%ebp)\n"
+  "lea    8(%ebp),%ebp\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYUVToABGRRow(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUVToABGRRow\n"
+"_FastConvertYUVToABGRRow:\n"
+#else
+  ".global FastConvertYUVToABGRRow\n"
+"FastConvertYUVToABGRRow:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
+  "movzbl 0x1(%edx),%ebx\n"
+  "movq   _kCoefficientsAbgrY(,%eax,8),%mm1\n"
+  "lea    2(%edx),%edx\n"
+  "movq   _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movntq %mm1,0x0(%ebp)\n"
+  "lea    8(%ebp),%ebp\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUV444ToRGB32Row\n"
+"_FastConvertYUV444ToRGB32Row:\n"
+#else
+  ".global FastConvertYUV444ToRGB32Row\n"
+"FastConvertYUV444ToRGB32Row:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+  "lea    1(%edx),%edx\n"
+  "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
+  "psraw  $0x6,%mm0\n"
+  "packuswb %mm0,%mm0\n"
+  "movd   %mm0,0x0(%ebp)\n"
+  "lea    4(%ebp),%ebp\n"
+  "sub    $0x1,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYToRGB32Row(const uint8* y_buf,
+                            uint8* rgb_buf,
+                            int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYToRGB32Row\n"
+"_FastConvertYToRGB32Row:\n"
+#else
+  ".global FastConvertYToRGB32Row\n"
+"FastConvertYToRGB32Row:\n"
+#endif
+  "push   %ebx\n"
+  "mov    0x8(%esp),%eax\n"
+  "mov    0xc(%esp),%edx\n"
+  "mov    0x10(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%eax),%ebx\n"
+  "movq   _kCoefficientsRgbY(,%ebx,8),%mm0\n"
+  "psraw  $0x6,%mm0\n"
+  "movzbl 0x1(%eax),%ebx\n"
+  "movq   _kCoefficientsRgbY(,%ebx,8),%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm0\n"
+  "lea    0x2(%eax),%eax\n"
+  "movq   %mm0,(%edx)\n"
+  "lea    0x8(%edx),%edx\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "pop    %ebx\n"
+  "ret\n"
+);
+
+#else
+// C reference code that mimics the YUV assembly.
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+static inline void YuvPixel(uint8 y,
+                            uint8 u,
+                            uint8 v,
+                            uint8* rgb_buf,
+                            int ashift,
+                            int rshift,
+                            int gshift,
+                            int bshift) {
+
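+  // Table layout: entries [0..255] hold the Y contribution, [256..511] the
+  // U contribution and [512..767] the V contribution, matching the offsets
+  // used by the assembly versions.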
+  int b = _kCoefficientsRgbY[256+u][0];
+  int g = _kCoefficientsRgbY[256+u][1];
+  int r = _kCoefficientsRgbY[256+u][2];
+  int a = _kCoefficientsRgbY[256+u][3];
+
+  b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
+  g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
+  r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
+  a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
+
+  b = paddsw(b, _kCoefficientsRgbY[y][0]);
+  g = paddsw(g, _kCoefficientsRgbY[y][1]);
+  r = paddsw(r, _kCoefficientsRgbY[y][2]);
+  a = paddsw(a, _kCoefficientsRgbY[y][3]);
+
+  b >>= 6;
+  g >>= 6;
+  r >>= 6;
+  a >>= 6;
+
+  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
+                                        (packuswb(g) << gshift) |
+                                        (packuswb(r) << rshift) |
+                                        (packuswb(a) << ashift);
+}
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  for (int x = 0; x < width; x += 2) {
+    uint8 u = u_buf[x >> 1];
+    uint8 v = v_buf[x >> 1];
+    uint8 y0 = y_buf[x];
+    YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
+    if ((x + 1) < width) {
+      uint8 y1 = y_buf[x + 1];
+      YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
+    }
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+void FastConvertYUVToBGRARow(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
+  for (int x = 0; x < width; x += 2) {
+    uint8 u = u_buf[x >> 1];
+    uint8 v = v_buf[x >> 1];
+    uint8 y0 = y_buf[x];
+    YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
+    if ((x + 1) < width) {
+      uint8 y1 = y_buf[x + 1];
+      YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
+    }
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+void FastConvertYUVToABGRRow(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
+  for (int x = 0; x < width; x += 2) {
+    uint8 u = u_buf[x >> 1];
+    uint8 v = v_buf[x >> 1];
+    uint8 y0 = y_buf[x];
+    YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
+    if ((x + 1) < width) {
+      uint8 y1 = y_buf[x + 1];
+      YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
+    }
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width) {
+  for (int x = 0; x < width; ++x) {
+    uint8 u = u_buf[x];
+    uint8 v = v_buf[x];
+    uint8 y = y_buf[x];
+    YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+
+void FastConvertYToRGB32Row(const uint8* y_buf,
+                            uint8* rgb_buf,
+                            int width) {
+  for (int x = 0; x < width; ++x) {
+    uint8 y = y_buf[x];
+    YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+
+#endif
+
+}  // extern "C"
diff --git a/files/source/row_table.cc b/files/source/row_table.cc
new file mode 100644
index 0000000..022d9f8
--- /dev/null
+++ b/files/source/row_table.cc
@@ -0,0 +1,469 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "row.h"
+
+#define kMaxStride (2048 * 4)
+
+extern "C" {
+
+#define MAKETABLE(NAME) \
+SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
+  RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), \
+  RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), \
+  RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), \
+  RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), \
+  RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), \
+  RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), \
+  RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), \
+  RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), \
+  RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), \
+  RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), \
+  RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), \
+  RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), \
+  RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), \
+  RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), \
+  RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), \
+  RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), \
+  RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), \
+  RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), \
+  RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), \
+  RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), \
+  RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), \
+  RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), \
+  RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), \
+  RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), \
+  RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), \
+  RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), \
+  RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), \
+  RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), \
+  RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), \
+  RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), \
+  RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), \
+  RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), \
+  RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), \
+  RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), \
+  RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), \
+  RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), \
+  RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), \
+  RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), \
+  RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), \
+  RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), \
+  RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), \
+  RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), \
+  RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), \
+  RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), \
+  RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), \
+  RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), \
+  RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), \
+  RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), \
+  RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), \
+  RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), \
+  RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), \
+  RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), \
+  RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), \
+  RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), \
+  RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), \
+  RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), \
+  RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), \
+  RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), \
+  RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), \
+  RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), \
+  RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), \
+  RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), \
+  RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), \
+  RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), \
+  RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), \
+  RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), \
+  RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), \
+  RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), \
+  RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), \
+  RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), \
+  RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), \
+  RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), \
+  RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), \
+  RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), \
+  RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), \
+  RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), \
+  RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), \
+  RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), \
+  RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), \
+  RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), \
+  RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), \
+  RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), \
+  RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), \
+  RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), \
+  RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), \
+  RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), \
+  RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), \
+  RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), \
+  RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), \
+  RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), \
+  RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), \
+  RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), \
+  RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), \
+  RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), \
+  RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), \
+  RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), \
+  RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), \
+  RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), \
+  RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), \
+  RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), \
+  RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), \
+  RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), \
+  RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), \
+  RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), \
+  RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), \
+  RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), \
+  RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), \
+  RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), \
+  RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), \
+  RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), \
+  RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), \
+  RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), \
+  RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), \
+  RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), \
+  RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), \
+  RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), \
+  RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), \
+  RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), \
+  RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), \
+  RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), \
+  RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), \
+  RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), \
+  RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), \
+  RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), \
+  RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), \
+  RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), \
+  RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), \
+  RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), \
+  RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), \
+  RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), \
+  RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), \
+  RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), \
+  RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), \
+  RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), \
+  RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), \
+  RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), \
+  RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), \
+  RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), \
+  RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), \
+  RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), \
+  RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), \
+  RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), \
+  RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), \
+  RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), \
+  RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), \
+  RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), \
+  RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), \
+  RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), \
+  RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), \
+  RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), \
+  RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), \
+  RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), \
+  RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), \
+  RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), \
+  RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), \
+  RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), \
+  RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), \
+  RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), \
+  RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), \
+  RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), \
+  RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), \
+  RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), \
+  RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), \
+  RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), \
+  RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), \
+  RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), \
+  RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), \
+  RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), \
+  RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), \
+  RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), \
+  RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), \
+  RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), \
+  RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), \
+  RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), \
+  RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), \
+  RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), \
+  RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), \
+  RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), \
+  RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), \
+  RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), \
+  RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), \
+  RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), \
+  RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), \
+  RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), \
+  RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), \
+  RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), \
+  RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), \
+  RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), \
+  RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), \
+  RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), \
+  RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), \
+  RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
+};
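+
+// Each generated table has 256 * 3 rows of four int16s: rows [0..255] are
+// the Y contributions, [256..511] the U contributions and [512..767] the V
+// contributions, i.e. byte offsets 0, 2048 and 4096 as indexed by the row
+// assembly and by YuvPixel() in row_posix.cc.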
+
+// ARGB table
+#define RGBY(i) { \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(256 * 64 - 1) \
+}
+
+#define RGBU(i) { \
+  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+  0, \
+  0 \
+}
+
+#define RGBV(i) { \
+  0, \
+  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+  0 \
+}
+
+#ifdef OSX
+MAKETABLE(kCoefficientsRgbY)
+#else
+MAKETABLE(_kCoefficientsRgbY)
+#endif
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+// BGRA table
+#define RGBY(i) { \
+  static_cast<int16>(256 * 64 - 1), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
+}
+
+#define RGBU(i) { \
+  0, \
+  0, \
+  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
+}
+
+#define RGBV(i) { \
+  0, \
+  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+  0 \
+}
+
+#ifdef OSX
+MAKETABLE(kCoefficientsBgraY)
+#else
+MAKETABLE(_kCoefficientsBgraY)
+#endif
+
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+// ABGR table
+#define RGBY(i) { \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(256 * 64 - 1) \
+}
+
+#define RGBU(i) { \
+  0, \
+  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+  0 \
+}
+
+#define RGBV(i) { \
+  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+  0, \
+  0 \
+}
+
+#ifdef OSX
+MAKETABLE(kCoefficientsAbgrY)
+#else
+MAKETABLE(_kCoefficientsAbgrY)
+#endif
+
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    uint8 r = src_raw[0];
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_raw += 3;
+  }
+}
+
+void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    uint8 b = src_bg24[0];
+    uint8 g = src_bg24[1];
+    uint8 r = src_bg24[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_bg24 += 3;
+  }
+}
+
+// The C versions convert to ARGB first, then reuse the ARGB row functions.
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  BG24ToARGBRow_C(src_argb, row, pix);
+  ARGBToYRow_C(row, dst_y, pix);
+}
+
+void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  RAWToARGBRow_C(src_argb, row, pix);
+  ARGBToYRow_C(row, dst_y, pix);
+}
+
+void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  BG24ToARGBRow_C(src_argb, row, pix);
+  BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  RAWToARGBRow_C(src_argb, row, pix);
+  RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
+}
+
+static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
+}
+static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
+}
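+
+// Example: RGBToY(255, 255, 255) = (((66 + 129 + 25) * 255 + 128) >> 8) + 16
+// = 235 and RGBToY(0, 0, 0) = 16, i.e. studio-swing (16..235) BT.601 luma.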
+
+#define MAKEROWY(NAME,R,G,B) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
+  for (int x = 0; x < width; ++x) {                                            \
+    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
+    src_argb0 += 4;                                                            \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
+                       uint8* dst_u, uint8* dst_v, int width) {                \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
+  for (int x = 0; x < width - 1; x += 2) {                                     \
+    uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] +                                \
+               src_rgb1[B] + src_rgb1[B + 4]) >> 2;                            \
+    uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] +                                \
+               src_rgb1[G] + src_rgb1[G + 4]) >> 2;                            \
+    uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] +                                \
+               src_rgb1[R] + src_rgb1[R + 4]) >> 2;                            \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+    src_rgb0 += 8;                                                             \
+    src_rgb1 += 8;                                                             \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {                                                             \
+    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
+    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
+    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+  }                                                                            \
+}
+
+MAKEROWY(ARGB,2,1,0)
+MAKEROWY(BGRA,1,2,3)
+MAKEROWY(ABGR,0,1,2)
+
+#if defined(HAS_RAWTOYROW_SSSE3)
+
+void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  BG24ToARGBRow_SSSE3(src_argb, row, pix);
+  ARGBToYRow_SSSE3(row, dst_y, pix);
+}
+
+void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  RAWToARGBRow_SSSE3(src_argb, row, pix);
+  ARGBToYRow_SSSE3(row, dst_y, pix);
+}
+
+#endif
+
+#if defined(HAS_RAWTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  BG24ToARGBRow_SSSE3(src_argb, row, pix);
+  BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  RAWToARGBRow_SSSE3(src_argb, row, pix);
+  RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+#else
+
+void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  BG24ToARGBRow_SSSE3(src_argb, row, pix);
+  BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  RAWToARGBRow_SSSE3(src_argb, row, pix);
+  RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+#endif
+#endif
+
+}  // extern "C"
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
new file mode 100644
index 0000000..2bc5fb1
--- /dev/null
+++ b/files/source/row_win.cc
@@ -0,0 +1,636 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "row.h"
+
+extern "C" {
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+
+// Constant multiplication table for converting ARGB to I400.
+extern "C" TALIGN16(const int8, kARGBToY[16]) = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+extern "C" TALIGN16(const int8, kARGBToU[16]) = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+extern "C" TALIGN16(const int8, kARGBToV[16]) = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+// Constants for BGRA
+extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+extern "C" TALIGN16(const int8, kABGRToY[16]) = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+extern "C" TALIGN16(const int8, kABGRToU[16]) = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+extern "C" TALIGN16(const int8, kABGRToV[16]) = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+extern "C" TALIGN16(const uint8, kAddY16[16]) = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+};
+
+extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
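+// In scalar terms the tables above compute, per pixel (ARGB is stored
+// B,G,R,A in memory; the BGRA/ABGR tables are the same weights reordered):
+//   Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16
+//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
+//   V = ((112 * R - 94 * G - 18 * B) >> 8) + 128
+// pmaddubsw/phaddw form the weighted sums, the shifts scale them back down,
+// and kAddY16/kAddUV128 add the BT.601 offsets.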
+
+// Shuffle table for converting BG24 to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values
+__declspec(naked)
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm7, _kARGBToY
+    movdqa     xmm6, _kAddY16
+
+ convertloop :
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm6
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         convertloop
+    ret
+  }
+}
+
+__declspec(naked)
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm7, _kBGRAToY
+    movdqa     xmm6, _kAddY16
+
+ convertloop :
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm6
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         convertloop
+    ret
+  }
+}
+
+__declspec(naked)
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm7, _kABGRToY
+    movdqa     xmm6, _kAddY16
+
+ convertloop :
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm6
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         convertloop
+    ret
+  }
+}
+
+__declspec(naked)
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, _kARGBToU
+    movdqa     xmm6, _kARGBToV
+    movdqa     xmm5, _kAddUV128
+    sub        edi, edx             // stride from u to v
+
+ convertloop :
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    ja         convertloop
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, _kBGRAToU
+    movdqa     xmm6, _kBGRAToV
+    movdqa     xmm5, _kAddUV128
+    sub        edi, edx             // stride from u to v
+
+ convertloop :
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    ja         convertloop
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, _kABGRToU
+    movdqa     xmm6, _kABGRToV
+    movdqa     xmm5, _kAddUV128
+    sub        edi, edx             // stride from u to v
+
+ convertloop :
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    ja         convertloop
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_bg24
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
+    pslld     xmm7, 24
+    movdqa    xmm6, _kShuffleMaskBG24ToARGB
+
+ convertloop :
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    movdqa    xmm3, [eax + 32]
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
+    pshufb    xmm2, xmm6
+    por       xmm2, xmm7
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
+    pshufb    xmm0, xmm6
+    movdqa    [edx + 32], xmm2
+    por       xmm0, xmm7
+    pshufb    xmm1, xmm6
+    movdqa    [edx], xmm0
+    por       xmm1, xmm7
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm6
+    movdqa    [edx + 16], xmm1
+    por       xmm3, xmm7
+    movdqa    [edx + 48], xmm3
+    lea       edx, [edx + 64]
+    sub       ecx, 16
+    ja        convertloop
+    ret
+  }
+}
+
+__declspec(naked)
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                        int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
+    pslld     xmm7, 24
+    movdqa    xmm6, _kShuffleMaskRAWToARGB
+
+ convertloop :
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    movdqa    xmm3, [eax + 32]
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
+    pshufb    xmm2, xmm6
+    por       xmm2, xmm7
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
+    pshufb    xmm0, xmm6
+    movdqa    [edx + 32], xmm2
+    por       xmm0, xmm7
+    pshufb    xmm1, xmm6
+    movdqa    [edx], xmm0
+    por       xmm1, xmm7
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm6
+    movdqa    [edx + 16], xmm1
+    por       xmm3, xmm7
+    movdqa    [edx + 48], xmm3
+    lea       edx, [edx + 64]
+    sub       ecx, 16
+    ja        convertloop
+    ret
+  }
+}
+
+__declspec(naked)
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm {
+    pushad
+    mov       edx, [esp + 32 + 4]
+    mov       edi, [esp + 32 + 8]
+    mov       esi, [esp + 32 + 12]
+    mov       ebp, [esp + 32 + 16]
+    mov       ecx, [esp + 32 + 20]
+
+ convertloop :
+    movzx     eax, byte ptr [edi]
+    lea       edi, [edi + 1]
+    movzx     ebx, byte ptr [esi]
+    lea       esi, [esi + 1]
+    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
+    movzx     ebx, byte ptr [edx + 1]
+    movq      mm1, [_kCoefficientsRgbY + 8 * eax]
+    lea       edx, [edx + 2]
+    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]
+    paddsw    mm1, mm0
+    paddsw    mm2, mm0
+    psraw     mm1, 6
+    psraw     mm2, 6
+    packuswb  mm1, mm2
+    movntq    [ebp], mm1
+    lea       ebp, [ebp + 8]
+    sub       ecx, 2
+    ja        convertloop
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)
+void FastConvertYUVToBGRARow(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
+  __asm {
+    pushad
+    mov       edx, [esp + 32 + 4]
+    mov       edi, [esp + 32 + 8]
+    mov       esi, [esp + 32 + 12]
+    mov       ebp, [esp + 32 + 16]
+    mov       ecx, [esp + 32 + 20]
+
+ convertloop :
+    movzx     eax, byte ptr [edi]
+    lea       edi, [edi + 1]
+    movzx     ebx, byte ptr [esi]
+    lea       esi, [esi + 1]
+    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
+    movzx     ebx, byte ptr [edx + 1]
+    movq      mm1, [_kCoefficientsBgraY + 8 * eax]
+    lea       edx, [edx + 2]
+    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]
+    paddsw    mm1, mm0
+    paddsw    mm2, mm0
+    psraw     mm1, 6
+    psraw     mm2, 6
+    packuswb  mm1, mm2
+    movntq    [ebp], mm1
+    lea       ebp, [ebp + 8]
+    sub       ecx, 2
+    ja        convertloop
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)
+void FastConvertYUVToABGRRow(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
+  __asm {
+    pushad
+    mov       edx, [esp + 32 + 4]
+    mov       edi, [esp + 32 + 8]
+    mov       esi, [esp + 32 + 12]
+    mov       ebp, [esp + 32 + 16]
+    mov       ecx, [esp + 32 + 20]
+
+ convertloop :
+    movzx     eax, byte ptr [edi]
+    lea       edi, [edi + 1]
+    movzx     ebx, byte ptr [esi]
+    lea       esi, [esi + 1]
+    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
+    movzx     ebx, byte ptr [edx + 1]
+    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]
+    lea       edx, [edx + 2]
+    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]
+    paddsw    mm1, mm0
+    paddsw    mm2, mm0
+    psraw     mm1, 6
+    psraw     mm2, 6
+    packuswb  mm1, mm2
+    movntq    [ebp], mm1
+    lea       ebp, [ebp + 8]
+    sub       ecx, 2
+    ja        convertloop
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)
+void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width) {
+  __asm {
+    pushad
+    mov       edx, [esp + 32 + 4]   // Y
+    mov       edi, [esp + 32 + 8]   // U
+    mov       esi, [esp + 32 + 12]  // V
+    mov       ebp, [esp + 32 + 16]  // rgb
+    mov       ecx, [esp + 32 + 20]  // width
+
+ convertloop :
+    movzx     eax, byte ptr [edi]
+    lea       edi, [edi + 1]
+    movzx     ebx, byte ptr [esi]
+    lea       esi, [esi + 1]
+    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
+    lea       edx, [edx + 1]
+    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]
+    psraw     mm0, 6
+    packuswb  mm0, mm0
+    movd      [ebp], mm0
+    lea       ebp, [ebp + 4]
+    sub       ecx, 1
+    ja        convertloop
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)
+void FastConvertYToRGB32Row(const uint8* y_buf,
+                            uint8* rgb_buf,
+                            int width) {
+  __asm {
+    push      ebx
+    mov       eax, [esp + 4 + 4]   // Y
+    mov       edx, [esp + 4 + 8]   // rgb
+    mov       ecx, [esp + 4 + 12]  // width
+
+ convertloop :
+    movzx     ebx, byte ptr [eax]
+    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
+    psraw     mm0, 6
+    movzx     ebx, byte ptr [eax + 1]
+    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
+    psraw     mm1, 6
+    packuswb  mm0, mm1
+    lea       eax, [eax + 2]
+    movq      [edx], mm0
+    lea       edx, [edx + 8]
+    sub       ecx, 2
+    ja        convertloop
+
+    pop       ebx
+    ret
+  }
+}
+
+#endif
+
+}  // extern "C"
diff --git a/files/source/scale.cc b/files/source/scale.cc
new file mode 100644
index 0000000..d3b7d33
--- /dev/null
+++ b/files/source/scale.cc
@@ -0,0 +1,3481 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER)
+#define ALIGN16(var) __declspec(align(16)) var
+#else
+#define ALIGN16(var) var __attribute__((aligned(16)))
+#endif
+
+// Note: A Neon reference manual
+// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
+// Note: Some SSE2 reference manuals
+// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
+
+namespace libyuv {
+
+// Set the following flag to true to revert to only
+// using the reference implementation ScalePlaneBox(), and
+// NOT the optimized versions. Useful for debugging and
+// when comparing the quality of the resulting YUV planes
+// as produced by the optimized and non-optimized versions.
+
+static bool use_reference_impl_ = false;
+
+void SetUseReferenceImpl(bool use) {
+  use_reference_impl_ = use;
+}
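+// Example (hypothetical caller code, not part of this file): to compare the
+// two paths on the same frame, force the reference box filter, run the scaler
+// entry point declared in libyuv/scale.h, then switch back:
+//   libyuv::SetUseReferenceImpl(true);   // reference ScalePlaneBox() only
+//   ... scale a test frame and keep the output ...
+//   libyuv::SetUseReferenceImpl(false);  // optimized versions again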
+
+/**
+ * NEON downscalers with interpolation.
+ *
+ * Provided by Fritz Koenig
+ *
+ */
+
+#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
+#define HAS_SCALEROWDOWN2_NEON
+void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
+                        uint8* dst, int dst_width) {
+  __asm__ volatile
+  (
+    "1:\n"
+    "vld2.u8    {q0,q1}, [%0]!    \n"  // load even pixels into q0, odd into q1
+    "vst1.u8    {q0}, [%1]!       \n"  // store even pixels
+    "subs       %2, %2, #16       \n"  // 16 processed per loop
+    "bhi        1b                \n"
+    : "+r"(src_ptr),          // %0
+      "+r"(dst),              // %1
+      "+r"(dst_width)         // %2
+    :
+    : "q0", "q1"              // Clobber List
+  );
+}
+
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
+                           uint8* dst, int dst_width) {
+  __asm__ volatile
+  (
+    "mov        r4, #2            \n"  // rounding constant
+    "add        %1, %0            \n"  // change the stride to row 2 pointer
+    "vdup.16    q4, r4            \n"
+    "1:\n"
+    "vld1.u8    {q0,q1}, [%0]!    \n"  // load row 1 and post increment
+    "vld1.u8    {q2,q3}, [%1]!    \n"  // load row 2 and post increment
+    "vpaddl.u8  q0, q0            \n"  // row 1 add adjacent
+    "vpaddl.u8  q1, q1            \n"
+    "vpadal.u8  q0, q2            \n"  // row 2 add adjacent, add row 1 to row 2
+    "vpadal.u8  q1, q3            \n"
+    "vadd.u16   q0, q4            \n"  // rounding
+    "vadd.u16   q1, q4            \n"
+    "vshrn.u16  d0, q0, #2        \n"  // downshift and pack
+    "vshrn.u16  d1, q1, #2        \n"
+    "vst1.u8    {q0}, [%2]!       \n"
+    "subs       %3, %3, #16       \n"  // 16 processed per loop
+    "bhi        1b                \n"
+    : "+r"(src_ptr),          // %0
+      "+r"(src_stride),       // %1
+      "+r"(dst),              // %2
+      "+r"(dst_width)         // %3
+    :
+    : "r4", "q0", "q1", "q2", "q3", "q4"              // Clobber List
+   );
+}
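+// In scalar terms the loop above is the rounded 2x2 box filter
+//   dst[x] = (r0[2x] + r0[2x + 1] + r1[2x] + r1[2x + 1] + 2) >> 2
+// where r0/r1 are the two source rows; the +2 is the rounding constant in r4.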
+
+#define HAS_SCALEROWDOWN4_NEON
+// Expecting widths on ARM devices to be smaller, so 8x4 blocks were chosen
+//  to get the most coverage.  Revisit later to evaluate 16x4 blocks with
+//  handling of leftovers.
+static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
+                               uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "mov        r4, #4            \n"
+    "1:                           \n"
+    "vld1.u8    {d0[0]}, [%0],r4  \n"   // load up only 2 pixels of data to
+    "vld1.u8    {d0[1]}, [%0],r4  \n"   //  represent the entire 8x4 block
+
+    "vst1.u16   {d0[0]}, [%1]!    \n"
+
+    "subs       %2, #2            \n"   // dst_width -= 2
+    "bhi        1b                \n"
+    : "+r"(src_ptr),          // %0
+      "+r"(dst_ptr),          // %1
+      "+r"(dst_width)         // %2
+    :
+    : "r4", "q0", "q1", "memory", "cc"
+  );
+}
+
+static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "1:                           \n"
+    "mov        r4, %0            \n"
+    "vld1.u8    {d0}, [r4],%3     \n"   // load up 8x4 block of input data
+    "vld1.u8    {d1}, [r4],%3     \n"
+    "vld1.u8    {d2}, [r4],%3     \n"
+    "vld1.u8    {d3}, [r4]        \n"
+
+    // data is loaded up into q0 and q1
+    // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
+    // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
+    // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
+    "vpaddl.u8  q0, q0            \n"
+
+    // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
+    // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
+    "vpadal.u8  q0, q1            \n"
+
+    // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
+    // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
+    "vpaddl.u16 q0, q0            \n"
+
+
+    // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a30+a31+a12+a13+a32+a33
+    //      b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b30+b31+b12+b13+b32+b33
+    "vadd.u32   d0, d1            \n"
+
+    "vrshr.u32  d0, d0, #4        \n"   // divide by 16 w/rounding
+
+    "vst1.u8    {d0[0]}, [%1]!    \n"
+    "vst1.u8    {d0[4]}, [%1]!    \n"
+
+    "add        %0, #8            \n"   // move src pointer to next 8 pixels
+    "subs       %2, #2            \n"   // dst_width -= 2
+    "bhi        1b                \n"
+
+    : "+r"(src_ptr),          // %0
+      "+r"(dst_ptr),          // %1
+      "+r"(dst_width)         // %2
+    : "r"(src_stride)         // %3
+    : "r4", "q0", "q1", "memory", "cc"
+  );
+}
+
+/**
+ * SSE2 downscalers with interpolation.
+ *
+ * Provided by Frank Barchard (fbarchard@google.com)
+ *
+ */
+
+// Constants for SSE2 code
+#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
+    !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
+#if defined(_MSC_VER)
+#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
+#elif defined(OSX)
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#else
+#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
+#endif
+
+// Offsets for source bytes 0 to 9
+extern "C" TALIGN16(const uint8, shuf0[16]) =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+extern "C" TALIGN16(const uint8, shuf1[16]) =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+extern "C" TALIGN16(const uint8, shuf2[16]) =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+extern "C" TALIGN16(const uint8, shuf01[16]) =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+extern "C" TALIGN16(const uint8, shuf11[16]) =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+extern "C" TALIGN16(const uint8, shuf21[16]) =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+extern "C" TALIGN16(const uint8, madd01[16]) =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+extern "C" TALIGN16(const uint8, madd11[16]) =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+extern "C" TALIGN16(const uint8, madd21[16]) =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant, added before the final >> 2 in the 3/4 filters
+extern "C" TALIGN16(const int16, round34[8]) =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+extern "C" TALIGN16(const uint8, shuf38a[16]) =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+extern "C" TALIGN16(const uint8, shuf38b[16]) =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+extern "C" TALIGN16(const uint8, shufac0[16]) =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+extern "C" TALIGN16(const uint8, shufac3[16]) =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+extern "C" TALIGN16(const uint16, scaleac3[8]) =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+extern "C" TALIGN16(const uint8, shufab0[16]) =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+extern "C" TALIGN16(const uint8, shufab1[16]) =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+extern "C" TALIGN16(const uint8, shufab2[16]) =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+extern "C" TALIGN16(const uint16, scaleab2[8]) =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
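+
+// The scaleac3/scaleab2 values exploit pmulhuw, which keeps the high 16 bits
+// of a 16x16 multiply: multiplying a box sum s by 65536 / n yields
+// (s * (65536 / n)) >> 16, i.e. an integer approximation of s / n with no
+// divide instruction.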
+#endif
+
+#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+
+#define HAS_SCALEROWDOWN2_SSE2
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
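+// In scalar terms the loop below is simply dst_ptr[i] = src_ptr[2 * i]; the
+// 0x00ff00ff mask keeps the even source bytes and packuswb repacks them.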
+__declspec(naked)
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm7
+    pand       xmm1, xmm7
+    packuswb   xmm0, xmm1
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         wloop
+
+    ret
+  }
+}
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked)
+static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm7
+    pand       xmm3, xmm7
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+#define HAS_SCALEROWDOWN4_SSE2
+// Point samples 32 pixels to 8 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+                                     // src_stride ignored
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    pcmpeqb    xmm7, xmm7            // generate mask 0x000000ff
+    psrld      xmm7, 24
+
+  wloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + 16]
+    lea        esi,  [esi + 32]
+    pand       xmm0, xmm7
+    pand       xmm1, xmm7
+    packuswb   xmm0, xmm1
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+    sub        ecx, 8
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+// Blends 32x4 rectangle to 8x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+    lea        edx, [ebx + ebx * 2]  // src_stride * 3
+
+  wloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + 16]
+    movdqa     xmm2, [esi + ebx]
+    movdqa     xmm3, [esi + ebx + 16]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, [esi + ebx * 2]
+    movdqa     xmm3, [esi + ebx * 2 + 16]
+    movdqa     xmm4, [esi + edx]
+    movdqa     xmm5, [esi + edx + 16]
+    lea        esi, [esi + 32]
+    pavgb      xmm2, xmm4
+    pavgb      xmm3, xmm5
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm7
+    pand       xmm3, xmm7
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
+    psrlw      xmm0, 8
+    pand       xmm2, xmm7
+    pavgw      xmm0, xmm2
+    packuswb   xmm0, xmm0
+
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+    sub        ecx, 8
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+#define HAS_SCALEROWDOWN8_SSE2
+// Point samples 32 pixels to 4 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
+__declspec(naked)
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+                                     // src_stride ignored
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 byte of every 8
+    psrlq      xmm7, 56
+
+  wloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + 16]
+    lea        esi,  [esi + 32]
+    pand       xmm0, xmm7
+    pand       xmm1, xmm7
+    packuswb   xmm0, xmm1  // 32->16
+    packuswb   xmm0, xmm0  // 16->8
+    packuswb   xmm0, xmm0  // 8->4
+    movd       dword ptr [edi], xmm0
+    lea        edi, [edi + 4]
+    sub        ecx, 4
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+// Blends 32x8 rectangle to 4x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
+__declspec(naked)
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    lea        edx, [ebx + ebx * 2]  // src_stride * 3
+    pxor       xmm7, xmm7
+
+  wloop:
+    movdqa     xmm0, [esi]           // average 8 rows to 1
+    movdqa     xmm1, [esi + 16]
+    movdqa     xmm2, [esi + ebx]
+    movdqa     xmm3, [esi + ebx + 16]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, [esi + ebx * 2]
+    movdqa     xmm3, [esi + ebx * 2 + 16]
+    movdqa     xmm4, [esi + edx]
+    movdqa     xmm5, [esi + edx + 16]
+    lea        ebp, [esi + ebx * 4]
+    lea        esi, [esi + 32]
+    pavgb      xmm2, xmm4
+    pavgb      xmm3, xmm5
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, [ebp]
+    movdqa     xmm3, [ebp + 16]
+    movdqa     xmm4, [ebp + ebx]
+    movdqa     xmm5, [ebp + ebx + 16]
+    pavgb      xmm2, xmm4
+    pavgb      xmm3, xmm5
+    movdqa     xmm4, [ebp + ebx * 2]
+    movdqa     xmm5, [ebp + ebx * 2 + 16]
+    movdqa     xmm6, [ebp + edx]
+    pavgb      xmm4, xmm6
+    movdqa     xmm6, [ebp + edx + 16]
+    pavgb      xmm5, xmm6
+    pavgb      xmm2, xmm4
+    pavgb      xmm3, xmm5
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+
+    psadbw     xmm0, xmm7            // average 32 pixels to 4
+    psadbw     xmm1, xmm7
+    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
+    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
+    por        xmm0, xmm1            //      -> 3201
+    psrlw      xmm0, 3
+    packuswb   xmm0, xmm0
+    packuswb   xmm0, xmm0
+    movd       dword ptr [edi], xmm0
+
+    lea        edi, [edi + 4]
+    sub        ecx, 4
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+#define HAS_SCALEROWDOWN34_SSSE3
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values.  For each 8 bytes written, 16 source bytes
+// are read and then shuffled to do the scaling.
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
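+// In scalar terms the shuffle tables keep offsets 0, 1 and 3 of every group
+// of four source pixels (offset 2 is dropped), which is all the point-sampled
+// 3/4 path does.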
+__declspec(naked)
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+                                 uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+                                     // src_stride ignored
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    movdqa     xmm3, _shuf0
+    movdqa     xmm4, _shuf1
+    movdqa     xmm5, _shuf2
+
+  wloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm2, [esi + 16]
+    lea        esi,  [esi + 32]
+    movdqa     xmm1, xmm2
+    palignr    xmm1, xmm0, 8
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edi], xmm0
+    movq       qword ptr [edi + 8], xmm1
+    movq       qword ptr [edi + 16], xmm2
+    lea        edi, [edi + 24]
+    sub        ecx, 24
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values.  For each 8 bytes written, 16 source bytes
+// are read and then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 round34
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
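+// In scalar terms, after the two source rows are averaged, each group of four
+// source pixels s0..s3 yields three outputs via the madd tables above:
+//   d0 = (3 * s0 + 1 * s1 + 2) >> 2
+//   d1 = (2 * s1 + 2 * s2 + 2) >> 2
+//   d2 = (1 * s2 + 3 * s3 + 2) >> 2
+// ScaleRowDown34_0_Int_SSSE3 below is identical except the two rows are
+// blended 3:1 (the double pavgb) rather than 1:1.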
+__declspec(naked)
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    movdqa     xmm2, _shuf01
+    movdqa     xmm3, _shuf11
+    movdqa     xmm4, _shuf21
+    movdqa     xmm5, _madd01
+    movdqa     xmm6, _madd11
+    movdqa     xmm7, _round34
+
+  wloop:
+    movdqa     xmm0, [esi]           // pixels 0..7
+    movdqa     xmm1, [esi+ebx]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi], xmm0
+    movdqu     xmm0, [esi+8]         // pixels 8..15
+    movdqu     xmm1, [esi+ebx+8]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi+8], xmm0
+    movdqa     xmm0, [esi+16]        // pixels 16..23
+    movdqa     xmm1, [esi+ebx+16]
+    lea        esi, [esi+32]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, _madd21
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi+16], xmm0
+    lea        edi, [edi+24]
+    sub        ecx, 24
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    movdqa     xmm2, _shuf01
+    movdqa     xmm3, _shuf11
+    movdqa     xmm4, _shuf21
+    movdqa     xmm5, _madd01
+    movdqa     xmm6, _madd11
+    movdqa     xmm7, _round34
+
+  wloop:
+    movdqa     xmm0, [esi]           // pixels 0..7
+    movdqa     xmm1, [esi+ebx]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi], xmm0
+    movdqu     xmm0, [esi+8]         // pixels 8..15
+    movdqu     xmm1, [esi+ebx+8]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi+8], xmm0
+    movdqa     xmm0, [esi+16]        // pixels 16..23
+    movdqa     xmm1, [esi+ebx+16]
+    lea        esi, [esi+32]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, _madd21
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi+16], xmm0
+    lea        edi, [edi+24]
+    sub        ecx, 24
+    ja         wloop
+
+    popad
+    ret
+  }
+}
+
+#define HAS_SCALEROWDOWN38_SSSE3
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
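+// Point sample: within each 16 source bytes the shuf38a/shuf38b tables keep
+// the six bytes at offsets 0, 3, 6, 8, 11 and 14, so 32 inputs -> 12 outputs.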
+__declspec(naked)
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+                                 uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    movdqa     xmm5, _shuf38a
+    movdqa     xmm6, _shuf38b
+    pxor       xmm7, xmm7
+
+  xloop:
+    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
+    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
+    lea        esi, [esi + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm6
+    paddusb    xmm0, xmm1
+
+    movq       qword ptr [edi], xmm0 // write 12 pixels
+    movhlps    xmm1, xmm0
+    movd       [edi + 8], xmm1
+    lea        edi, [edi + 12]
+    sub        ecx, 12
+    ja         xloop
+
+    popad
+    ret
+  }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    movdqa     xmm4, _shufac0
+    movdqa     xmm5, _shufac3
+    movdqa     xmm6, _scaleac3
+    pxor       xmm7, xmm7
+
+  xloop:
+    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
+    movdqa     xmm2, [esi + edx]
+    movhlps    xmm1, xmm0
+    movhlps    xmm3, xmm2
+    punpcklbw  xmm0, xmm7
+    punpcklbw  xmm1, xmm7
+    punpcklbw  xmm2, xmm7
+    punpcklbw  xmm3, xmm7
+    paddusw    xmm0, xmm2
+    paddusw    xmm1, xmm3
+    movdqa     xmm2, [esi + edx * 2]
+    lea        esi, [esi + 16]
+    movhlps    xmm3, xmm2
+    punpcklbw  xmm2, xmm7
+    punpcklbw  xmm3, xmm7
+    paddusw    xmm0, xmm2
+    paddusw    xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
+    psrldq     xmm0, 2
+    paddusw    xmm2, xmm0
+    psrldq     xmm0, 2
+    paddusw    xmm2, xmm0
+    pshufb     xmm2, xmm4
+
+    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
+    psrldq     xmm1, 2
+    paddusw    xmm3, xmm1
+    psrldq     xmm1, 2
+    paddusw    xmm3, xmm1
+    pshufb     xmm3, xmm5
+    paddusw    xmm2, xmm3
+
+    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
+    packuswb   xmm2, xmm2
+
+    movd       [edi], xmm2           // write 6 pixels
+    pextrw     eax, xmm2, 2
+    mov        [edi + 4], ax
+    lea        edi, [edi + 6]
+    sub        ecx, 6
+    ja         xloop
+
+    popad
+    ret
+  }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    movdqa     xmm4, _shufab0
+    movdqa     xmm5, _shufab1
+    movdqa     xmm6, _shufab2
+    movdqa     xmm7, _scaleab2
+
+  xloop:
+    movdqa     xmm2, [esi]           // average 2 rows into xmm2
+    pavgb      xmm2, [esi + edx]
+    lea        esi, [esi + 16]
+
+    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, xmm2
+    pshufb     xmm1, xmm5
+    paddusw    xmm0, xmm1
+    pshufb     xmm2, xmm6
+    paddusw    xmm0, xmm2
+
+    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
+    packuswb   xmm0, xmm0
+
+    movd       [edi], xmm0           // write 6 pixels
+    pextrw     eax, xmm0, 2
+    mov        [edi + 4], ax
+    lea        edi, [edi + 6]
+    sub        ecx, 6
+    ja         xloop
+
+    popad
+    ret
+  }
+}
+
+#define HAS_SCALEADDROWS_SSE2
+
+// Reads 16xN bytes and produces 16 shorts at a time.
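+// In scalar terms (a sketch of the loop below), for each of 16 columns
+//   dst_ptr[i] = sum over y in [0, src_height) of src_ptr[i + y * src_stride]
+// accumulated with unsigned saturation into 16-bit words.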
+__declspec(naked)
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                              uint16* dst_ptr, int src_width,
+                              int src_height) {
+  __asm {
+    pushad
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
+    mov        edi, [esp + 32 + 12]  // dst_ptr
+    mov        ecx, [esp + 32 + 16]  // src_width
+    mov        ebx, [esp + 32 + 20]  // src_height
+    pxor       xmm7, xmm7
+    dec        ebx
+
+  xloop:
+    // first row
+    movdqa     xmm2, [esi]
+    lea        eax, [esi + edx]
+    movhlps    xmm3, xmm2
+    mov        ebp, ebx
+    punpcklbw  xmm2, xmm7
+    punpcklbw  xmm3, xmm7
+
+    // sum remaining rows
+  yloop:
+    movdqa     xmm0, [eax]       // read 16 pixels
+    lea        eax, [eax + edx]  // advance to next row
+    movhlps    xmm1, xmm0
+    punpcklbw  xmm0, xmm7
+    punpcklbw  xmm1, xmm7
+    paddusw    xmm2, xmm0        // sum 16 words
+    paddusw    xmm3, xmm1
+    sub        ebp, 1
+    ja         yloop
+
+    movdqa     [edi], xmm2
+    movdqa     [edi + 16], xmm3
+    lea        edi, [edi + 32]
+    lea        esi, [esi + 16]
+
+    sub        ecx, 16
+    ja         xloop
+
+    popad
+    ret
+  }
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
+#define HAS_SCALEFILTERROWS_SSE2
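+// In scalar terms the filtered case below computes
+//   dst[x] = (src[x] * (256 - source_y_fraction) +
+//             src[x + src_stride] * source_y_fraction) >> 8
+// with fractions 0 and 128 special-cased to a plain copy and a pavgb average.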
+__declspec(naked)
+static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                                 int src_stride, int dst_width,
+                                 int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    cmp        eax, 0
+    je         xloop1
+    cmp        eax, 128
+    je         xloop2
+
+    movd       xmm6, eax            // xmm6 = y fraction
+    punpcklwd  xmm6, xmm6
+    pshufd     xmm6, xmm6, 0
+    neg        eax                  // xmm5 = 256 - y fraction
+    add        eax, 256
+    movd       xmm5, eax
+    punpcklwd  xmm5, xmm5
+    pshufd     xmm5, xmm5, 0
+    pxor       xmm7, xmm7
+
+  xloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm2, [esi + edx]
+    lea        esi, [esi + 16]
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm0, xmm7
+    punpcklbw  xmm2, xmm7
+    punpckhbw  xmm1, xmm7
+    punpckhbw  xmm3, xmm7
+    pmullw     xmm0, xmm5           // scale row 0
+    pmullw     xmm1, xmm5
+    pmullw     xmm2, xmm6           // scale row 1
+    pmullw     xmm3, xmm6
+    paddusw    xmm0, xmm2           // sum rows
+    paddusw    xmm1, xmm3
+    psrlw      xmm0, 8
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         xloop
+
+    mov        al, [edi - 1]
+    mov        [edi], al
+    pop        edi
+    pop        esi
+    ret
+
+  xloop1:
+    movdqa     xmm0, [esi]
+    lea        esi, [esi + 16]
+    movdqa     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         xloop1
+
+    mov        al, [edi - 1]
+    mov        [edi], al
+    pop        edi
+    pop        esi
+    ret
+
+  xloop2:
+    movdqa     xmm0, [esi]
+    movdqa     xmm2, [esi + edx]
+    lea        esi, [esi + 16]
+    pavgb      xmm0, xmm2
+    movdqa     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         xloop2
+
+    mov        al, [edi - 1]
+    mov        [edi], al
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
+#define HAS_SCALEFILTERROWS_SSSE3
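+// Same blend as the SSE2 version, but the fraction is halved so both weights
+// fit in a byte pair for pmaddubsw:
+//   dst[x] = (src[x] * (128 - f / 2) + src[x + src_stride] * (f / 2)) >> 7
+// which matches the SSE2 result up to a rounding step when f is odd.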
+__declspec(naked)
+static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                                  int src_stride, int dst_width,
+                                  int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    cmp        eax, 0
+    je         xloop1
+    cmp        eax, 128
+    je         xloop2
+
+    shr        eax, 1
+    mov        ah,al
+    neg        al
+    add        al, 128
+    movd       xmm7, eax
+    punpcklwd  xmm7, xmm7
+    pshufd     xmm7, xmm7, 0
+
+  xloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm2, [esi + edx]
+    lea        esi, [esi + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2
+    punpckhbw  xmm1, xmm2
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    psrlw      xmm0, 7
+    psrlw      xmm1, 7
+    packuswb   xmm0, xmm1
+    movdqa     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         xloop
+
+    mov        al, [edi - 1]
+    mov        [edi], al
+    pop        edi
+    pop        esi
+    ret
+
+  xloop1:
+    movdqa     xmm0, [esi]
+    lea        esi, [esi + 16]
+    movdqa     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         xloop1
+
+    mov        al, [edi - 1]
+    mov        [edi], al
+    pop        edi
+    pop        esi
+    ret
+
+  xloop2:
+    movdqa     xmm0, [esi]
+    movdqa     xmm2, [esi + edx]
+    lea        esi, [esi + 16]
+    pavgb      xmm0, xmm2
+    movdqa     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 16
+    ja         xloop2
+
+    mov        al, [edi - 1]
+    mov        [edi], al
+    pop        edi
+    pop        esi
+    ret
+
+  }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                                    int dst_width) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_ptr
+    mov        eax, [esp + 8]    // src_ptr
+    mov        ecx, [esp + 12]   // dst_width
+    movdqa     xmm1, _round34
+    movdqa     xmm2, _shuf01
+    movdqa     xmm3, _shuf11
+    movdqa     xmm4, _shuf21
+    movdqa     xmm5, _madd01
+    movdqa     xmm6, _madd11
+    movdqa     xmm7, _madd21
+
+  wloop:
+    movdqa     xmm0, [eax]           // pixels 0..7
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm1
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax+8]         // pixels 8..15
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm1
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx+8], xmm0
+    movdqa     xmm0, [eax+16]        // pixels 16..23
+    lea        eax, [eax+32]
+    pshufb     xmm0, xmm4
+    pmaddubsw  xmm0, xmm7
+    paddsw     xmm0, xmm1
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx+16], xmm0
+    lea        edx, [edx+24]
+    sub        ecx, 24
+    ja         wloop
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+#define HAS_SCALEROWDOWN2_SSE2
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory"
+);
+}
+
+static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%3,1),%%xmm2\n"
+  "movdqa     0x10(%0,%3,1),%%xmm3\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "pand       %%xmm7,%%xmm2\n"
+  "pand       %%xmm7,%%xmm3\n"
+  "pavgw      %%xmm2,%%xmm0\n"
+  "pavgw      %%xmm3,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"(static_cast<intptr_t>(src_stride))   // %3
+  : "memory"
+);
+}
+
+#define HAS_SCALEROWDOWN4_SSE2
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrld      $0x18,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "sub        $0x8,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory"
+);
+}
+
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  intptr_t temp = 0;
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
+  "lea        (%4,%4,2),%3\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%4,1),%%xmm2\n"
+  "movdqa     0x10(%0,%4,1),%%xmm3\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "movdqa     (%0,%4,2),%%xmm2\n"
+  "movdqa     0x10(%0,%4,2),%%xmm3\n"
+  "movdqa     (%0,%3,1),%%xmm4\n"
+  "movdqa     0x10(%0,%3,1),%%xmm5\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm4,%%xmm2\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm5,%%xmm3\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "pand       %%xmm7,%%xmm2\n"
+  "pand       %%xmm7,%%xmm3\n"
+  "pavgw      %%xmm2,%%xmm0\n"
+  "pavgw      %%xmm3,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "pand       %%xmm7,%%xmm2\n"
+  "pavgw      %%xmm2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "sub        $0x8,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width),   // %2
+    "+r"(temp)         // %3
+  : "r"(static_cast<intptr_t>(src_stride))    // %4
+  : "memory"
+);
+}
+
+#define HAS_SCALEROWDOWN8_SSE2
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlq      $0x38,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movd       %%xmm0,(%1)\n"
+  "lea        0x4(%1),%1\n"
+  "sub        $0x4,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory"
+);
+}
+
+#if defined(__i386__)
+extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown8Int_SSE2\n"
+"_ScaleRowDown8Int_SSE2:\n"
+#else
+    ".global ScaleRowDown8Int_SSE2\n"
+"ScaleRowDown8Int_SSE2:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%ebx\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "lea    (%ebx,%ebx,2),%edx\n"
+    "pxor   %xmm7,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa 0x10(%esi),%xmm1\n"
+    "movdqa (%esi,%ebx,1),%xmm2\n"
+    "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
+    "pavgb  %xmm2,%xmm0\n"
+    "pavgb  %xmm3,%xmm1\n"
+    "movdqa (%esi,%ebx,2),%xmm2\n"
+    "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
+    "movdqa (%esi,%edx,1),%xmm4\n"
+    "movdqa 0x10(%esi,%edx,1),%xmm5\n"
+    "lea    (%esi,%ebx,4),%ebp\n"
+    "lea    0x20(%esi),%esi\n"
+    "pavgb  %xmm4,%xmm2\n"
+    "pavgb  %xmm5,%xmm3\n"
+    "pavgb  %xmm2,%xmm0\n"
+    "pavgb  %xmm3,%xmm1\n"
+    "movdqa 0x0(%ebp),%xmm2\n"
+    "movdqa 0x10(%ebp),%xmm3\n"
+    "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
+    "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
+    "pavgb  %xmm4,%xmm2\n"
+    "pavgb  %xmm5,%xmm3\n"
+    "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
+    "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
+    "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
+    "pavgb  %xmm6,%xmm4\n"
+    "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
+    "pavgb  %xmm6,%xmm5\n"
+    "pavgb  %xmm4,%xmm2\n"
+    "pavgb  %xmm5,%xmm3\n"
+    "pavgb  %xmm2,%xmm0\n"
+    "pavgb  %xmm3,%xmm1\n"
+    "psadbw %xmm7,%xmm0\n"
+    "psadbw %xmm7,%xmm1\n"
+    "pshufd $0xd8,%xmm0,%xmm0\n"
+    "pshufd $0x8d,%xmm1,%xmm1\n"
+    "por    %xmm1,%xmm0\n"
+    "psrlw  $0x3,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movd   %xmm0,(%edi)\n"
+    "lea    0x4(%edi),%edi\n"
+    "sub    $0x4,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+
+// -fPIC is used for the magiccam plugin; the following routines are not
+// position-independent, so they are excluded when __PIC__ is defined.
+#if !defined(__PIC__)
+#define HAS_SCALEROWDOWN34_SSSE3
+extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+                                     uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown34_SSSE3\n"
+"_ScaleRowDown34_SSSE3:\n"
+#else
+    ".global ScaleRowDown34_SSSE3\n"
+"ScaleRowDown34_SSSE3:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "movdqa _shuf0,%xmm3\n"
+    "movdqa _shuf1,%xmm4\n"
+    "movdqa _shuf2,%xmm5\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa 0x10(%esi),%xmm2\n"
+    "lea    0x20(%esi),%esi\n"
+    "movdqa %xmm2,%xmm1\n"
+    "palignr $0x8,%xmm0,%xmm1\n"
+    "pshufb %xmm3,%xmm0\n"
+    "pshufb %xmm4,%xmm1\n"
+    "pshufb %xmm5,%xmm2\n"
+    "movq   %xmm0,(%edi)\n"
+    "movq   %xmm1,0x8(%edi)\n"
+    "movq   %xmm2,0x10(%edi)\n"
+    "lea    0x18(%edi),%edi\n"
+    "sub    $0x18,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+
+extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown34_1_Int_SSSE3\n"
+"_ScaleRowDown34_1_Int_SSSE3:\n"
+#else
+    ".global ScaleRowDown34_1_Int_SSSE3\n"
+"ScaleRowDown34_1_Int_SSSE3:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%ebp\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "movdqa _shuf01,%xmm2\n"
+    "movdqa _shuf11,%xmm3\n"
+    "movdqa _shuf21,%xmm4\n"
+    "movdqa _madd01,%xmm5\n"
+    "movdqa _madd11,%xmm6\n"
+    "movdqa _round34,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%ebp),%xmm1\n"
+    "pavgb  %xmm1,%xmm0\n"
+    "pshufb %xmm2,%xmm0\n"
+    "pmaddubsw %xmm5,%xmm0\n"
+    "paddsw %xmm7,%xmm0\n"
+    "psrlw  $0x2,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movq   %xmm0,(%edi)\n"
+    "movdqu 0x8(%esi),%xmm0\n"
+    "movdqu 0x8(%esi,%ebp),%xmm1\n"
+    "pavgb  %xmm1,%xmm0\n"
+    "pshufb %xmm3,%xmm0\n"
+    "pmaddubsw %xmm6,%xmm0\n"
+    "paddsw %xmm7,%xmm0\n"
+    "psrlw  $0x2,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movq   %xmm0,0x8(%edi)\n"
+    "movdqa 0x10(%esi),%xmm0\n"
+    "movdqa 0x10(%esi,%ebp),%xmm1\n"
+    "lea    0x20(%esi),%esi\n"
+    "pavgb  %xmm1,%xmm0\n"
+    "pshufb %xmm4,%xmm0\n"
+    "movdqa  _madd21,%xmm1\n"
+    "pmaddubsw %xmm1,%xmm0\n"
+    "paddsw %xmm7,%xmm0\n"
+    "psrlw  $0x2,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movq   %xmm0,0x10(%edi)\n"
+    "lea    0x18(%edi),%edi\n"
+    "sub    $0x18,%ecx\n"
+    "ja     1b\n"
+
+    "popa\n"
+    "ret\n"
+);
+
+extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown34_0_Int_SSSE3\n"
+"_ScaleRowDown34_0_Int_SSSE3:\n"
+#else
+    ".global ScaleRowDown34_0_Int_SSSE3\n"
+"ScaleRowDown34_0_Int_SSSE3:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%ebp\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "movdqa _shuf01,%xmm2\n"
+    "movdqa _shuf11,%xmm3\n"
+    "movdqa _shuf21,%xmm4\n"
+    "movdqa _madd01,%xmm5\n"
+    "movdqa _madd11,%xmm6\n"
+    "movdqa _round34,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%ebp,1),%xmm1\n"
+    "pavgb  %xmm0,%xmm1\n"
+    "pavgb  %xmm1,%xmm0\n"
+    "pshufb %xmm2,%xmm0\n"
+    "pmaddubsw %xmm5,%xmm0\n"
+    "paddsw %xmm7,%xmm0\n"
+    "psrlw  $0x2,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movq   %xmm0,(%edi)\n"
+    "movdqu 0x8(%esi),%xmm0\n"
+    "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
+    "pavgb  %xmm0,%xmm1\n"
+    "pavgb  %xmm1,%xmm0\n"
+    "pshufb %xmm3,%xmm0\n"
+    "pmaddubsw %xmm6,%xmm0\n"
+    "paddsw %xmm7,%xmm0\n"
+    "psrlw  $0x2,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movq   %xmm0,0x8(%edi)\n"
+    "movdqa 0x10(%esi),%xmm0\n"
+    "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
+    "lea    0x20(%esi),%esi\n"
+    "pavgb  %xmm0,%xmm1\n"
+    "pavgb  %xmm1,%xmm0\n"
+    "pshufb %xmm4,%xmm0\n"
+    "movdqa  _madd21,%xmm1\n"
+    "pmaddubsw %xmm1,%xmm0\n"
+    "paddsw %xmm7,%xmm0\n"
+    "psrlw  $0x2,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movq   %xmm0,0x10(%edi)\n"
+    "lea    0x18(%edi),%edi\n"
+    "sub    $0x18,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+
+#define HAS_SCALEROWDOWN38_SSSE3
+extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+                                     uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown38_SSSE3\n"
+"_ScaleRowDown38_SSSE3:\n"
+#else
+    ".global ScaleRowDown38_SSSE3\n"
+"ScaleRowDown38_SSSE3:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%edx\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "movdqa _shuf38a ,%xmm5\n"
+    "movdqa _shuf38b ,%xmm6\n"
+    "pxor   %xmm7,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa 0x10(%esi),%xmm1\n"
+    "lea    0x20(%esi),%esi\n"
+    "pshufb %xmm5,%xmm0\n"
+    "pshufb %xmm6,%xmm1\n"
+    "paddusb %xmm1,%xmm0\n"
+    "movq   %xmm0,(%edi)\n"
+    "movhlps %xmm0,%xmm1\n"
+    "movd   %xmm1,0x8(%edi)\n"
+    "lea    0xc(%edi),%edi\n"
+    "sub    $0xc,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+
+extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown38_3_Int_SSSE3\n"
+"_ScaleRowDown38_3_Int_SSSE3:\n"
+#else
+    ".global ScaleRowDown38_3_Int_SSSE3\n"
+"ScaleRowDown38_3_Int_SSSE3:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%edx\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "movdqa _shufac0,%xmm4\n"
+    "movdqa _shufac3,%xmm5\n"
+    "movdqa _scaleac3,%xmm6\n"
+    "pxor   %xmm7,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%edx,1),%xmm2\n"
+    "movhlps %xmm0,%xmm1\n"
+    "movhlps %xmm2,%xmm3\n"
+    "punpcklbw %xmm7,%xmm0\n"
+    "punpcklbw %xmm7,%xmm1\n"
+    "punpcklbw %xmm7,%xmm2\n"
+    "punpcklbw %xmm7,%xmm3\n"
+    "paddusw %xmm2,%xmm0\n"
+    "paddusw %xmm3,%xmm1\n"
+    "movdqa (%esi,%edx,2),%xmm2\n"
+    "lea    0x10(%esi),%esi\n"
+    "movhlps %xmm2,%xmm3\n"
+    "punpcklbw %xmm7,%xmm2\n"
+    "punpcklbw %xmm7,%xmm3\n"
+    "paddusw %xmm2,%xmm0\n"
+    "paddusw %xmm3,%xmm1\n"
+    "movdqa %xmm0,%xmm2\n"
+    "psrldq $0x2,%xmm0\n"
+    "paddusw %xmm0,%xmm2\n"
+    "psrldq $0x2,%xmm0\n"
+    "paddusw %xmm0,%xmm2\n"
+    "pshufb %xmm4,%xmm2\n"
+    "movdqa %xmm1,%xmm3\n"
+    "psrldq $0x2,%xmm1\n"
+    "paddusw %xmm1,%xmm3\n"
+    "psrldq $0x2,%xmm1\n"
+    "paddusw %xmm1,%xmm3\n"
+    "pshufb %xmm5,%xmm3\n"
+    "paddusw %xmm3,%xmm2\n"
+    "pmulhuw %xmm6,%xmm2\n"
+    "packuswb %xmm2,%xmm2\n"
+    "movd   %xmm2,(%edi)\n"
+    "pextrw $0x2,%xmm2,%eax\n"
+    "mov    %ax,0x4(%edi)\n"
+    "lea    0x6(%edi),%edi\n"
+    "sub    $0x6,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+
+extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* dst_ptr, int dst_width);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleRowDown38_2_Int_SSSE3\n"
+"_ScaleRowDown38_2_Int_SSSE3:\n"
+#else
+    ".global ScaleRowDown38_2_Int_SSSE3\n"
+"ScaleRowDown38_2_Int_SSSE3:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%edx\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "movdqa _shufab0,%xmm4\n"
+    "movdqa _shufab1,%xmm5\n"
+    "movdqa _shufab2,%xmm6\n"
+    "movdqa _scaleab2,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm2\n"
+    "pavgb  (%esi,%edx,1),%xmm2\n"
+    "lea    0x10(%esi),%esi\n"
+    "movdqa %xmm2,%xmm0\n"
+    "pshufb %xmm4,%xmm0\n"
+    "movdqa %xmm2,%xmm1\n"
+    "pshufb %xmm5,%xmm1\n"
+    "paddusw %xmm1,%xmm0\n"
+    "pshufb %xmm6,%xmm2\n"
+    "paddusw %xmm2,%xmm0\n"
+    "pmulhuw %xmm7,%xmm0\n"
+    "packuswb %xmm0,%xmm0\n"
+    "movd   %xmm0,(%edi)\n"
+    "pextrw $0x2,%xmm0,%eax\n"
+    "mov    %ax,0x4(%edi)\n"
+    "lea    0x6(%edi),%edi\n"
+    "sub    $0x6,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+#endif  // !defined(__PIC__)
+
+#define HAS_SCALEADDROWS_SSE2
+extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint16* dst_ptr, int src_width,
+                                  int src_height);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleAddRows_SSE2\n"
+"_ScaleAddRows_SSE2:\n"
+#else
+    ".global ScaleAddRows_SSE2\n"
+"ScaleAddRows_SSE2:\n"
+#endif
+    "pusha\n"
+    "mov    0x24(%esp),%esi\n"
+    "mov    0x28(%esp),%edx\n"
+    "mov    0x2c(%esp),%edi\n"
+    "mov    0x30(%esp),%ecx\n"
+    "mov    0x34(%esp),%ebx\n"
+    "pxor   %xmm7,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm2\n"
+    "lea    (%esi,%edx,1),%eax\n"
+    "movhlps %xmm2,%xmm3\n"
+    "lea    -0x1(%ebx),%ebp\n"
+    "punpcklbw %xmm7,%xmm2\n"
+    "punpcklbw %xmm7,%xmm3\n"
+
+"2:"
+    "movdqa (%eax),%xmm0\n"
+    "lea    (%eax,%edx,1),%eax\n"
+    "movhlps %xmm0,%xmm1\n"
+    "punpcklbw %xmm7,%xmm0\n"
+    "punpcklbw %xmm7,%xmm1\n"
+    "paddusw %xmm0,%xmm2\n"
+    "paddusw %xmm1,%xmm3\n"
+    "sub    $0x1,%ebp\n"
+    "ja     2b\n"
+
+    "movdqa %xmm2,(%edi)\n"
+    "movdqa %xmm3,0x10(%edi)\n"
+    "lea    0x20(%edi),%edi\n"
+    "lea    0x10(%esi),%esi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     1b\n"
+    "popa\n"
+    "ret\n"
+);
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
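+// The fraction selects the blend: 0 copies the top row, 128 takes a pavgb
+// average, and any other value widens each row to 16 bits, weights them with
+// (256 - fraction) and fraction via pmullw, then sums and shifts right by 8.
+// The final byte store duplicates the last pixel one past the end of the row,
+// presumably so a following horizontal filter can read one pixel beyond it.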
+#define HAS_SCALEFILTERROWS_SSE2
+extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
+                                     const uint8* src_ptr, int src_stride,
+                                     int dst_width, int source_y_fraction);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleFilterRows_SSE2\n"
+"_ScaleFilterRows_SSE2:\n"
+#else
+    ".global ScaleFilterRows_SSE2\n"
+"ScaleFilterRows_SSE2:\n"
+#endif
+    "push   %esi\n"
+    "push   %edi\n"
+    "mov    0xc(%esp),%edi\n"
+    "mov    0x10(%esp),%esi\n"
+    "mov    0x14(%esp),%edx\n"
+    "mov    0x18(%esp),%ecx\n"
+    "mov    0x1c(%esp),%eax\n"
+    "cmp    $0x0,%eax\n"
+    "je     2f\n"
+    "cmp    $0x80,%eax\n"
+    "je     3f\n"
+    "movd   %eax,%xmm6\n"
+    "punpcklwd %xmm6,%xmm6\n"
+    "pshufd $0x0,%xmm6,%xmm6\n"
+    "neg    %eax\n"
+    "add    $0x100,%eax\n"
+    "movd   %eax,%xmm5\n"
+    "punpcklwd %xmm5,%xmm5\n"
+    "pshufd $0x0,%xmm5,%xmm5\n"
+    "pxor   %xmm7,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%edx,1),%xmm2\n"
+    "lea    0x10(%esi),%esi\n"
+    "movdqa %xmm0,%xmm1\n"
+    "movdqa %xmm2,%xmm3\n"
+    "punpcklbw %xmm7,%xmm0\n"
+    "punpcklbw %xmm7,%xmm2\n"
+    "punpckhbw %xmm7,%xmm1\n"
+    "punpckhbw %xmm7,%xmm3\n"
+    "pmullw %xmm5,%xmm0\n"
+    "pmullw %xmm5,%xmm1\n"
+    "pmullw %xmm6,%xmm2\n"
+    "pmullw %xmm6,%xmm3\n"
+    "paddusw %xmm2,%xmm0\n"
+    "paddusw %xmm3,%xmm1\n"
+    "psrlw  $0x8,%xmm0\n"
+    "psrlw  $0x8,%xmm1\n"
+    "packuswb %xmm1,%xmm0\n"
+    "movdqa %xmm0,(%edi)\n"
+    "lea    0x10(%edi),%edi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     1b\n"
+    "mov    -0x1(%edi),%al\n"
+    "mov    %al,(%edi)\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "ret\n"
+
+"2:"
+    "movdqa (%esi),%xmm0\n"
+    "lea    0x10(%esi),%esi\n"
+    "movdqa %xmm0,(%edi)\n"
+    "lea    0x10(%edi),%edi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     2b\n"
+
+    "mov    -0x1(%edi),%al\n"
+    "mov    %al,(%edi)\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "ret\n"
+
+"3:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%edx,1),%xmm2\n"
+    "lea    0x10(%esi),%esi\n"
+    "pavgb  %xmm2,%xmm0\n"
+    "movdqa %xmm0,(%edi)\n"
+    "lea    0x10(%edi),%edi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     3b\n"
+
+    "mov    -0x1(%edi),%al\n"
+    "mov    %al,(%edi)\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "ret\n"
+);
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
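+// Rather than two pmullw multiplies, the rows are interleaved byte-wise and a
+// single pmaddubsw is used: eax is packed so every 16-bit lane of xmm7 holds
+// the byte pair (128 - fraction/2, fraction/2), giving
+// row0 * (128 - fraction/2) + row1 * (fraction/2), which is then shifted
+// right by 7.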
+#define HAS_SCALEFILTERROWS_SSSE3
+extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
+                                      const uint8* src_ptr, int src_stride,
+                                      int dst_width, int source_y_fraction);
+  asm(
+    ".text\n"
+#if defined(OSX)
+    ".globl _ScaleFilterRows_SSSE3\n"
+"_ScaleFilterRows_SSSE3:\n"
+#else
+    ".global ScaleFilterRows_SSSE3\n"
+"ScaleFilterRows_SSSE3:\n"
+#endif
+    "push   %esi\n"
+    "push   %edi\n"
+    "mov    0xc(%esp),%edi\n"
+    "mov    0x10(%esp),%esi\n"
+    "mov    0x14(%esp),%edx\n"
+    "mov    0x18(%esp),%ecx\n"
+    "mov    0x1c(%esp),%eax\n"
+    "cmp    $0x0,%eax\n"
+    "je     2f\n"
+    "cmp    $0x80,%eax\n"
+    "je     3f\n"
+    "shr    %eax\n"
+    "mov    %al,%ah\n"
+    "neg    %al\n"
+    "add    $0x80,%al\n"
+    "movd   %eax,%xmm7\n"
+    "punpcklwd %xmm7,%xmm7\n"
+    "pshufd $0x0,%xmm7,%xmm7\n"
+
+"1:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%edx,1),%xmm2\n"
+    "lea    0x10(%esi),%esi\n"
+    "movdqa %xmm0,%xmm1\n"
+    "punpcklbw %xmm2,%xmm0\n"
+    "punpckhbw %xmm2,%xmm1\n"
+    "pmaddubsw %xmm7,%xmm0\n"
+    "pmaddubsw %xmm7,%xmm1\n"
+    "psrlw  $0x7,%xmm0\n"
+    "psrlw  $0x7,%xmm1\n"
+    "packuswb %xmm1,%xmm0\n"
+    "movdqa %xmm0,(%edi)\n"
+    "lea    0x10(%edi),%edi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     1b\n"
+    "mov    -0x1(%edi),%al\n"
+    "mov    %al,(%edi)\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "ret\n"
+
+"2:"
+    "movdqa (%esi),%xmm0\n"
+    "lea    0x10(%esi),%esi\n"
+    "movdqa %xmm0,(%edi)\n"
+    "lea    0x10(%edi),%edi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     2b\n"
+    "mov    -0x1(%edi),%al\n"
+    "mov    %al,(%edi)\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "ret\n"
+
+"3:"
+    "movdqa (%esi),%xmm0\n"
+    "movdqa (%esi,%edx,1),%xmm2\n"
+    "lea    0x10(%esi),%esi\n"
+    "pavgb  %xmm2,%xmm0\n"
+    "movdqa %xmm0,(%edi)\n"
+    "lea    0x10(%edi),%edi\n"
+    "sub    $0x10,%ecx\n"
+    "ja     3b\n"
+    "mov    -0x1(%edi),%al\n"
+    "mov    %al,(%edi)\n"
+    "pop    %edi\n"
+    "pop    %esi\n"
+    "ret\n"
+);
+
+#elif defined(__x86_64__)
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "lea        (%3,%3,2),%%r10\n"
+  "pxor       %%xmm7,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%3,1),%%xmm2\n"
+  "movdqa     0x10(%0,%3,1),%%xmm3\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "movdqa     (%0,%3,2),%%xmm2\n"
+  "movdqa     0x10(%0,%3,2),%%xmm3\n"
+  "movdqa     (%0,%%r10,1),%%xmm4\n"
+  "movdqa     0x10(%0,%%r10,1),%%xmm5\n"
+  "lea        (%0,%3,4),%%r11\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm4,%%xmm2\n"
+  "pavgb      %%xmm5,%%xmm3\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "movdqa     0x0(%%r11),%%xmm2\n"
+  "movdqa     0x10(%%r11),%%xmm3\n"
+  "movdqa     0x0(%%r11,%3,1),%%xmm4\n"
+  "movdqa     0x10(%%r11,%3,1),%%xmm5\n"
+  "pavgb      %%xmm4,%%xmm2\n"
+  "pavgb      %%xmm5,%%xmm3\n"
+  "movdqa     0x0(%%r11,%3,2),%%xmm4\n"
+  "movdqa     0x10(%%r11,%3,2),%%xmm5\n"
+  "movdqa     0x0(%%r11,%%r10,1),%%xmm6\n"
+  "pavgb      %%xmm6,%%xmm4\n"
+  "movdqa     0x10(%%r11,%%r10,1),%%xmm6\n"
+  "pavgb      %%xmm6,%%xmm5\n"
+  "pavgb      %%xmm4,%%xmm2\n"
+  "pavgb      %%xmm5,%%xmm3\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "psadbw     %%xmm7,%%xmm0\n"
+  "psadbw     %%xmm7,%%xmm1\n"
+  "pshufd     $0xd8,%%xmm0,%%xmm0\n"
+  "pshufd     $0x8d,%%xmm1,%%xmm1\n"
+  "por        %%xmm1,%%xmm0\n"
+  "psrlw      $0x3,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movd       %%xmm0,(%1)\n"
+  "lea        0x4(%1),%1\n"
+  "sub        $0x4,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(static_cast<intptr_t>(src_stride))   // %3
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7"
+);
+}
+
+#define HAS_SCALEROWDOWN34_SSSE3
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+                                 uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "movdqa     (%3),%%xmm3\n"
+  "movdqa     (%4),%%xmm4\n"
+  "movdqa     (%5),%%xmm5\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm2\n"
+  "lea        0x20(%0),%0\n"
+  "movdqa     %%xmm2,%%xmm1\n"
+  "palignr    $0x8,%%xmm0,%%xmm1\n"
+  "pshufb     %%xmm3,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm1\n"
+  "pshufb     %%xmm5,%%xmm2\n"
+  "movq       %%xmm0,(%1)\n"
+  "movq       %%xmm1,0x8(%1)\n"
+  "movq       %%xmm2,0x10(%1)\n"
+  "lea        0x18(%1),%1\n"
+  "sub        $0x18,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(_shuf0),   // %3
+    "r"(_shuf1),   // %4
+    "r"(_shuf2)    // %5
+  : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+);
+}
+
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "movdqa     (%4),%%xmm2\n"  // _shuf01
+  "movdqa     (%5),%%xmm3\n"  // _shuf11
+  "movdqa     (%6),%%xmm4\n"  // _shuf21
+  "movdqa     (%7),%%xmm5\n"  // _madd01
+  "movdqa     (%8),%%xmm6\n"  // _madd11
+  "movdqa     (%9),%%xmm7\n"  // _round34
+  "movdqa     (%10),%%xmm8\n"  // _madd21
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     (%0,%3),%%xmm1\n"
+  "pavgb      %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm2,%%xmm0\n"
+  "pmaddubsw  %%xmm5,%%xmm0\n"
+  "paddsw     %%xmm7,%%xmm0\n"
+  "psrlw      $0x2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "movdqu     0x8(%0),%%xmm0\n"
+  "movdqu     0x8(%0,%3),%%xmm1\n"
+  "pavgb      %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm3,%%xmm0\n"
+  "pmaddubsw  %%xmm6,%%xmm0\n"
+  "paddsw     %%xmm7,%%xmm0\n"
+  "psrlw      $0x2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,0x8(%1)\n"
+  "movdqa     0x10(%0),%%xmm0\n"
+  "movdqa     0x10(%0,%3),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm0\n"
+  "pmaddubsw  %%xmm8,%%xmm0\n"
+  "paddsw     %%xmm7,%%xmm0\n"
+  "psrlw      $0x2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,0x10(%1)\n"
+  "lea        0x18(%1),%1\n"
+  "sub        $0x18,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(static_cast<intptr_t>(src_stride)),  // %3
+    "r"(_shuf01),   // %4
+    "r"(_shuf11),   // %5
+    "r"(_shuf21),   // %6
+    "r"(_madd01),   // %7
+    "r"(_madd11),   // %8
+    "r"(_round34),  // %9
+    "r"(_madd21)    // %10
+  : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
+);
+}
+
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "movdqa     (%4),%%xmm2\n"  // _shuf01
+  "movdqa     (%5),%%xmm3\n"  // _shuf11
+  "movdqa     (%6),%%xmm4\n"  // _shuf21
+  "movdqa     (%7),%%xmm5\n"  // _madd01
+  "movdqa     (%8),%%xmm6\n"  // _madd11
+  "movdqa     (%9),%%xmm7\n"  // _round34
+  "movdqa     (%10),%%xmm8\n"  // _madd21
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     (%0,%3,1),%%xmm1\n"
+  "pavgb      %%xmm0,%%xmm1\n"
+  "pavgb      %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm2,%%xmm0\n"
+  "pmaddubsw  %%xmm5,%%xmm0\n"
+  "paddsw     %%xmm7,%%xmm0\n"
+  "psrlw      $0x2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "movdqu     0x8(%0),%%xmm0\n"
+  "movdqu     0x8(%0,%3,1),%%xmm1\n"
+  "pavgb      %%xmm0,%%xmm1\n"
+  "pavgb      %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm3,%%xmm0\n"
+  "pmaddubsw  %%xmm6,%%xmm0\n"
+  "paddsw     %%xmm7,%%xmm0\n"
+  "psrlw      $0x2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,0x8(%1)\n"
+  "movdqa     0x10(%0),%%xmm0\n"
+  "movdqa     0x10(%0,%3,1),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm0,%%xmm1\n"
+  "pavgb      %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm0\n"
+  "pmaddubsw  %%xmm8,%%xmm0\n"
+  "paddsw     %%xmm7,%%xmm0\n"
+  "psrlw      $0x2,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,0x10(%1)\n"
+  "lea        0x18(%1),%1\n"
+  "sub        $0x18,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(static_cast<intptr_t>(src_stride)),  // %3
+    "r"(_shuf01),   // %4
+    "r"(_shuf11),   // %5
+    "r"(_shuf21),   // %6
+    "r"(_madd01),   // %7
+    "r"(_madd11),   // %8
+    "r"(_round34),  // %9
+    "r"(_madd21)    // %10
+  : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
+);
+}
+
+#define HAS_SCALEROWDOWN38_SSSE3
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+                                 uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "movdqa     (%3),%%xmm5\n"
+  "movdqa     (%4),%%xmm6\n"
+  "pxor       %%xmm7,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pshufb     %%xmm5,%%xmm0\n"
+  "pshufb     %%xmm6,%%xmm1\n"
+  "paddusb    %%xmm1,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "movhlps    %%xmm0,%%xmm1\n"
+  "movd       %%xmm1,0x8(%1)\n"
+  "lea        0xc(%1),%1\n"
+  "sub        $0xc,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(_shuf38a),  // %3
+    "r"(_shuf38b)   // %4
+  : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
+);
+}
+
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "movdqa     (%4),%%xmm4\n"
+  "movdqa     (%5),%%xmm5\n"
+  "movdqa     (%6),%%xmm6\n"
+  "pxor       %%xmm7,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     (%0,%3,1),%%xmm2\n"
+  "movhlps    %%xmm0,%%xmm1\n"
+  "movhlps    %%xmm2,%%xmm3\n"
+  "punpcklbw  %%xmm7,%%xmm0\n"
+  "punpcklbw  %%xmm7,%%xmm1\n"
+  "punpcklbw  %%xmm7,%%xmm2\n"
+  "punpcklbw  %%xmm7,%%xmm3\n"
+  "paddusw    %%xmm2,%%xmm0\n"
+  "paddusw    %%xmm3,%%xmm1\n"
+  "movdqa     (%0,%3,2),%%xmm2\n"
+  "lea        0x10(%0),%0\n"
+  "movhlps    %%xmm2,%%xmm3\n"
+  "punpcklbw  %%xmm7,%%xmm2\n"
+  "punpcklbw  %%xmm7,%%xmm3\n"
+  "paddusw    %%xmm2,%%xmm0\n"
+  "paddusw    %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "psrldq     $0x2,%%xmm0\n"
+  "paddusw    %%xmm0,%%xmm2\n"
+  "psrldq     $0x2,%%xmm0\n"
+  "paddusw    %%xmm0,%%xmm2\n"
+  "pshufb     %%xmm4,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "psrldq     $0x2,%%xmm1\n"
+  "paddusw    %%xmm1,%%xmm3\n"
+  "psrldq     $0x2,%%xmm1\n"
+  "paddusw    %%xmm1,%%xmm3\n"
+  "pshufb     %%xmm5,%%xmm3\n"
+  "paddusw    %%xmm3,%%xmm2\n"
+  "pmulhuw    %%xmm6,%%xmm2\n"
+  "packuswb   %%xmm2,%%xmm2\n"
+  "movd       %%xmm2,(%1)\n"
+  "pextrw     $0x2,%%xmm2,%%eax\n"
+  "mov        %%ax,0x4(%1)\n"
+  "lea        0x6(%1),%1\n"
+  "sub        $0x6,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(static_cast<intptr_t>(src_stride)),  // %3
+    "r"(_shufac0),   // %4
+    "r"(_shufac3),   // %5
+    "r"(_scaleac3)   // %6
+  : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7"
+);
+}
+
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* dst_ptr, int dst_width) {
+  asm volatile(
+  "movdqa     (%4),%%xmm4\n"
+  "movdqa     (%5),%%xmm5\n"
+  "movdqa     (%6),%%xmm6\n"
+  "movdqa     (%7),%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm2\n"
+  "pavgb      (%0,%3,1),%%xmm2\n"
+  "lea        0x10(%0),%0\n"
+  "movdqa     %%xmm2,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm0\n"
+  "movdqa     %%xmm2,%%xmm1\n"
+  "pshufb     %%xmm5,%%xmm1\n"
+  "paddusw    %%xmm1,%%xmm0\n"
+  "pshufb     %%xmm6,%%xmm2\n"
+  "paddusw    %%xmm2,%%xmm0\n"
+  "pmulhuw    %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movd       %%xmm0,(%1)\n"
+  "pextrw     $0x2,%%xmm0,%%eax\n"
+  "mov        %%ax,0x4(%1)\n"
+  "lea        0x6(%1),%1\n"
+  "sub        $0x6,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"(static_cast<intptr_t>(src_stride)),  // %3
+    "r"(_shufab0),   // %4
+    "r"(_shufab1),   // %5
+    "r"(_shufab2),   // %6
+    "r"(_scaleab2)   // %7
+  : "memory", "rax", "xmm0", "xmm1", "xmm2",
+    "xmm4", "xmm5", "xmm6", "xmm7"
+);
+}
+
+#define HAS_SCALEADDROWS_SSE2
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                              uint16* dst_ptr, int src_width,
+                              int src_height) {
+  asm volatile(
+  "pxor       %%xmm7,%%xmm7\n"
+"1:"
+  "movdqa     (%0),%%xmm2\n"
+  "lea        (%0,%4,1),%%r10\n"
+  "movhlps    %%xmm2,%%xmm3\n"
+  "lea        -0x1(%3),%%r11\n"
+  "punpcklbw  %%xmm7,%%xmm2\n"
+  "punpcklbw  %%xmm7,%%xmm3\n"
+
+"2:"
+  "movdqa     (%%r10),%%xmm0\n"
+  "lea        (%%r10,%4,1),%%r10\n"
+  "movhlps    %%xmm0,%%xmm1\n"
+  "punpcklbw  %%xmm7,%%xmm0\n"
+  "punpcklbw  %%xmm7,%%xmm1\n"
+  "paddusw    %%xmm0,%%xmm2\n"
+  "paddusw    %%xmm1,%%xmm3\n"
+  "sub        $0x1,%%r11\n"
+  "ja         2b\n"
+
+  "movdqa     %%xmm2,(%1)\n"
+  "movdqa     %%xmm3,0x10(%1)\n"
+  "lea        0x20(%1),%1\n"
+  "lea        0x10(%0),%0\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(src_width),   // %2
+    "+r"(src_height)   // %3
+  : "r"(static_cast<intptr_t>(src_stride))  // %4
+  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
+);
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
+#define HAS_SCALEFILTERROWS_SSE2
+static void ScaleFilterRows_SSE2(uint8* dst_ptr,
+                                 const uint8* src_ptr, int src_stride,
+                                 int dst_width, int source_y_fraction) {
+  if (source_y_fraction == 0) {
+    asm volatile(
+    "1:"
+      "movdqa     (%1),%%xmm0\n"
+      "lea        0x10(%1),%1\n"
+      "movdqa     %%xmm0,(%0)\n"
+      "lea        0x10(%0),%0\n"
+      "sub        $0x10,%2\n"
+      "ja         1b\n"
+      "mov        -0x1(%0),%%al\n"
+      "mov        %%al,(%0)\n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_width)    // %2
+      :
+      : "memory", "rax", "xmm0"
+    );
+    return;
+  } else if (source_y_fraction == 128) {
+    asm volatile(
+    "1:"
+      "movdqa     (%1),%%xmm0\n"
+      "movdqa     (%1,%3,1),%%xmm2\n"
+      "lea        0x10(%1),%1\n"
+      "pavgb      %%xmm2,%%xmm0\n"
+      "movdqa     %%xmm0,(%0)\n"
+      "lea        0x10(%0),%0\n"
+      "sub        $0x10,%2\n"
+      "ja         1b\n"
+      "mov        -0x1(%0),%%al\n"
+      "mov        %%al,(%0)\n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_width)    // %2
+      : "r"(static_cast<intptr_t>(src_stride))  // %3
+      : "memory", "rax", "xmm0", "xmm2"
+    );
+    return;
+  } else {
+    asm volatile(
+      "mov        %3,%%eax\n"
+      "movd       %%eax,%%xmm6\n"
+      "punpcklwd  %%xmm6,%%xmm6\n"
+      "pshufd     $0x0,%%xmm6,%%xmm6\n"
+      "neg        %%eax\n"
+      "add        $0x100,%%eax\n"
+      "movd       %%eax,%%xmm5\n"
+      "punpcklwd  %%xmm5,%%xmm5\n"
+      "pshufd     $0x0,%%xmm5,%%xmm5\n"
+      "pxor       %%xmm7,%%xmm7\n"
+    "1:"
+      "movdqa     (%1),%%xmm0\n"
+      "movdqa     (%1,%4,1),%%xmm2\n"
+      "lea        0x10(%1),%1\n"
+      "movdqa     %%xmm0,%%xmm1\n"
+      "movdqa     %%xmm2,%%xmm3\n"
+      "punpcklbw  %%xmm7,%%xmm0\n"
+      "punpcklbw  %%xmm7,%%xmm2\n"
+      "punpckhbw  %%xmm7,%%xmm1\n"
+      "punpckhbw  %%xmm7,%%xmm3\n"
+      "pmullw     %%xmm5,%%xmm0\n"
+      "pmullw     %%xmm5,%%xmm1\n"
+      "pmullw     %%xmm6,%%xmm2\n"
+      "pmullw     %%xmm6,%%xmm3\n"
+      "paddusw    %%xmm2,%%xmm0\n"
+      "paddusw    %%xmm3,%%xmm1\n"
+      "psrlw      $0x8,%%xmm0\n"
+      "psrlw      $0x8,%%xmm1\n"
+      "packuswb   %%xmm1,%%xmm0\n"
+      "movdqa     %%xmm0,(%0)\n"
+      "lea        0x10(%0),%0\n"
+      "sub        $0x10,%2\n"
+      "ja         1b\n"
+      "mov        -0x1(%0),%%al\n"
+      "mov        %%al,(%0)\n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_width),   // %2
+        "+r"(source_y_fraction)  // %3
+      : "r"(static_cast<intptr_t>(src_stride))  // %4
+      : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
+        "xmm5", "xmm6", "xmm7"
+    );
+  }
+  return;
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
+#define HAS_SCALEFILTERROWS_SSSE3
+static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
+                                  const uint8* src_ptr, int src_stride,
+                                  int dst_width, int source_y_fraction) {
+  if (source_y_fraction == 0) {
+    asm volatile(
+   "1:"
+      "movdqa     (%1),%%xmm0\n"
+      "lea        0x10(%1),%1\n"
+      "movdqa     %%xmm0,(%0)\n"
+      "lea        0x10(%0),%0\n"
+      "sub        $0x10,%2\n"
+      "ja         1b\n"
+      "mov        -0x1(%0),%%al\n"
+      "mov        %%al,(%0)\n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_width)    // %2
+      :
+      : "memory", "rax", "xmm0"
+    );
+    return;
+  } else if (source_y_fraction == 128) {
+    asm volatile(
+    "1:"
+      "movdqa     (%1),%%xmm0\n"
+      "movdqa     (%1,%3,1),%%xmm2\n"
+      "lea        0x10(%1),%1\n"
+      "pavgb      %%xmm2,%%xmm0\n"
+      "movdqa     %%xmm0,(%0)\n"
+      "lea        0x10(%0),%0\n"
+      "sub        $0x10,%2\n"
+      "ja         1b\n"
+      "mov        -0x1(%0),%%al\n"
+      "mov        %%al,(%0)\n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_width)    // %2
+      : "r"(static_cast<intptr_t>(src_stride))  // %3
+     : "memory", "rax", "xmm0", "xmm2"
+    );
+    return;
+  } else {
+    asm volatile(
+      "mov        %3,%%eax\n"
+      "shr        %%eax\n"
+      "mov        %%al,%%ah\n"
+      "neg        %%al\n"
+      "add        $0x80,%%al\n"
+      "movd       %%eax,%%xmm7\n"
+      "punpcklwd  %%xmm7,%%xmm7\n"
+      "pshufd     $0x0,%%xmm7,%%xmm7\n"
+    "1:"
+      "movdqa     (%1),%%xmm0\n"
+      "movdqa     (%1,%4,1),%%xmm2\n"
+      "lea        0x10(%1),%1\n"
+      "movdqa     %%xmm0,%%xmm1\n"
+      "punpcklbw  %%xmm2,%%xmm0\n"
+      "punpckhbw  %%xmm2,%%xmm1\n"
+      "pmaddubsw  %%xmm7,%%xmm0\n"
+      "pmaddubsw  %%xmm7,%%xmm1\n"
+      "psrlw      $0x7,%%xmm0\n"
+      "psrlw      $0x7,%%xmm1\n"
+      "packuswb   %%xmm1,%%xmm0\n"
+      "movdqa     %%xmm0,(%0)\n"
+      "lea        0x10(%0),%0\n"
+      "sub        $0x10,%2\n"
+      "ja         1b\n"
+      "mov        -0x1(%0),%%al\n"
+      "mov        %%al,(%0)\n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_width),   // %2
+        "+r"(source_y_fraction)  // %3
+      : "r"(static_cast<intptr_t>(src_stride))  // %4
+      : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
+    );
+  }
+  return;
+}
+#endif
+#endif
+
+// CPU-agnostic row functions
+static void ScaleRowDown2_C(const uint8* src_ptr, int,
+                            uint8* dst, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    *dst++ = *src_ptr;
+    src_ptr += 2;
+  }
+}
+
+static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
+                               uint8* dst, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    *dst++ = (src_ptr[0] + src_ptr[1] +
+              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
+    src_ptr += 2;
+  }
+}
+
+static void ScaleRowDown4_C(const uint8* src_ptr, int,
+                            uint8* dst, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    *dst++ = *src_ptr;
+    src_ptr += 4;
+  }
+}
+
+static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
+                               uint8* dst, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
+              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
+              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
+              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
+              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
+              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
+              8) >> 4;
+    src_ptr += 4;
+  }
+}
+
+// 640 output pixels is enough to allow 5120 input pixels with a 1/8 scale down.
+// Keeping the total buffer under 4096 bytes avoids a stack check, saving 4% CPU.
+static const int kMaxOutputWidth = 640;
+static const int kMaxRow12 = kMaxOutputWidth * 2;
+
+static void ScaleRowDown8_C(const uint8* src_ptr, int,
+                            uint8* dst, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    *dst++ = *src_ptr;
+    src_ptr += 8;
+  }
+}
+
+// Note: the calling code checks that dst_width is no larger than
+// kMaxOutputWidth and uses ScaleRowDown8_C otherwise.
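+// The 1/8 box average is composed from the smaller kernels: two
+// ScaleRowDown4Int_C passes reduce the upper and lower 4x4 blocks into a pair
+// of temporary rows, and ScaleRowDown2Int_C then averages those, which (up to
+// intermediate rounding) is an 8x8 box average.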
+static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
+                               uint8* dst, int dst_width) {
+  ALIGN16(uint8 src_row[kMaxRow12 * 2]);
+  assert(dst_width <= kMaxOutputWidth);
+  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
+  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
+                     src_row + kMaxOutputWidth,
+                     dst_width * 2);
+  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
+}
+
+static void ScaleRowDown34_C(const uint8* src_ptr, int,
+                             uint8* dst, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = dst + dst_width;
+  do {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+    dst += 3;
+    src_ptr += 4;
+  } while (dst < dend);
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* d, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = d + dst_width;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  do {
+    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 * 3 + b0 + 2) >> 2;
+    d[1] = (a1 * 3 + b1 + 2) >> 2;
+    d[2] = (a2 * 3 + b2 + 2) >> 2;
+    d += 3;
+    s += 4;
+    t += 4;
+  } while (d < dend);
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* d, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = d + dst_width;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  do {
+    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 + b0 + 1) >> 1;
+    d[1] = (a1 + b1 + 1) >> 1;
+    d[2] = (a2 + b2 + 1) >> 1;
+    d += 3;
+    s += 4;
+    t += 4;
+  } while (d < dend);
+}
+
+#if defined(HAS_SCALEFILTERROWS_SSE2)
+// Filter a row of pixels to 3/4 of its width
+static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
+                                int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = dst_ptr + dst_width;
+  const uint8* s = src_ptr;
+  do {
+    dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    dst_ptr += 3;
+    s += 4;
+  } while (dst_ptr < dend);
+}
+#endif
+
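+// Horizontal bilinear filter: x steps through the source row in 16.16 fixed
+// point, and each output pixel blends src_ptr[xi] and src_ptr[xi + 1] with
+// weights (65536 - xf) and xf.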
+static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int dx) {
+  int x = 0;
+  for (int j = 0; j < dst_width; ++j) {
+    int xi = x >> 16;
+    int xf1 = x & 0xffff;
+    int xf0 = 65536 - xf1;
+
+    *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
+    x += dx;
+  }
+}
+
+static const int kMaxInputWidth = 2560;
+#if defined(HAS_SCALEFILTERROWS_SSE2)
+#define HAS_SCALEROWDOWN34_SSE2
+// Filter rows 0 and 1 together, 3 : 1
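+// A source_y_fraction of 256/4 makes ScaleFilterRows_SSE2 blend the rows as
+// (3 * row0 + row1) / 4; the blended row is then narrowed to 3/4 width by
+// ScaleFilterCols34_C.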
+static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  ALIGN16(uint8 row[kMaxInputWidth]);
+  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
+                       256 / 4);
+  ScaleFilterCols34_C(dst_ptr, row, dst_width);
+}
+
+// Filter rows 1 and 2 together, 1 : 1
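+// A source_y_fraction of 256/2 gives an equal 1:1 blend of the two rows
+// before the same 3/4-width column filter is applied.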
+static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  ALIGN16(uint8 row[kMaxInputWidth]);
+  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
+  ScaleFilterCols34_C(dst_ptr, row, dst_width);
+}
+#endif
+
+static void ScaleRowDown38_C(const uint8* src_ptr, int,
+                             uint8* dst, int dst_width) {
+  assert(dst_width % 3 == 0);
+  for (int x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+    dst += 3;
+    src_ptr += 8;
+  }
+}
+
+// 8x3 -> 3x1
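+// The column sums are divided by their pixel count using a fixed-point
+// reciprocal: multiplying by 65536 / 9 (or 65536 / 6 for the narrower last
+// column) and shifting right by 16.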
+static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* dst_ptr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (int i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
+        src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
+        src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
+        (65536 / 9) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
+        src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
+        src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
+        (65536 / 9) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
+        src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
+        (65536 / 6) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+// 8x2 -> 3x1
+static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* dst_ptr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (int i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
+        src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
+        src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
+        (65536 / 4) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+// Bilinear row filtering combines 8x2 -> 8x1. C version
+static void ScaleFilterRows_C(uint8* dst_ptr,
+                              const uint8* src_ptr, int src_stride,
+                              int dst_width, int source_y_fraction) {
+  assert(dst_width > 0);
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  uint8* end = dst_ptr + dst_width;
+  do {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+    src_ptr += 8;
+    src_ptr1 += 8;
+    dst_ptr += 8;
+  } while (dst_ptr < end);
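+  // Duplicate the last pixel one past the end of the row, presumably so a
+  // following horizontal filter can safely read one pixel beyond dst_width.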
+  dst_ptr[0] = dst_ptr[-1];
+}
+
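+// Sums src_height rows of src_width pixels into 16-bit column totals; used by
+// the box filter in ScalePlaneBox below.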
+void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  assert(src_width > 0);
+  assert(src_height > 0);
+  for (int x = 0; x < src_width; ++x) {
+    const uint8* s = src_ptr + x;
+    int sum = 0;
+    for (int y = 0; y < src_height; ++y) {
+      sum += s[0];
+      s += src_stride;
+    }
+    dst_ptr[x] = sum;
+  }
+}
+
+/**
+ * Scale plane, 1/2
+ *
+ * This is an optimized version for scaling down a plane to 1/2 of
+ * its original size.
+ *
+ */
+static void ScalePlaneDown2(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            FilterMode filtering) {
+  assert(src_width % 2 == 0);
+  assert(src_height % 2 == 0);
+  void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
+                        uint8* dst_ptr, int dst_width);
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
+      (dst_stride % 16 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
+  } else
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
+      IS_ALIGNED(dst_ptr, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+  } else
+#endif
+  {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
+  }
+
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += (src_stride << 1);
+    dst_ptr += dst_stride;
+  }
+}
+
+/**
+ * Scale plane, 1/4
+ *
+ * This is an optimized version for scaling down a plane to 1/4 of
+ * its original size.
+ */
+static void ScalePlaneDown4(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            FilterMode filtering) {
+  assert(src_width % 4 == 0);
+  assert(src_height % 4 == 0);
+  void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
+                        uint8* dst_ptr, int dst_width);
+
+#if defined(HAS_SCALEROWDOWN4_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
+  } else
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
+      (dst_stride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
+  } else
+#endif
+  {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
+  }
+
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += (src_stride << 2);
+    dst_ptr += dst_stride;
+  }
+}
+
+/**
+ * Scale plane, 1/8
+ *
+ * This is an optimized version for scaling down a plane to 1/8
+ * of its original size.
+ *
+ */
+static void ScalePlaneDown8(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            FilterMode filtering) {
+  assert(src_width % 8 == 0);
+  assert(src_height % 8 == 0);
+  void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
+                        uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN8_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
+      (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
+    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
+  } else
+#endif
+  {
+    ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
+        ScaleRowDown8Int_C : ScaleRowDown8_C;
+  }
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += (src_stride << 3);
+    dst_ptr += dst_stride;
+  }
+}
+
+/**
+ * Scale plane down, 3/4
+ *
+ * Provided by Frank Barchard (fbarchard@google.com)
+ *
+ */
+static void ScalePlaneDown34(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             FilterMode filtering) {
+  assert(dst_width % 3 == 0);
+  void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
+                           uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
+      (dst_stride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
+    }
+  } else
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
+      (dst_stride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
+      filtering) {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
+  } else
+#endif
+  {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_C;
+      ScaleRowDown34_1 = ScaleRowDown34_C;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+    }
+  }
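+  // Vertical 3/4 pattern: rows 0 and 1 are filtered 3:1, rows 1 and 2 are
+  // filtered 1:1, and rows 3 and 2 are filtered 3:1 by passing a negative
+  // stride, so every 4 source rows yield 3 output rows.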
+  int src_row = 0;
+  for (int y = 0; y < dst_height; ++y) {
+    switch (src_row) {
+      case 0:
+        ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+        break;
+
+      case 1:
+        ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+        break;
+
+      case 2:
+        ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+                         dst_ptr, dst_width);
+        break;
+    }
+    ++src_row;
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    if (src_row >= 3) {
+      src_ptr += src_stride;
+      src_row = 0;
+    }
+  }
+}
+
+/**
+ * Scale plane, 3/8
+ *
+ * This is an optimized version for scaling down a plane to 3/8
+ * of its original size.
+ *
+ * Reduces 16x3 to 6x1
+ */
+static void ScalePlaneDown38(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             FilterMode filtering) {
+  assert(dst_width % 3 == 0);
+  void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
+                           uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
+      (dst_stride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
+    }
+  } else
+#endif
+  {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_C;
+      ScaleRowDown38_2 = ScaleRowDown38_C;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
+    }
+  }
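+  // Vertical 3/8 pattern: two output rows each average 3 source rows and the
+  // third averages the remaining 2, so every 8 source rows yield 3 output
+  // rows.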
+  int src_row = 0;
+  for (int y = 0; y < dst_height; ++y) {
+    switch (src_row) {
+      case 0:
+      case 1:
+        ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+        src_ptr += src_stride * 3;
+        ++src_row;
+        break;
+
+      case 2:
+        ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+        src_ptr += src_stride * 2;
+        src_row = 0;
+        break;
+    }
+    dst_ptr += dst_stride;
+  }
+}
+
+inline static uint32 SumBox(int iboxwidth, int iboxheight,
+                            int src_stride, const uint8* src_ptr) {
+  assert(iboxwidth > 0);
+  assert(iboxheight > 0);
+  uint32 sum = 0u;
+  for (int y = 0; y < iboxheight; ++y) {
+    for (int x = 0; x < iboxwidth; ++x) {
+      sum += src_ptr[x];
+    }
+    src_ptr += src_stride;
+  }
+  return sum;
+}
+
+static void ScalePlaneBoxRow(int dst_width, int boxheight,
+                             int dx, int src_stride,
+                             const uint8* src_ptr, uint8* dst_ptr) {
+  int x = 0;
+  for (int i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    int boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
+        (boxwidth * boxheight);
+  }
+}
+
+inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+  assert(iboxwidth > 0);
+  uint32 sum = 0u;
+  for (int x = 0; x < iboxwidth; ++x) {
+    sum += src_ptr[x];
+  }
+  return sum;
+}
+
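+// Box widths alternate between (dx >> 16) and (dx >> 16) + 1 source pixels,
+// so scaletbl holds the reciprocal for each width and scaleptr[boxwidth]
+// selects the right one per output pixel.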
+static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int scaletbl[2];
+  int minboxwidth = (dx >> 16);
+  scaletbl[0] = 65536 / (minboxwidth * boxheight);
+  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+  int *scaleptr = scaletbl - minboxwidth;
+  int x = 0;
+  for (int i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    int boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int boxwidth = (dx >> 16);
+  int scaleval = 65536 / (boxwidth * boxheight);
+  int x = 0;
+  for (int i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+/**
+ * Scale plane down to any dimensions, with interpolation
+ * (box filter).
+ *
+ * Same method as SimpleScale, which is fixed point, outputting
+ * one pixel of destination using fixed point (16.16) to step
+ * through source, sampling a box of pixels with simple
+ * averaging.
+ */
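+// For example, scaling a 640-pixel row down to 480 pixels gives
+// dx = (640 << 16) / 480 = 0x15555, so the 16.16 accumulator produces box
+// widths of 1 or 2 source pixels per output pixel. When the source width is
+// not a multiple of 16, is wider than kMaxInputWidth, or the plane is shrunk
+// vertically by less than 2x, the per-pixel SumBox path below is used;
+// otherwise rows are pre-summed with ScaleAddRows and averaged per column
+// with ScaleAddCols.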
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  int dy = (src_height << 16) / dst_height;
+  int dx = (src_width << 16) / dst_width;
+  if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
+      dst_height * 2 > src_height) {
+    uint8* dst = dst_ptr;
+    int y = 0;
+    for (int j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      const uint8* const src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      int boxheight = (y >> 16) - iy;
+      ScalePlaneBoxRow(dst_width, boxheight,
+                       dx, src_stride,
+                       src, dst);
+
+      dst += dst_stride;
+    }
+  } else {
+    ALIGN16(uint16 row[kMaxInputWidth]);
+    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
+                         uint16* dst_ptr, int src_width, int src_height);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+                         const uint16* src_ptr, uint8* dst_ptr);
+#if defined(HAS_SCALEADDROWS_SSE2)
+    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
+        (src_width % 16) == 0) {
+      ScaleAddRows = ScaleAddRows_SSE2;
+    } else
+#endif
+    {
+      ScaleAddRows = ScaleAddRows_C;
+    }
+    if (dx & 0xffff) {
+      ScaleAddCols = ScaleAddCols2_C;
+    } else {
+      ScaleAddCols = ScaleAddCols1_C;
+    }
+
+    int y = 0;
+    for (int j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      const uint8* const src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      int boxheight = (y >> 16) - iy;
+      ScaleAddRows(src, src_stride, row, src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+      dst_ptr += dst_stride;
+    }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, with interpolation.
+ */
+static void ScalePlaneBilinearSimple(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride, int dst_stride,
+                                     const uint8* src_ptr, uint8* dst_ptr) {
+  uint8* dst = dst_ptr;
+  int dx = (src_width << 16) / dst_width;
+  int dy = (src_height << 16) / dst_height;
+  int maxx = ((src_width - 1) << 16) - 1;
+  int maxy = ((src_height - 1) << 16) - 1;
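+  // 32768 is 0.5 in 16.16 fixed point; the starting offsets below roughly
+  // center the sample positions within the source.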
+  int y = (dst_height < src_height) ? 32768 :
+      (src_height << 16) / dst_height - 32768;
+  for (int i = 0; i < dst_height; ++i) {
+    int cy = (y < 0) ? 0 : y;
+    int yi = cy >> 16;
+    int yf = cy & 0xffff;
+    const uint8* const src = src_ptr + yi * src_stride;
+    int x = (dst_width < src_width) ? 32768 :
+        (src_width << 16) / dst_width - 32768;
+    for (int j = 0; j < dst_width; ++j) {
+      int cx = (x < 0) ? 0 : x;
+      int xi = cx >> 16;
+      int xf = cx & 0xffff;
+      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
+      int r1 = (src[xi + src_stride] * (65536 - xf) +
+          src[xi + src_stride + 1] * xf) >> 16;
+      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
+      x += dx;
+      if (x > maxx)
+        x = maxx;
+    }
+    dst += dst_stride - dst_width;
+    y += dy;
+    if (y > maxy)
+      y = maxy;
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, with bilinear
+ * interpolation.
+ */
+static void ScalePlaneBilinear(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint8* src_ptr, uint8* dst_ptr) {
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  int dy = (src_height << 16) / dst_height;
+  int dx = (src_width << 16) / dst_width;
+  if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
+    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
+                             src_stride, dst_stride, src_ptr, dst_ptr);
+
+  } else {
+    ALIGN16(uint8 row[kMaxInputWidth + 1]);
+    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
+                            int src_stride,
+                            int dst_width, int source_y_fraction);
+    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+                            int dst_width, int dx);
+#if defined(HAS_SCALEFILTERROWS_SSSE3)
+    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
+        (src_width % 16) == 0) {
+      ScaleFilterRows = ScaleFilterRows_SSSE3;
+    } else
+#endif
+#if defined(HAS_SCALEFILTERROWS_SSE2)
+    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
+        (src_width % 16) == 0) {
+      ScaleFilterRows = ScaleFilterRows_SSE2;
+    } else
+#endif
+    {
+      ScaleFilterRows = ScaleFilterRows_C;
+    }
+    ScaleFilterCols = ScaleFilterCols_C;
+
+    int y = 0;
+    int maxy = ((src_height - 1) << 16) - 1;  // Clamp so the filter reads at most the last two source rows.
+    for (int j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      int fy = (y >> 8) & 255;
+      const uint8* const src = src_ptr + iy * src_stride;
+      ScaleFilterRows(row, src, src_stride, src_width, fy);
+      ScaleFilterCols(dst_ptr, row, dst_width, dx);
+      dst_ptr += dst_stride;
+      y += dy;
+      if (y > maxy) {
+        y = maxy;
+      }
+    }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, without interpolation.
+ * Fixed point math is used for performance: the upper 16 bits
+ * of x and dx are the integer part of the source position and
+ * the lower 16 bits are the fractional part.
+ */
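+// For example, scaling 640 source columns to 480 gives dx = 87381
+// (~1.33 in 16.16), so x >> 16 visits source columns 0, 1, 2, 3, 5, 6, 7,
+// 9, ... skipping roughly every fourth column.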
+static void ScalePlaneSimple(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr) {
+  uint8* dst = dst_ptr;
+  int dx = (src_width << 16) / dst_width;
+  for (int y = 0; y < dst_height; ++y) {
+    const uint8* const src = src_ptr + (y * src_height / dst_height) *
+        src_stride;
+    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
+    int x = 0;
+    for (int i = 0; i < dst_width; ++i) {
+      *dst++ = src[x >> 16];
+      x += dx;
+    }
+    dst += dst_stride - dst_width;
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions.
+ */
+static void ScalePlaneAnySize(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_ptr, uint8* dst_ptr,
+                              FilterMode filtering) {
+  if (!filtering) {
+    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src_ptr, dst_ptr);
+  } else {
+    // fall back to non-optimized version
+    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src_ptr, dst_ptr);
+  }
+}
+
+/**
+ * Scale plane down, any size
+ *
+ * This is an optimized version for scaling down a plane to any size.
+ * The current implementation is ~10 times faster than the
+ * reference implementation for, e.g., XGA->LowResPAL.
+ *
+ */
+static void ScalePlaneDown(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_ptr, uint8* dst_ptr,
+                           FilterMode filtering) {
+  if (!filtering) {
+    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src_ptr, dst_ptr);
+  } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
+    // between 1/2x and 1x use bilinear
+    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src_ptr, dst_ptr);
+  } else {
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+                  src_stride, dst_stride, src_ptr, dst_ptr);
+  }
+}
+
+/**
+ * Copy plane, no scaling
+ *
+ * This simply copies the given plane without scaling.
+ * The current implementation is ~115 times faster
+ * than the reference implementation.
+ *
+ */
+static void CopyPlane(int src_width, int src_height,
+                      int dst_width, int dst_height,
+                      int src_stride, int dst_stride,
+                      const uint8* src_ptr, uint8* dst_ptr) {
+  if (src_stride == src_width && dst_stride == dst_width) {
+    // All contiguous, so can use REALLY fast path.
+    memcpy(dst_ptr, src_ptr, src_width * src_height);
+  } else {
+    // Not all contiguous; must copy scanlines individually
+    const uint8* src = src_ptr;
+    uint8* dst = dst_ptr;
+    for (int i = 0; i < src_height; ++i) {
+      memcpy(dst, src, src_width);
+      dst += dst_stride;
+      src += src_stride;
+    }
+  }
+}
+
+static void ScalePlane(const uint8* src, int src_stride,
+                       int src_width, int src_height,
+                       uint8* dst, int dst_stride,
+                       int dst_width, int dst_height,
+                       FilterMode filtering, bool use_ref) {
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2()
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
+              dst_stride, src, dst);
+  } else if (dst_width <= src_width && dst_height <= src_height) {
+    // Scale down.
+    if (use_ref) {
+      // For testing, allow the optimized versions to be disabled.
+      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src, dst, filtering);
+    } else if (4 * dst_width == 3 * src_width &&
+               4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+    // 3/8 rounded up for odd sized chroma height.
+    } else if (8 * dst_width == 3 * src_width &&
+               dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+      // optimized, 1/4
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+      // optimized, 1/8
+      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+    } else {
+      // Arbitrary downsample
+      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src, dst, filtering);
+    }
+  } else {
+    // Arbitrary scale up and/or down.
+    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+  }
+}
+
+/**
+ * Scale an I420 image to the specified dimensions.
+ *
+ * Each plane is scaled by a function suitable for
+ * the requested resolutions and filter mode.
+ */
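+// Illustrative call (pointer names are hypothetical): scaling a 640x360
+// I420 frame with tightly packed planes down to 320x180 with box filtering:
+//   I420Scale(src_y, 640, src_u, 320, src_v, 320, 640, 360,
+//             dst_y, 320, dst_u, 160, dst_v, 160, 320, 180, kFilterBox);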
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              FilterMode filtering) {
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    int halfheight = (src_height + 1) >> 1;
+    src_y = src_y + (src_height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  int halfsrc_width = (src_width + 1) >> 1;
+  int halfsrc_height = (src_height + 1) >> 1;
+  int halfdst_width = (dst_width + 1) >> 1;
+  int halfoheight = (dst_height + 1) >> 1;
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering, use_reference_impl_);
+  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
+             dst_u, dst_stride_u, halfdst_width, halfoheight,
+             filtering, use_reference_impl_);
+  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
+             dst_v, dst_stride_v, halfdst_width, halfoheight,
+             filtering, use_reference_impl_);
+  return 0;
+}
+
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          bool interpolate) {
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    int halfheight = (src_height + 1) >> 1;
+    src_y = src_y + (src_height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  int halfsrc_width = (src_width + 1) >> 1;
+  int halfsrc_height = (src_height + 1) >> 1;
+  int halfdst_width = (dst_width + 1) >> 1;
+  int halfoheight = (dst_height + 1) >> 1;
+  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering, use_reference_impl_);
+  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
+             dst_u, dst_stride_u, halfdst_width, halfoheight,
+             filtering, use_reference_impl_);
+  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
+             dst_v, dst_stride_v, halfdst_width, halfoheight,
+             filtering, use_reference_impl_);
+  return 0;
+}
+
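+// Scales a contiguous I420 buffer (a Y plane followed by U and V half
+// planes) into another contiguous I420 buffer. A non-zero ooffset shifts
+// the output down by ooffset rows and scales into a window that is
+// dst_height - 2 * ooffset rows tall.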
+int Scale(const uint8* src, int src_width, int src_height,
+          uint8* dst, int dst_width, int dst_height, int ooffset,
+          bool interpolate) {
+  if (!src || src_width <= 0 || src_height <= 0 ||
+      !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
+      ooffset >= dst_height) {
+    return -1;
+  }
+  ooffset = ooffset & ~1;  // Chroma requires the offset to be a multiple of 2.
+  int halfsrc_width = (src_width + 1) >> 1;
+  int halfsrc_height = (src_height + 1) >> 1;
+  int halfdst_width = (dst_width + 1) >> 1;
+  int halfoheight = (dst_height + 1) >> 1;
+  int aheight = dst_height - ooffset * 2;  // actual output height
+  const uint8* const iyptr = src;
+  uint8* oyptr = dst + ooffset * dst_width;
+  const uint8* const iuptr = src + src_width * src_height;
+  uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
+  const uint8* const ivptr = src + src_width * src_height +
+                             halfsrc_width * halfsrc_height;
+  uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight +
+                 (ooffset >> 1) * halfdst_width;
+  return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
+               src_width, src_height, oyptr, ouptr, ovptr, dst_width,
+               halfdst_width, halfdst_width, dst_width, aheight, interpolate);
+}
+
+}  // namespace libyuv
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
new file mode 100644
index 0000000..8b8ee62
--- /dev/null
+++ b/files/source/video_common.cc
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "video_common.h"
+
+#include <sstream>
+
+namespace libyuv {
+
+#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
+
+struct FourCCAliasEntry {
+  uint32 alias;
+  uint32 canonical;
+};
+
+static const FourCCAliasEntry kFourCCAliases[] = {
+  {FOURCC_IYUV, FOURCC_I420},
+  {FOURCC_YU12, FOURCC_I420},
+  {FOURCC_YUYV, FOURCC_YUY2},
+  {FOURCC_YUVS, FOURCC_YUY2},
+  {FOURCC_HDYC, FOURCC_UYVY},
+  {FOURCC_2VUY, FOURCC_UYVY},
+  {FOURCC_BA81, FOURCC_BGGR},
+  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+  {FOURCC_RGB3, FOURCC_RAW},
+  {FOURCC_BGR3, FOURCC_24BG},
+};
+
+uint32 CanonicalFourCC(uint32 fourcc) {
+  for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+    if (kFourCCAliases[i].alias == fourcc) {
+      return kFourCCAliases[i].canonical;
+    }
+  }
+  // Not an alias, so return it as-is.
+  return fourcc;
+}
+
+}  // namespace libyuv
diff --git a/files/source/video_common.h b/files/source/video_common.h
new file mode 100644
index 0000000..9fe08a0
--- /dev/null
+++ b/files/source/video_common.h
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Common definitions for video, including fourcc codes.
+ */
+
+
+#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_
+#define LIBYUV_SOURCE_VIDEO_COMMON_H_
+
+#include <string>
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of fourcc.
+//////////////////////////////////////////////////////////////////////////////
+// Convert four characters to a fourcc code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#define FOURCC(a, b, c, d) (\
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
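+// For example, FOURCC('I', '4', '2', '0') evaluates to 0x30323449; stored
+// little-endian, its bytes read "I420" in memory.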
+
+// Some good pages discussing FourCC codes:
+//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
+//   http://www.fourcc.org/yuv.php
+enum FourCC {
+  // Canonical fourcc codes used in our code.
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  // Next four are Bayer RGB formats. The four characters define the order of
+  // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+  // Aliases for canonical fourcc codes, replaced with their canonical
+  // equivalents by CanonicalFourCC().
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Alias for I420
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG
+
+  // Match any fourcc.
+  FOURCC_ANY  = 0xFFFFFFFF,
+};
+
+// Converts fourcc aliases into canonical ones.
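+// For example, CanonicalFourCC(FOURCC_YU12) returns FOURCC_I420.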
+uint32 CanonicalFourCC(uint32 fourcc);
+
+}  // namespace libyuv
+
+#endif  // LIBYUV_SOURCE_VIDEO_COMMON_H_