ARM64 NEON SIMD support for YCC-to-RGB565 conversion


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1386 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S
index b0ba480..0ef770a 100644
--- a/simd/jsimd_arm_neon_64.S
+++ b/simd/jsimd_arm_neon_64.S
@@ -4,7 +4,7 @@
  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
  * All rights reserved.
  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2013, Linaro Limited
+ * Copyright (C) 2013-2014, Linaro Limited
  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
  *
  * This software is provided 'as-is', without any express or implied
@@ -1576,7 +1576,20 @@
         .else
             .error unsupported macroblock size
         .endif
-    .else
+    .elseif \bpp==16
+        .if \size == 8
+            st1  {v25.8h}, [RGB],16
+        .elseif \size == 4
+            st1  {v25.4h}, [RGB],8
+        .elseif \size == 2
+            st1  {v25.h}[4], [RGB],2
+            st1  {v25.h}[5], [RGB],2
+        .elseif \size == 1
+            st1  {v25.h}[6], [RGB],2
+        .else
+            .error unsupported macroblock size
+        .endif
+     .else
         .error unsupported bpp
     .endif
 .endm
@@ -1610,24 +1623,33 @@
     uaddw        v20.8h, v20.8h, v0.8b
     uaddw        v24.8h, v24.8h, v0.8b
     uaddw        v28.8h, v28.8h, v0.8b
+.if \bpp != 16
     sqxtun       v1\g_offs\defsize, v20.8h
     sqxtun       v1\r_offs\defsize, v24.8h
     sqxtun       v1\b_offs\defsize, v28.8h
+.else
+    sqshlu       v21.8h, v20.8h, #8
+    sqshlu       v25.8h, v24.8h, #8
+    sqshlu       v29.8h, v28.8h, #8
+    sri          v25.8h, v21.8h, #5
+    sri          v25.8h, v29.8h, #11
+.endif
 
 .endm
 
 .macro do_yuv_to_rgb_stage2_store_load_stage1
-    ld1          {v4.8b}, [U], 8
     rshrn        v20.4h, v20.4s, #15
-    rshrn2       v20.8h, v22.4s, #15
     rshrn        v24.4h, v24.4s, #14
-    rshrn2       v24.8h, v26.4s, #14
     rshrn        v28.4h, v28.4s, #14
-    ld1          {v5.8b}, [V], 8
+    ld1          {v4.8b}, [U], 8         /* v4 = 8 bytes at [U], U += 8 */
+    rshrn2       v20.8h, v22.4s, #15     /* v22 >> 15 (rounded) -> upper half of v20 */
+    rshrn2       v24.8h, v26.4s, #14     /* v26 >> 14 (rounded) -> upper half of v24 */
     rshrn2       v28.8h, v30.4s, #14
+    ld1          {v5.8b}, [V], 8         /* v5 = 8 bytes at [V], V += 8 */
     uaddw        v20.8h, v20.8h, v0.8b
     uaddw        v24.8h, v24.8h, v0.8b
     uaddw        v28.8h, v28.8h, v0.8b
+.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
     sqxtun       v1\g_offs\defsize, v20.8h
     ld1          {v0.8b}, [Y], 8
     sqxtun       v1\r_offs\defsize, v24.8h
@@ -1637,13 +1659,32 @@
     sqxtun       v1\b_offs\defsize, v28.8h
     uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
     uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    do_store     \bpp, 8
     smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
     smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
     smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
     smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
     smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
     smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
+.else /**************************** rgb565 ***********************************/
+    sqshlu       v21.8h, v20.8h, #8      /* G (v20): saturate to unsigned, << 8 */
+    sqshlu       v25.8h, v24.8h, #8      /* R (v24): saturate to unsigned, << 8 */
+    sqshlu       v29.8h, v28.8h, #8      /* B (v28): saturate to unsigned, << 8 */
+    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
+    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
+    ld1          {v0.8b}, [Y], 8         /* v0 = 8 bytes at [Y], Y += 8 */
+    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
+    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
+    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
+    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
+    sri          v25.8h, v21.8h, #5      /* keep R in bits 15:11, insert G >> 5 below */
+    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
+    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
+    prfm         PLDL1KEEP, [U, #64]     /* prefetch at U + 64 */
+    prfm         PLDL1KEEP, [V, #64]     /* prefetch at V + 64 */
+    prfm         PLDL1KEEP, [Y, #64]     /* prefetch at Y + 64 */
+    sri          v25.8h, v29.8h, #11     /* insert B >> 11: v25 lanes = R:G:B as 5:6:5 */
+.endif
+    do_store     \bpp, 8                 /* placed after .endif, so it runs for every bpp */
     smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
     smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
 .endm
@@ -1812,6 +1853,6 @@
 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
-
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
 .purgem do_load
 .purgem do_store