/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;
/* Perform the actual YuvToRGB conversion in a macro, from register to
* register. This macro will be called from within several different wrapper
* variants for different data layouts. Y data starts with the even and odd
* bytes split into the low parts of v8 and v9 respectively. U and V are in
* v16 and v17. Working constants are pre-loaded into v13-v15, and v3 is
* pre-loaded with a constant 0xff alpha channel.
*
* The complicated arithmetic is the result of refactoring the original
* equations to avoid 16-bit overflow without losing any precision.
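 *
 * Working backwards from the constants used below, the kernel computes,
 * rounded and saturated to [0, 255] (a derivation, not taken from the
 * original equations):
 *
 *   R = (149 * (Y - 16) + 204.5 * (V - 128)) / 128
 *   G = (149 * (Y - 16) - 50 * (U - 128) - 104 * (V - 128)) / 128
 *   B = (149 * (Y - 16) + 258 * (U - 128)) / 128
 *
 * i.e. the BT.601 video-range coefficients (1.164, 1.596, 0.391, 0.813,
 * 2.018) scaled by 128. The multipliers 204.5 and 258 don't fit into a
 * single 8-bit operand, so the (v >> 1) and (u << 2) terms below top up
 * the 204 and 254 products, and the red/blue sums are halved (uhadd) to
 * stay within 16 bits. The green sum already fits in 16 bits, which is
 * why it alone skips the pre-halving and narrows with a shift of 7.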
*/
.macro yuvkern
movi v7.8b, #149
umull v1.8h, v8.8b, v7.8b // g0 = y0 * 149
umull v5.8h, v9.8b, v7.8b // g1 = y1 * 149
movi v7.8b, #50
movi v10.8b, #104
umull v8.8h, v16.8b, v7.8b // g2 = u * 50 + v * 104
umlal v8.8h, v17.8b, v10.8b
ushr v7.8b, v17.8b, #1
uaddw v0.8h, v1.8h, v7.8b // r0 = y0 * 149 + (v >> 1)
uaddw v4.8h, v5.8h, v7.8b // r1 = y1 * 149 + (v >> 1)
ushll v7.8h, v16.8b, #2
add v2.8h, v1.8h, v7.8h // b0 = y0 * 149 + (u << 2)
add v6.8h, v5.8h, v7.8h // b1 = y1 * 149 + (u << 2)
movi v7.16b, #204
movi v10.8b, #254
umull v11.8h, v17.8b, v7.8b // r2 = v * 204
umull v12.8h, v16.8b, v10.8b // b2 = u * 254
uhadd v0.8h, v0.8h, v11.8h // r0 = (r0 + r2) >> 1
uhadd v4.8h, v4.8h, v11.8h // r1 = (r1 + r2) >> 1
uqadd v1.8h, v1.8h, v14.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
uqadd v5.8h, v5.8h, v14.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
uhadd v2.8h, v2.8h, v12.8h // b0 = (b0 + b2) >> 1
uhadd v6.8h, v6.8h, v12.8h // b1 = (b1 + b2) >> 1
uqsub v0.8h, v0.8h, v13.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
uqsub v4.8h, v4.8h, v13.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
uqsub v5.8h, v5.8h, v8.8h // g1 = satu16(g1 - g2)
uqsub v2.8h, v2.8h, v15.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
uqsub v6.8h, v6.8h, v15.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
uqrshrn v0.8b, v0.8h, #6 // r0 = satu8((r0 + 32) >> 6)
uqrshrn v4.8b, v4.8h, #6 // r1 = satu8((r1 + 32) >> 6)
uqrshrn v1.8b, v1.8h, #7 // g0 = satu8((g0 + 64) >> 7)
uqrshrn v5.8b, v5.8h, #7 // g1 = satu8((g1 + 64) >> 7)
uqrshrn v2.8b, v2.8h, #6 // b0 = satu8((b0 + 32) >> 6)
uqrshrn v6.8b, v6.8h, #6 // b1 = satu8((b1 + 32) >> 6)
zip1 v0.16b, v0.16b, v4.16b // interleave even and odd red bytes
zip1 v1.16b, v1.16b, v5.16b // interleave even and odd green bytes
zip1 v2.16b, v2.16b, v6.16b // interleave even and odd blue bytes
.endm
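/* For reference, a scalar C sketch of the arithmetic performed by yuvkern.
 * This is an illustration, not part of the original file: the helper names
 * are made up, and the rounding of the red/blue paths can differ from the
 * vector code by the low bit truncated in uhadd's halving average.
 *
 *   #include <stdint.h>
 *
 *   static uint8_t satrnd(int32_t x, int shift) {
 *       x = (x + (1 << (shift - 1))) >> shift;          // round to nearest
 *       return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;  // saturate
 *   }
 *
 *   static void yuv2rgba(uint8_t y, uint8_t u, uint8_t v, uint8_t rgba[4]) {
 *       int32_t y149 = 149 * y;
 *       rgba[0] = satrnd(y149 + 204 * v + (v >> 1)
 *                        - (16 * 149 + (128 >> 1) + 128 * 204), 7);
 *       rgba[1] = satrnd(y149 - 50 * u - 104 * v
 *                        + (-16 * 149 + 128 * 50 + 128 * 104), 7);
 *       rgba[2] = satrnd(y149 + 254 * u + (u << 2)
 *                        - (16 * 149 + (128 << 2) + 128 * 254), 7);
 *       rgba[3] = 0xff;
 *   }
 */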
/* Define the wrapper code which will load and store the data, iterate the
* correct number of times, and safely handle the remainder at the end of the
* loop. Some sections of code are switched out depending on the data packing
* being handled.
*/
.macro wrap_line kernel, interleaved=0, swapuv=0
mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1) // pre-halved red bias
dup v13.8h, w5
mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0) // green bias
dup v14.8h, w5
mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1) // pre-halved blue bias
dup v15.8h, w5
movi v3.16b, #0xff // constant 0xff alpha channel
subs x2, x2, #16 // at least one full 16-pixel block?
bhs 1f
b 2f
.align 4
1: ld2 {v8.8b,v9.8b}, [x1], #16 // 16 Y bytes: even pixels to v8, odd to v9
// prfm PLDL1STRM, [x1, #256]
.if \interleaved
.if \swapuv
ld2 {v17.8b,v18.8b}, [x3], #16 // first byte of each pair (V) to v17
mov v16.8b, v18.8b // second byte of each pair (U) to v16
.else
ld2 {v16.8b,v17.8b}, [x3], #16 // U to v16, V to v17
.endif
// prfm PLDL1STRM, [x3, #256]
.else
ld1 {v16.8b}, [x3], #8 // 8 bytes of the U plane
ld1 {v17.8b}, [x4], #8 // 8 bytes of the V plane
// prfm PLDL1STRM, [x3, #128]
// prfm PLDL1STRM, [x4, #128]
.endif
\kernel
subs x2, x2, #16 // at least 16 more pixels?
st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64 // store 16 RGBA pixels
bhs 1b
2: adds x2, x2, #16 // restore the remainder count
beq 2f // nothing left over?
/* To handle the tail portion of the data (something less than 16
* bytes) load small power-of-two chunks into working registers. It
* doesn't matter where they end up in the register; the same process
* will store them back out using the same positions and the
* interaction between neighbouring pixels is constrained to odd
* boundaries where the load operations don't interfere.
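* For example, a remainder of 13 (0b1101) takes the 8-byte, 4-byte and
* 1-byte loads below, selected by the tbz tests of bits 3, 2 and 0 of the
* count.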
*/
movi v8.8b, #0 // zero the working registers so that lanes the
movi v9.8b, #0 // tail doesn't load hold a well-defined value
movi v16.8b, #0
movi v17.8b, #0
tbz x2, #3, 1f // no 8-pixel chunk in the remainder?
ld1 {v9.8b}, [x1], #8 // 8 Y bytes
.if \interleaved
ld1 {v17.8b}, [x3], #8 // 4 interleaved chroma pairs
.else
ld1 {v16.s}[1], [x3], #4 // 4 U bytes
ld1 {v17.s}[1], [x4], #4 // 4 V bytes
.endif
1: tbz x2, #2, 1f
ld1 {v8.s}[1], [x1], #4
.if \interleaved
ld1 {v16.s}[1], [x3], #4
.else
ld1 {v16.h}[1], [x3], #2
ld1 {v17.h}[1], [x4], #2
.endif
1: tbz x2, #1, 1f
ld1 {v8.h}[1], [x1], #2
.if \interleaved
ld1 {v16.h}[1], [x3], #2
.else
ld1 {v16.b}[1], [x3], #1
ld1 {v17.b}[1], [x4], #1
.endif
1: tbz x2, #0, 1f
ld1 {v8.b}[1], [x1], #1
.if \interleaved
ld1 {v16.h}[0], [x3], #2
.else
ld1 {v16.b}[0], [x3], #1
ld1 {v17.b}[0], [x4], #1
.endif
/* One small impediment in the process above is that some of the load
* operations can't perform byte-wise structure deinterleaving at the
* same time as loading only part of a register. So the data is loaded
* linearly and unpacked manually at this point if necessary.
*/
1: mov v8.d[1], v9.d[0] // concatenate the linear Y tail in v8.16b
uzp2 v9.16b, v8.16b, v8.16b // odd-indexed Y bytes to the low half of v9
uzp1 v8.16b, v8.16b, v8.16b // even-indexed Y bytes to the low half of v8
.if \interleaved
mov v16.d[1], v17.d[0] // concatenate the linear chroma tail in v16.16b
.if \swapuv
uzp1 v17.16b, v16.16b, v16.16b // first byte of each pair (V) to v17
uzp2 v16.16b, v16.16b, v16.16b // second byte of each pair (U) to v16
.else
uzp2 v17.16b, v16.16b, v16.16b // second byte of each pair (V) to v17
uzp1 v16.16b, v16.16b, v16.16b // first byte of each pair (U) to v16
.endif
.endif
\kernel
/* As above but with the output; structured stores for partial vectors
* aren't available, so the data is re-packed first and stored linearly.
*/
zip1 v4.16b, v0.16b, v2.16b // red/blue pairs, pixels 0-7
zip2 v6.16b, v0.16b, v2.16b // red/blue pairs, pixels 8-15
zip1 v5.16b, v1.16b, v3.16b // green/alpha pairs, pixels 0-7
zip2 v7.16b, v1.16b, v3.16b // green/alpha pairs, pixels 8-15
zip1 v0.16b, v4.16b, v5.16b // linear RGBA, pixels 0-3
zip2 v1.16b, v4.16b, v5.16b // linear RGBA, pixels 4-7
zip1 v2.16b, v6.16b, v7.16b // linear RGBA, pixels 8-11
zip2 v3.16b, v6.16b, v7.16b // linear RGBA, pixels 12-15
1: tbz x2, #3, 1f
st1 {v2.16b,v3.16b}, [x0], #32 // 8 pixels
1: tbz x2, #2, 1f
st1 {v1.16b}, [x0], #16 // 4 pixels
1: tbz x2, #1, 1f
st1 {v0.d}[1], [x0], #8 // 2 pixels
1: tbz x2, #0, 2f
st1 {v0.s}[1], [x0], #4 // final odd pixel
2:
.endm
/* void rsdIntrinsicYuv2_K(
* void *out, // x0
* void const *yin, // x1
* void const *uin, // x2
* void const *vin, // x3
* size_t xstart, // x4
* size_t xend); // x5
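*
* The chroma is planar: uin and vin are separate half-width planes
* (I420-style). xstart is presumably even; the code below halves it for
* the chroma pointers without rounding.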
*/
ENTRY(rsdIntrinsicYuv2_K)
lsr x6, x4, #1 // x6 = xstart / 2 (chroma is half width)
add x0, x0, x4, LSL #2 // out += xstart * 4 (RGBA)
add x1, x1, x4 // yin += xstart
add x4, x3, x6 // x4 = vin + xstart / 2
add x3, x2, x6 // x3 = uin + xstart / 2
sub x2, x5, x6, LSL #1 // x2 = pixel count, xend - xstart
sub x6, sp, #32
sub sp, sp, #64 // save the callee-saved low halves of v8-v15
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x6]
wrap_line yuvkern, 0
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ret
END(rsdIntrinsicYuv2_K)
/* void rsdIntrinsicYuv_K(
* void *out, // x0
* void const *yin, // x1
* void const *uvin, // x2
* size_t xstart, // x3
* size_t xend); // x4
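*
* uvin points to interleaved chroma with V in the first byte of each pair
* (NV21-style), hence the swapuv=1 wrapper instantiation below.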
*/
ENTRY(rsdIntrinsicYuv_K)
bic x5, x3, #1 // round xstart down to a pixel-pair boundary
add x0, x0, x5, LSL #2 // out += xstart * 4 (RGBA)
add x1, x1, x5 // yin += xstart
add x3, x2, x5 // uvin += xstart (i.e. xstart / 2 two-byte pairs)
sub x2, x4, x5 // x2 = pixel count, xend - xstart
sub x5, sp, #32
sub sp, sp, #64 // save the callee-saved low halves of v8-v15
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x5]
wrap_line yuvkern, 1, 1
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ret
END(rsdIntrinsicYuv_K)
/* void rsdIntrinsicYuvR_K(
* void *out, // x0
* void const *yin, // x1
* void const *uvin, // x2
* size_t xstart, // x3
* size_t xend); // x4
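*
* As rsdIntrinsicYuv_K, but uvin carries U in the first byte of each pair
* (NV12-style), so the wrapper below is instantiated without swapuv.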
*/
ENTRY(rsdIntrinsicYuvR_K)
bic x5, x3, #1
add x0, x0, x5, LSL #2
add x1, x1, x5
add x3, x2, x5
sub x2, x4, x5
sub x5, sp, #32
sub sp, sp, #64
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x5]
wrap_line yuvkern, 1
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ret
END(rsdIntrinsicYuvR_K)