src/qs8-vadd/scalar.c.in - platform/external/XNNPACK - Gitiles

 // Copyright 2021 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.

 $assert BATCH_TILE >= 1
 #include <assert.h>

 #include <xnnpack/math.h>
 #include <xnnpack/vadd.h>


 // Scalar microkernel template: element-wise sum of two int8 arrays, rescaled
 // with the precomputed constants in params and clamped to an output range.
 // The template parameter BATCH_TILE is the number of elements handled per
 // main-loop iteration.
 //
 // Per element:
 //   acc = zero_point_product + a * a_multiplier + b * b_multiplier
 //   out = (acc shifted right by `shift`, with a remainder-based +1 correction)
 //         + output_zero_point, clamped to [output_min, output_max]
 //
 // Parameters:
 //   n        size of each array in BYTES (the loops step it by sizeof(int8_t));
 //            must be non-zero.
 //   input_a  first operand; n bytes are read.
 //   input_b  second operand; n bytes are read.
 //   output   destination; n bytes are written.
 //   params   requantization constants; only the .scalar member is read.
 void xnn_qs8_vadd_minmax_ukernel__scalar_x${BATCH_TILE}(
     size_t n,
     const int8_t* input_a,
     const int8_t* input_b,
     int8_t* output,
     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
 {
   // The do/while loops below process one element before testing n, so the
   // single-element variant would wrap n around (size_t) on an empty batch and
   // run past the buffers. Reject n == 0 up front for every variant.
   assert(n != 0);

   // Hoist all requantization constants out of the loops.
   const int32_t vzero_point_product = params->scalar.zero_point_product;
   const int32_t va_multiplier = params->scalar.a_multiplier;
   const int32_t vb_multiplier = params->scalar.b_multiplier;
   const uint32_t vshift = params->scalar.shift;
   const int32_t vremainder_mask = params->scalar.remainder_mask;
   const int32_t vremainder_threshold = params->scalar.remainder_threshold;
   const int32_t voutput_zero_point = params->scalar.output_zero_point;
   const int32_t voutput_min = params->scalar.output_min;
   const int32_t voutput_max = params->scalar.output_max;

   $if BATCH_TILE == 1:
     do {
       const int32_t va = *input_a++;
       const int32_t vb = *input_b++;
       int32_t vacc = vzero_point_product + va * va_multiplier + vb * vb_multiplier;

       // vrem holds the bits selected by remainder_mask, decremented by one for
       // negative accumulators; the shifted value is bumped by one when vrem
       // exceeds remainder_threshold.
       // NOTE(review): presumably round-to-nearest with ties away from zero,
       // assuming mask == (1 << shift) - 1 and threshold == mask >> 1 - confirm
       // against the code that initializes params.
       const int32_t vrem = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
       int32_t vout = asr_s32(vacc, vshift) + (int32_t) (vrem > vremainder_threshold);
       vout += voutput_zero_point;
       vout = math_max_s32(vout, voutput_min);
       vout = math_min_s32(vout, voutput_max);
       // Narrowing is explicit; the value was clamped just above.
       *output++ = (int8_t) vout;

       n -= sizeof(int8_t);
     } while (n != 0);
   $else:
     // Main loop: BATCH_TILE elements per iteration, with each stage of the
     // computation unrolled across all lanes before moving to the next stage.
     for (; n >= ${BATCH_TILE} * sizeof(int8_t); n -= ${BATCH_TILE} * sizeof(int8_t)) {
       $for N in range(BATCH_TILE):
         const int32_t va${N} = input_a[${N}];
       input_a += ${BATCH_TILE};

       $for N in range(BATCH_TILE):
         const int32_t vb${N} = input_b[${N}];
         int32_t vacc${N} = vzero_point_product + va${N} * va_multiplier;
       input_b += ${BATCH_TILE};

       $for N in range(BATCH_TILE):
         vacc${N} += vb${N} * vb_multiplier;

       // Remainder-based correction for the right shift: masked low bits,
       // decremented by one for negative accumulators, compared to the threshold.
       $for N in range(BATCH_TILE):
         const int32_t vrem${N} = (vacc${N} & vremainder_mask) - (int32_t) (vacc${N} < 0);

       $for N in range(BATCH_TILE):
         int32_t vout${N} = asr_s32(vacc${N}, vshift) + (int32_t) (vrem${N} > vremainder_threshold);

       $for N in range(BATCH_TILE):
         vout${N} += voutput_zero_point;

       $for N in range(BATCH_TILE):
         vout${N} = math_max_s32(vout${N}, voutput_min);

       $for N in range(BATCH_TILE):
         vout${N} = math_min_s32(vout${N}, voutput_max);

       // Narrowing is explicit; the values were clamped just above.
       $for N in range(BATCH_TILE):
         output[${N}] = (int8_t) vout${N};
       output += ${BATCH_TILE};
     }
     // Tail: fewer than BATCH_TILE elements remain.
     if XNN_UNLIKELY(n != 0) {
       $if BATCH_TILE == 2:
         // With a tile of two the tail is exactly one element, so no loop and
         // no pointer/counter updates are needed.
         const int32_t va = *input_a;
         const int32_t vb = *input_b;
         int32_t vacc = vzero_point_product + va * va_multiplier + vb * vb_multiplier;

         const int32_t vrem = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
         int32_t vout = asr_s32(vacc, vshift) + (int32_t) (vrem > vremainder_threshold);
         vout += voutput_zero_point;
         vout = math_max_s32(vout, voutput_min);
         vout = math_min_s32(vout, voutput_max);
         *output = (int8_t) vout;
       $else:
         do {
           const int32_t va = *input_a++;
           const int32_t vb = *input_b++;
           int32_t vacc = vzero_point_product + va * va_multiplier + vb * vb_multiplier;

           const int32_t vrem = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
           int32_t vout = asr_s32(vacc, vshift) + (int32_t) (vrem > vremainder_threshold);
           vout += voutput_zero_point;
           vout = math_max_s32(vout, voutput_min);
           vout = math_min_s32(vout, voutput_max);
           *output++ = (int8_t) vout;

           n -= sizeof(int8_t);
         } while (n != 0);
     }
 }
	// Copyright 2021 Google LLC
	//
	// This source code is licensed under the BSD-style license found in the
	// LICENSE file in the root directory of this source tree.

	$assert BATCH_TILE >= 1
	#include <assert.h>

	#include <xnnpack/math.h>
	#include <xnnpack/vadd.h>


	// Scalar microkernel template: element-wise sum of two int8 arrays, rescaled
	// with the precomputed constants in params and clamped to an output range.
	// The template parameter BATCH_TILE is the number of elements handled per
	// main-loop iteration.
	//
	// Per element:
	//   acc = zero_point_product + a * a_multiplier + b * b_multiplier
	//   out = (acc shifted right by `shift`, with a remainder-based +1 correction)
	//         + output_zero_point, clamped to [output_min, output_max]
	//
	// Parameters:
	//   n        size of each array in BYTES (the loops step it by sizeof(int8_t));
	//            must be non-zero.
	//   input_a  first operand; n bytes are read.
	//   input_b  second operand; n bytes are read.
	//   output   destination; n bytes are written.
	//   params   requantization constants; only the .scalar member is read.
	void xnn_qs8_vadd_minmax_ukernel__scalar_x${BATCH_TILE}(
	    size_t n,
	    const int8_t* input_a,
	    const int8_t* input_b,
	    int8_t* output,
	    const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
	{
	  // The do/while loops below process one element before testing n, so the
	  // single-element variant would wrap n around (size_t) on an empty batch and
	  // run past the buffers. Reject n == 0 up front for every variant.
	  assert(n != 0);

	  // Hoist all requantization constants out of the loops.
	  const int32_t vzero_point_product = params->scalar.zero_point_product;
	  const int32_t va_multiplier = params->scalar.a_multiplier;
	  const int32_t vb_multiplier = params->scalar.b_multiplier;
	  const uint32_t vshift = params->scalar.shift;
	  const int32_t vremainder_mask = params->scalar.remainder_mask;
	  const int32_t vremainder_threshold = params->scalar.remainder_threshold;
	  const int32_t voutput_zero_point = params->scalar.output_zero_point;
	  const int32_t voutput_min = params->scalar.output_min;
	  const int32_t voutput_max = params->scalar.output_max;

	  $if BATCH_TILE == 1:
	    do {
	      const int32_t va = *input_a++;
	      const int32_t vb = *input_b++;
	      int32_t vacc = vzero_point_product + va * va_multiplier + vb * vb_multiplier;

	      // vrem holds the bits selected by remainder_mask, decremented by one for
	      // negative accumulators; the shifted value is bumped by one when vrem
	      // exceeds remainder_threshold.
	      // NOTE(review): presumably round-to-nearest with ties away from zero,
	      // assuming mask == (1 << shift) - 1 and threshold == mask >> 1 - confirm
	      // against the code that initializes params.
	      const int32_t vrem = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
	      int32_t vout = asr_s32(vacc, vshift) + (int32_t) (vrem > vremainder_threshold);
	      vout += voutput_zero_point;
	      vout = math_max_s32(vout, voutput_min);
	      vout = math_min_s32(vout, voutput_max);
	      // Narrowing is explicit; the value was clamped just above.
	      *output++ = (int8_t) vout;

	      n -= sizeof(int8_t);
	    } while (n != 0);
	  $else:
	    // Main loop: BATCH_TILE elements per iteration, with each stage of the
	    // computation unrolled across all lanes before moving to the next stage.
	    for (; n >= ${BATCH_TILE} * sizeof(int8_t); n -= ${BATCH_TILE} * sizeof(int8_t)) {
	      $for N in range(BATCH_TILE):
	        const int32_t va${N} = input_a[${N}];
	      input_a += ${BATCH_TILE};

	      $for N in range(BATCH_TILE):
	        const int32_t vb${N} = input_b[${N}];
	        int32_t vacc${N} = vzero_point_product + va${N} * va_multiplier;
	      input_b += ${BATCH_TILE};

	      $for N in range(BATCH_TILE):
	        vacc${N} += vb${N} * vb_multiplier;

	      // Remainder-based correction for the right shift: masked low bits,
	      // decremented by one for negative accumulators, compared to the threshold.
	      $for N in range(BATCH_TILE):
	        const int32_t vrem${N} = (vacc${N} & vremainder_mask) - (int32_t) (vacc${N} < 0);

	      $for N in range(BATCH_TILE):
	        int32_t vout${N} = asr_s32(vacc${N}, vshift) + (int32_t) (vrem${N} > vremainder_threshold);

	      $for N in range(BATCH_TILE):
	        vout${N} += voutput_zero_point;

	      $for N in range(BATCH_TILE):
	        vout${N} = math_max_s32(vout${N}, voutput_min);

	      $for N in range(BATCH_TILE):
	        vout${N} = math_min_s32(vout${N}, voutput_max);

	      // Narrowing is explicit; the values were clamped just above.
	      $for N in range(BATCH_TILE):
	        output[${N}] = (int8_t) vout${N};
	      output += ${BATCH_TILE};
	    }
	    // Tail: fewer than BATCH_TILE elements remain.
	    if XNN_UNLIKELY(n != 0) {
	      $if BATCH_TILE == 2:
	        // With a tile of two the tail is exactly one element, so no loop and
	        // no pointer/counter updates are needed.
	        const int32_t va = *input_a;
	        const int32_t vb = *input_b;
	        int32_t vacc = vzero_point_product + va * va_multiplier + vb * vb_multiplier;

	        const int32_t vrem = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
	        int32_t vout = asr_s32(vacc, vshift) + (int32_t) (vrem > vremainder_threshold);
	        vout += voutput_zero_point;
	        vout = math_max_s32(vout, voutput_min);
	        vout = math_min_s32(vout, voutput_max);
	        *output = (int8_t) vout;
	      $else:
	        do {
	          const int32_t va = *input_a++;
	          const int32_t vb = *input_b++;
	          int32_t vacc = vzero_point_product + va * va_multiplier + vb * vb_multiplier;

	          const int32_t vrem = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
	          int32_t vout = asr_s32(vacc, vshift) + (int32_t) (vrem > vremainder_threshold);
	          vout += voutput_zero_point;
	          vout = math_max_s32(vout, voutput_min);
	          vout = math_min_s32(vout, voutput_max);
	          *output++ = (int8_t) vout;

	          n -= sizeof(int8_t);
	        } while (n != 0);
	    }
	}