Blame - src/qs8-requantization/fp32-sse4.c - platform/external/XNNPACK

blob: 1df64637863e439fae98b4379b42f47fe8aee522 [file] [log] [blame]

Marat Dukhan	2e23d2b	2020-07-29 16:01:37 -0700	[diff] [blame]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <assert.h>
				10	#include <stdint.h>
				11	#include <stddef.h>
				12
				13	#include <nmmintrin.h>
				14
				15	#include <xnnpack/requantization-stubs.h>
				16
				17
				18	void xnn_qs8_requantize_fp32__sse4(
				19	size_t n,
				20	const int32_t* input,
				21	float scale,
				22	int8_t zero_point,
				23	int8_t qmin,
				24	int8_t qmax,
				25	int8_t* output)
				26	{
				27	assert(n % 16 == 0);
				28	assert(scale < 1.0f);
				29	assert(scale >= 0x1.0p-32f);
				30
				31	const __m128 vscale = _mm_set1_ps(scale);
				32	const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
				33	const __m128i vqmin = _mm_set1_epi8((char) qmin);
				34	const __m128i vqmax = _mm_set1_epi8((char) qmax);
				35	for (; n != 0; n -= 16) {
				36	const __m128i x = _mm_loadu_si128((const __m128i*) input);
				37	const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
				38	const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
				39	const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
				40	input += 16;
				41
				42	// Convert int32_t input to FP32 and multiply by FP32 scale.
				43	// Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
				44	// - Large int32_t values can't be exactly represented as FP32. CVTDQ2PS instruction on x86 would round it
				45	// according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
				46	// - Product of two FP32 values is generally not exactly representation as an FP32 value, and will be rounded
				47	// to nearest FP32 value with ties to even with default MXCSR rounding mode.
				48	const __m128 x_scaled = _mm_mul_ps(_mm_cvtepi32_ps(x), vscale);
				49	const __m128 y_scaled = _mm_mul_ps(_mm_cvtepi32_ps(y), vscale);
				50	const __m128 z_scaled = _mm_mul_ps(_mm_cvtepi32_ps(z), vscale);
				51	const __m128 w_scaled = _mm_mul_ps(_mm_cvtepi32_ps(w), vscale);
				52
				53	// Convert scaled FP32 result to int32_t using CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ instruction rounds
				54	// result according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
				55	// However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
				56	// of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
				57	// have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
				58	// which overflows int32_t when it is converted back to integer.
				59	//
				60	// Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
				61	// input is INT32_MAX (231 - 1), which turns into 231 when converted to float. The largest scale value
				62	// is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
				63	// which fits into int32_t without overflow.
				64	const __m128i x_rounded = _mm_cvtps_epi32(x_scaled);
				65	const __m128i y_rounded = _mm_cvtps_epi32(y_scaled);
				66	const __m128i z_rounded = _mm_cvtps_epi32(z_scaled);
				67	const __m128i w_rounded = _mm_cvtps_epi32(w_scaled);
				68
				69	// Standard final sequence on x86 SSE2:
				70	// - Pack to int16_t and saturate
				71	// - Add zero point
				72	// - Pack to int8_t and saturate
				73	// - Clamp between qmin and qmax
				74	const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_rounded, y_rounded), vzero_point);
				75	const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_rounded, w_rounded), vzero_point);
				76	const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
				77	const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
				78
				79	// 4x CVTDQ2PS
				80	// 4x MULPS
				81	// 4x CVTPS2DQ
				82	// 2x PACKSSDW
				83	// 2x PADDSW
				84	// 1x PACKSSWB
				85	// 1x PMAXSB
				86	// 1x PMINSB
				87	// ---------------------
				88	// 19 instructions total
				89
				90	_mm_storeu_si128((__m128i*) output, xyzw_clamped);
				91	output += 16;
				92	}
				93	}