| // Auto-generated file. Do not edit! |
| // Template: src/qu8-gemm/MRxNRc4-minmax-scalar.c.in |
| // Generator: tools/xngen |
| // |
| // Copyright 2020 Google LLC |
| // |
| // This source code is licensed under the BSD-style license found in the |
| // LICENSE file in the root directory of this source tree. |
| |
| #include <assert.h> |
| |
| #include <xnnpack/gemm.h> |
| |
| #include <xnnpack/scalar-utils.h> |
| |
| // This kernel is a scalar model for a kernel using ARMv8.2 dot-product |
| // instructions. |
| // |
| // XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the |
| // bounds of the `a` matrix region, which may be a race condition with |
| // another thread. We deem this acceptable because the values that are |
| // read out of bounds do not affect the result, and the compiler can't know |
| // about this undefined behavior. |
| void xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar( |
| size_t mr, |
| size_t nc, |
| size_t kc, |
| const uint8_t* restrict a, |
| size_t a_stride, |
| const void* restrict w, |
| uint8_t* restrict c, |
| size_t cm_stride, |
| size_t cn_stride, |
| const union xnn_qu8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN { |
| assert(mr != 0); |
| assert(mr <= 8); |
| assert(nc != 0); |
| assert(kc != 0); |
| |
| const uint8_t* a0 = a; |
| uint8_t* c0 = c; |
| const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride); |
| uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride); |
| if XNN_UNPREDICTABLE(mr < 2) { |
| a1 = a0; |
| c1 = c0; |
| } |
| const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride); |
| uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride); |
| if XNN_UNPREDICTABLE(mr <= 2) { |
| a2 = a1; |
| c2 = c1; |
| } |
| const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride); |
| uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride); |
| if XNN_UNPREDICTABLE(mr < 4) { |
| a3 = a2; |
| c3 = c2; |
| } |
| const uint8_t* a4 = (const uint8_t*) ((uintptr_t) a3 + a_stride); |
| uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride); |
| if XNN_UNPREDICTABLE(mr <= 4) { |
| a4 = a3; |
| c4 = c3; |
| } |
| const uint8_t* a5 = (const uint8_t*) ((uintptr_t) a4 + a_stride); |
| uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride); |
| if XNN_UNPREDICTABLE(mr < 6) { |
| a5 = a4; |
| c5 = c4; |
| } |
| const uint8_t* a6 = (const uint8_t*) ((uintptr_t) a5 + a_stride); |
| uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride); |
| if XNN_UNPREDICTABLE(mr <= 6) { |
| a6 = a5; |
| c6 = c5; |
| } |
| const uint8_t* a7 = (const uint8_t*) ((uintptr_t) a6 + a_stride); |
| uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride); |
| if XNN_UNPREDICTABLE(mr != 8) { |
| a7 = a6; |
| c7 = c6; |
| } |
| |
| const int32_t vb_zero_point = params->scalar.kernel_zero_point; |
| |
| // Loop over groups of 8 columns. |
| do { |
| // `vaccMN` is the accumulator at row `M`, column `N`. |
| // Initialize accumulators with bias. 8 bias values are loaded from the |
| // weight matrix, at the start of the group of 8 columns. |
| int32_t bias0 = ((const int32_t*)w)[0]; |
| int32_t vacc00 = bias0; |
| int32_t vacc10 = bias0; |
| int32_t vacc20 = bias0; |
| int32_t vacc30 = bias0; |
| int32_t vacc40 = bias0; |
| int32_t vacc50 = bias0; |
| int32_t vacc60 = bias0; |
| int32_t vacc70 = bias0; |
| int32_t bias1 = ((const int32_t*)w)[1]; |
| int32_t vacc01 = bias1; |
| int32_t vacc11 = bias1; |
| int32_t vacc21 = bias1; |
| int32_t vacc31 = bias1; |
| int32_t vacc41 = bias1; |
| int32_t vacc51 = bias1; |
| int32_t vacc61 = bias1; |
| int32_t vacc71 = bias1; |
| int32_t bias2 = ((const int32_t*)w)[2]; |
| int32_t vacc02 = bias2; |
| int32_t vacc12 = bias2; |
| int32_t vacc22 = bias2; |
| int32_t vacc32 = bias2; |
| int32_t vacc42 = bias2; |
| int32_t vacc52 = bias2; |
| int32_t vacc62 = bias2; |
| int32_t vacc72 = bias2; |
| int32_t bias3 = ((const int32_t*)w)[3]; |
| int32_t vacc03 = bias3; |
| int32_t vacc13 = bias3; |
| int32_t vacc23 = bias3; |
| int32_t vacc33 = bias3; |
| int32_t vacc43 = bias3; |
| int32_t vacc53 = bias3; |
| int32_t vacc63 = bias3; |
| int32_t vacc73 = bias3; |
| int32_t bias4 = ((const int32_t*)w)[4]; |
| int32_t vacc04 = bias4; |
| int32_t vacc14 = bias4; |
| int32_t vacc24 = bias4; |
| int32_t vacc34 = bias4; |
| int32_t vacc44 = bias4; |
| int32_t vacc54 = bias4; |
| int32_t vacc64 = bias4; |
| int32_t vacc74 = bias4; |
| int32_t bias5 = ((const int32_t*)w)[5]; |
| int32_t vacc05 = bias5; |
| int32_t vacc15 = bias5; |
| int32_t vacc25 = bias5; |
| int32_t vacc35 = bias5; |
| int32_t vacc45 = bias5; |
| int32_t vacc55 = bias5; |
| int32_t vacc65 = bias5; |
| int32_t vacc75 = bias5; |
| int32_t bias6 = ((const int32_t*)w)[6]; |
| int32_t vacc06 = bias6; |
| int32_t vacc16 = bias6; |
| int32_t vacc26 = bias6; |
| int32_t vacc36 = bias6; |
| int32_t vacc46 = bias6; |
| int32_t vacc56 = bias6; |
| int32_t vacc66 = bias6; |
| int32_t vacc76 = bias6; |
| int32_t bias7 = ((const int32_t*)w)[7]; |
| int32_t vacc07 = bias7; |
| int32_t vacc17 = bias7; |
| int32_t vacc27 = bias7; |
| int32_t vacc37 = bias7; |
| int32_t vacc47 = bias7; |
| int32_t vacc57 = bias7; |
| int32_t vacc67 = bias7; |
| int32_t vacc77 = bias7; |
| |
| w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); |
| |
| // Inner accumulation loop along the 8 columns. |
| // Handle 4 rows at each iteration: this is key to modelling what an |
| // actual kernel using ARMv8.2 dot-product instructions would look like. |
| size_t k = 0; |
| while (k < kc) { |
| // Load an 8x4 block of activations, and compute sums along rows. |
| int16_t vasum0 = 0; |
| int32_t va00 = *a0++; |
| vasum0 += (int16_t) va00; |
| int32_t va01 = *a0++; |
| vasum0 += (int16_t) va01; |
| int32_t va02 = *a0++; |
| vasum0 += (int16_t) va02; |
| int32_t va03 = *a0++; |
| vasum0 += (int16_t) va03; |
| int16_t vasum1 = 0; |
| int32_t va10 = *a1++; |
| vasum1 += (int16_t) va10; |
| int32_t va11 = *a1++; |
| vasum1 += (int16_t) va11; |
| int32_t va12 = *a1++; |
| vasum1 += (int16_t) va12; |
| int32_t va13 = *a1++; |
| vasum1 += (int16_t) va13; |
| int16_t vasum2 = 0; |
| int32_t va20 = *a2++; |
| vasum2 += (int16_t) va20; |
| int32_t va21 = *a2++; |
| vasum2 += (int16_t) va21; |
| int32_t va22 = *a2++; |
| vasum2 += (int16_t) va22; |
| int32_t va23 = *a2++; |
| vasum2 += (int16_t) va23; |
| int16_t vasum3 = 0; |
| int32_t va30 = *a3++; |
| vasum3 += (int16_t) va30; |
| int32_t va31 = *a3++; |
| vasum3 += (int16_t) va31; |
| int32_t va32 = *a3++; |
| vasum3 += (int16_t) va32; |
| int32_t va33 = *a3++; |
| vasum3 += (int16_t) va33; |
| int16_t vasum4 = 0; |
| int32_t va40 = *a4++; |
| vasum4 += (int16_t) va40; |
| int32_t va41 = *a4++; |
| vasum4 += (int16_t) va41; |
| int32_t va42 = *a4++; |
| vasum4 += (int16_t) va42; |
| int32_t va43 = *a4++; |
| vasum4 += (int16_t) va43; |
| int16_t vasum5 = 0; |
| int32_t va50 = *a5++; |
| vasum5 += (int16_t) va50; |
| int32_t va51 = *a5++; |
| vasum5 += (int16_t) va51; |
| int32_t va52 = *a5++; |
| vasum5 += (int16_t) va52; |
| int32_t va53 = *a5++; |
| vasum5 += (int16_t) va53; |
| int16_t vasum6 = 0; |
| int32_t va60 = *a6++; |
| vasum6 += (int16_t) va60; |
| int32_t va61 = *a6++; |
| vasum6 += (int16_t) va61; |
| int32_t va62 = *a6++; |
| vasum6 += (int16_t) va62; |
| int32_t va63 = *a6++; |
| vasum6 += (int16_t) va63; |
| int16_t vasum7 = 0; |
| int32_t va70 = *a7++; |
| vasum7 += (int16_t) va70; |
| int32_t va71 = *a7++; |
| vasum7 += (int16_t) va71; |
| int32_t va72 = *a7++; |
| vasum7 += (int16_t) va72; |
| int32_t va73 = *a7++; |
| vasum7 += (int16_t) va73; |
| |
| // Load a 4x8 block of weights. |
| int32_t vb00 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb10 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb20 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb30 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb01 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb11 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb21 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb31 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb02 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb12 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb22 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb32 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb03 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb13 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb23 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb33 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb04 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb14 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb24 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb34 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb05 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb15 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb25 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb35 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb06 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb16 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb26 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb36 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| int32_t vb07 = (int32_t) ((const uint8_t*)w)[0]; |
| int32_t vb17 = (int32_t) ((const uint8_t*)w)[1]; |
| int32_t vb27 = (int32_t) ((const uint8_t*)w)[2]; |
| int32_t vb37 = (int32_t) ((const uint8_t*)w)[3]; |
| |
| w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t)); |
| |
| // Multiply-accumulate: 8x4 * 4x8 --> 8x8. The inner size 4 here means |
| // we're computing 4D dot-products, which makes this a model for |
| // an ARMv8.2 dot-product kernel. |
| vacc00 += va00 * vb00; |
| vacc00 += va01 * vb10; |
| vacc00 += va02 * vb20; |
| vacc00 += va03 * vb30; |
| vacc00 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc01 += va00 * vb01; |
| vacc01 += va01 * vb11; |
| vacc01 += va02 * vb21; |
| vacc01 += va03 * vb31; |
| vacc01 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc02 += va00 * vb02; |
| vacc02 += va01 * vb12; |
| vacc02 += va02 * vb22; |
| vacc02 += va03 * vb32; |
| vacc02 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc03 += va00 * vb03; |
| vacc03 += va01 * vb13; |
| vacc03 += va02 * vb23; |
| vacc03 += va03 * vb33; |
| vacc03 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc04 += va00 * vb04; |
| vacc04 += va01 * vb14; |
| vacc04 += va02 * vb24; |
| vacc04 += va03 * vb34; |
| vacc04 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc05 += va00 * vb05; |
| vacc05 += va01 * vb15; |
| vacc05 += va02 * vb25; |
| vacc05 += va03 * vb35; |
| vacc05 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc06 += va00 * vb06; |
| vacc06 += va01 * vb16; |
| vacc06 += va02 * vb26; |
| vacc06 += va03 * vb36; |
| vacc06 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc07 += va00 * vb07; |
| vacc07 += va01 * vb17; |
| vacc07 += va02 * vb27; |
| vacc07 += va03 * vb37; |
| vacc07 -= ((int32_t) vasum0) * vb_zero_point; |
| vacc10 += va10 * vb00; |
| vacc10 += va11 * vb10; |
| vacc10 += va12 * vb20; |
| vacc10 += va13 * vb30; |
| vacc10 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc11 += va10 * vb01; |
| vacc11 += va11 * vb11; |
| vacc11 += va12 * vb21; |
| vacc11 += va13 * vb31; |
| vacc11 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc12 += va10 * vb02; |
| vacc12 += va11 * vb12; |
| vacc12 += va12 * vb22; |
| vacc12 += va13 * vb32; |
| vacc12 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc13 += va10 * vb03; |
| vacc13 += va11 * vb13; |
| vacc13 += va12 * vb23; |
| vacc13 += va13 * vb33; |
| vacc13 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc14 += va10 * vb04; |
| vacc14 += va11 * vb14; |
| vacc14 += va12 * vb24; |
| vacc14 += va13 * vb34; |
| vacc14 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc15 += va10 * vb05; |
| vacc15 += va11 * vb15; |
| vacc15 += va12 * vb25; |
| vacc15 += va13 * vb35; |
| vacc15 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc16 += va10 * vb06; |
| vacc16 += va11 * vb16; |
| vacc16 += va12 * vb26; |
| vacc16 += va13 * vb36; |
| vacc16 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc17 += va10 * vb07; |
| vacc17 += va11 * vb17; |
| vacc17 += va12 * vb27; |
| vacc17 += va13 * vb37; |
| vacc17 -= ((int32_t) vasum1) * vb_zero_point; |
| vacc20 += va20 * vb00; |
| vacc20 += va21 * vb10; |
| vacc20 += va22 * vb20; |
| vacc20 += va23 * vb30; |
| vacc20 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc21 += va20 * vb01; |
| vacc21 += va21 * vb11; |
| vacc21 += va22 * vb21; |
| vacc21 += va23 * vb31; |
| vacc21 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc22 += va20 * vb02; |
| vacc22 += va21 * vb12; |
| vacc22 += va22 * vb22; |
| vacc22 += va23 * vb32; |
| vacc22 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc23 += va20 * vb03; |
| vacc23 += va21 * vb13; |
| vacc23 += va22 * vb23; |
| vacc23 += va23 * vb33; |
| vacc23 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc24 += va20 * vb04; |
| vacc24 += va21 * vb14; |
| vacc24 += va22 * vb24; |
| vacc24 += va23 * vb34; |
| vacc24 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc25 += va20 * vb05; |
| vacc25 += va21 * vb15; |
| vacc25 += va22 * vb25; |
| vacc25 += va23 * vb35; |
| vacc25 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc26 += va20 * vb06; |
| vacc26 += va21 * vb16; |
| vacc26 += va22 * vb26; |
| vacc26 += va23 * vb36; |
| vacc26 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc27 += va20 * vb07; |
| vacc27 += va21 * vb17; |
| vacc27 += va22 * vb27; |
| vacc27 += va23 * vb37; |
| vacc27 -= ((int32_t) vasum2) * vb_zero_point; |
| vacc30 += va30 * vb00; |
| vacc30 += va31 * vb10; |
| vacc30 += va32 * vb20; |
| vacc30 += va33 * vb30; |
| vacc30 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc31 += va30 * vb01; |
| vacc31 += va31 * vb11; |
| vacc31 += va32 * vb21; |
| vacc31 += va33 * vb31; |
| vacc31 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc32 += va30 * vb02; |
| vacc32 += va31 * vb12; |
| vacc32 += va32 * vb22; |
| vacc32 += va33 * vb32; |
| vacc32 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc33 += va30 * vb03; |
| vacc33 += va31 * vb13; |
| vacc33 += va32 * vb23; |
| vacc33 += va33 * vb33; |
| vacc33 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc34 += va30 * vb04; |
| vacc34 += va31 * vb14; |
| vacc34 += va32 * vb24; |
| vacc34 += va33 * vb34; |
| vacc34 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc35 += va30 * vb05; |
| vacc35 += va31 * vb15; |
| vacc35 += va32 * vb25; |
| vacc35 += va33 * vb35; |
| vacc35 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc36 += va30 * vb06; |
| vacc36 += va31 * vb16; |
| vacc36 += va32 * vb26; |
| vacc36 += va33 * vb36; |
| vacc36 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc37 += va30 * vb07; |
| vacc37 += va31 * vb17; |
| vacc37 += va32 * vb27; |
| vacc37 += va33 * vb37; |
| vacc37 -= ((int32_t) vasum3) * vb_zero_point; |
| vacc40 += va40 * vb00; |
| vacc40 += va41 * vb10; |
| vacc40 += va42 * vb20; |
| vacc40 += va43 * vb30; |
| vacc40 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc41 += va40 * vb01; |
| vacc41 += va41 * vb11; |
| vacc41 += va42 * vb21; |
| vacc41 += va43 * vb31; |
| vacc41 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc42 += va40 * vb02; |
| vacc42 += va41 * vb12; |
| vacc42 += va42 * vb22; |
| vacc42 += va43 * vb32; |
| vacc42 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc43 += va40 * vb03; |
| vacc43 += va41 * vb13; |
| vacc43 += va42 * vb23; |
| vacc43 += va43 * vb33; |
| vacc43 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc44 += va40 * vb04; |
| vacc44 += va41 * vb14; |
| vacc44 += va42 * vb24; |
| vacc44 += va43 * vb34; |
| vacc44 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc45 += va40 * vb05; |
| vacc45 += va41 * vb15; |
| vacc45 += va42 * vb25; |
| vacc45 += va43 * vb35; |
| vacc45 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc46 += va40 * vb06; |
| vacc46 += va41 * vb16; |
| vacc46 += va42 * vb26; |
| vacc46 += va43 * vb36; |
| vacc46 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc47 += va40 * vb07; |
| vacc47 += va41 * vb17; |
| vacc47 += va42 * vb27; |
| vacc47 += va43 * vb37; |
| vacc47 -= ((int32_t) vasum4) * vb_zero_point; |
| vacc50 += va50 * vb00; |
| vacc50 += va51 * vb10; |
| vacc50 += va52 * vb20; |
| vacc50 += va53 * vb30; |
| vacc50 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc51 += va50 * vb01; |
| vacc51 += va51 * vb11; |
| vacc51 += va52 * vb21; |
| vacc51 += va53 * vb31; |
| vacc51 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc52 += va50 * vb02; |
| vacc52 += va51 * vb12; |
| vacc52 += va52 * vb22; |
| vacc52 += va53 * vb32; |
| vacc52 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc53 += va50 * vb03; |
| vacc53 += va51 * vb13; |
| vacc53 += va52 * vb23; |
| vacc53 += va53 * vb33; |
| vacc53 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc54 += va50 * vb04; |
| vacc54 += va51 * vb14; |
| vacc54 += va52 * vb24; |
| vacc54 += va53 * vb34; |
| vacc54 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc55 += va50 * vb05; |
| vacc55 += va51 * vb15; |
| vacc55 += va52 * vb25; |
| vacc55 += va53 * vb35; |
| vacc55 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc56 += va50 * vb06; |
| vacc56 += va51 * vb16; |
| vacc56 += va52 * vb26; |
| vacc56 += va53 * vb36; |
| vacc56 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc57 += va50 * vb07; |
| vacc57 += va51 * vb17; |
| vacc57 += va52 * vb27; |
| vacc57 += va53 * vb37; |
| vacc57 -= ((int32_t) vasum5) * vb_zero_point; |
| vacc60 += va60 * vb00; |
| vacc60 += va61 * vb10; |
| vacc60 += va62 * vb20; |
| vacc60 += va63 * vb30; |
| vacc60 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc61 += va60 * vb01; |
| vacc61 += va61 * vb11; |
| vacc61 += va62 * vb21; |
| vacc61 += va63 * vb31; |
| vacc61 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc62 += va60 * vb02; |
| vacc62 += va61 * vb12; |
| vacc62 += va62 * vb22; |
| vacc62 += va63 * vb32; |
| vacc62 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc63 += va60 * vb03; |
| vacc63 += va61 * vb13; |
| vacc63 += va62 * vb23; |
| vacc63 += va63 * vb33; |
| vacc63 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc64 += va60 * vb04; |
| vacc64 += va61 * vb14; |
| vacc64 += va62 * vb24; |
| vacc64 += va63 * vb34; |
| vacc64 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc65 += va60 * vb05; |
| vacc65 += va61 * vb15; |
| vacc65 += va62 * vb25; |
| vacc65 += va63 * vb35; |
| vacc65 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc66 += va60 * vb06; |
| vacc66 += va61 * vb16; |
| vacc66 += va62 * vb26; |
| vacc66 += va63 * vb36; |
| vacc66 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc67 += va60 * vb07; |
| vacc67 += va61 * vb17; |
| vacc67 += va62 * vb27; |
| vacc67 += va63 * vb37; |
| vacc67 -= ((int32_t) vasum6) * vb_zero_point; |
| vacc70 += va70 * vb00; |
| vacc70 += va71 * vb10; |
| vacc70 += va72 * vb20; |
| vacc70 += va73 * vb30; |
| vacc70 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc71 += va70 * vb01; |
| vacc71 += va71 * vb11; |
| vacc71 += va72 * vb21; |
| vacc71 += va73 * vb31; |
| vacc71 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc72 += va70 * vb02; |
| vacc72 += va71 * vb12; |
| vacc72 += va72 * vb22; |
| vacc72 += va73 * vb32; |
| vacc72 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc73 += va70 * vb03; |
| vacc73 += va71 * vb13; |
| vacc73 += va72 * vb23; |
| vacc73 += va73 * vb33; |
| vacc73 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc74 += va70 * vb04; |
| vacc74 += va71 * vb14; |
| vacc74 += va72 * vb24; |
| vacc74 += va73 * vb34; |
| vacc74 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc75 += va70 * vb05; |
| vacc75 += va71 * vb15; |
| vacc75 += va72 * vb25; |
| vacc75 += va73 * vb35; |
| vacc75 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc76 += va70 * vb06; |
| vacc76 += va71 * vb16; |
| vacc76 += va72 * vb26; |
| vacc76 += va73 * vb36; |
| vacc76 -= ((int32_t) vasum7) * vb_zero_point; |
| vacc77 += va70 * vb07; |
| vacc77 += va71 * vb17; |
| vacc77 += va72 * vb27; |
| vacc77 += va73 * vb37; |
| vacc77 -= ((int32_t) vasum7) * vb_zero_point; |
| |
| k += 4 * sizeof(uint8_t); |
| } |
| // End of accumulation loop. The variable `k` contains the amount by which |
| // we advanced the `a0`..`a7` pointers, so we rewind by this amount now. |
| a0 = (const uint8_t*)((uintptr_t) a0 - k); |
| a1 = (const uint8_t*)((uintptr_t) a1 - k); |
| a2 = (const uint8_t*)((uintptr_t) a2 - k); |
| a3 = (const uint8_t*)((uintptr_t) a3 - k); |
| a4 = (const uint8_t*)((uintptr_t) a4 - k); |
| a5 = (const uint8_t*)((uintptr_t) a5 - k); |
| a6 = (const uint8_t*)((uintptr_t) a6 - k); |
| a7 = (const uint8_t*)((uintptr_t) a7 - k); |
| |
| // Post-accumulation work |
| |
| const int32_t vmultiplier = params->scalar.multiplier; |
| const int64_t vq31rounding = INT64_C(0x40000000); |
| const int32_t vremainder_mask = params->scalar.remainder_mask; |
| const uint32_t vshift = params->scalar.shift; |
| const int32_t vremainder_threshold = params->scalar.remainder_threshold; |
| const int32_t voutput_min = params->scalar.output_min_less_zero_point; |
| const int32_t voutput_max = params->scalar.output_max_less_zero_point; |
| const int32_t voutput_zero_point = params->scalar.output_zero_point; |
| |
| const int64_t vproduct00 = (int64_t)vacc00 * (int64_t)vmultiplier; |
| const int64_t vproduct01 = (int64_t)vacc01 * (int64_t)vmultiplier; |
| const int64_t vproduct02 = (int64_t)vacc02 * (int64_t)vmultiplier; |
| const int64_t vproduct03 = (int64_t)vacc03 * (int64_t)vmultiplier; |
| const int64_t vproduct04 = (int64_t)vacc04 * (int64_t)vmultiplier; |
| const int64_t vproduct05 = (int64_t)vacc05 * (int64_t)vmultiplier; |
| const int64_t vproduct06 = (int64_t)vacc06 * (int64_t)vmultiplier; |
| const int64_t vproduct07 = (int64_t)vacc07 * (int64_t)vmultiplier; |
| const int64_t vproduct10 = (int64_t)vacc10 * (int64_t)vmultiplier; |
| const int64_t vproduct11 = (int64_t)vacc11 * (int64_t)vmultiplier; |
| const int64_t vproduct12 = (int64_t)vacc12 * (int64_t)vmultiplier; |
| const int64_t vproduct13 = (int64_t)vacc13 * (int64_t)vmultiplier; |
| const int64_t vproduct14 = (int64_t)vacc14 * (int64_t)vmultiplier; |
| const int64_t vproduct15 = (int64_t)vacc15 * (int64_t)vmultiplier; |
| const int64_t vproduct16 = (int64_t)vacc16 * (int64_t)vmultiplier; |
| const int64_t vproduct17 = (int64_t)vacc17 * (int64_t)vmultiplier; |
| const int64_t vproduct20 = (int64_t)vacc20 * (int64_t)vmultiplier; |
| const int64_t vproduct21 = (int64_t)vacc21 * (int64_t)vmultiplier; |
| const int64_t vproduct22 = (int64_t)vacc22 * (int64_t)vmultiplier; |
| const int64_t vproduct23 = (int64_t)vacc23 * (int64_t)vmultiplier; |
| const int64_t vproduct24 = (int64_t)vacc24 * (int64_t)vmultiplier; |
| const int64_t vproduct25 = (int64_t)vacc25 * (int64_t)vmultiplier; |
| const int64_t vproduct26 = (int64_t)vacc26 * (int64_t)vmultiplier; |
| const int64_t vproduct27 = (int64_t)vacc27 * (int64_t)vmultiplier; |
| const int64_t vproduct30 = (int64_t)vacc30 * (int64_t)vmultiplier; |
| const int64_t vproduct31 = (int64_t)vacc31 * (int64_t)vmultiplier; |
| const int64_t vproduct32 = (int64_t)vacc32 * (int64_t)vmultiplier; |
| const int64_t vproduct33 = (int64_t)vacc33 * (int64_t)vmultiplier; |
| const int64_t vproduct34 = (int64_t)vacc34 * (int64_t)vmultiplier; |
| const int64_t vproduct35 = (int64_t)vacc35 * (int64_t)vmultiplier; |
| const int64_t vproduct36 = (int64_t)vacc36 * (int64_t)vmultiplier; |
| const int64_t vproduct37 = (int64_t)vacc37 * (int64_t)vmultiplier; |
| const int64_t vproduct40 = (int64_t)vacc40 * (int64_t)vmultiplier; |
| const int64_t vproduct41 = (int64_t)vacc41 * (int64_t)vmultiplier; |
| const int64_t vproduct42 = (int64_t)vacc42 * (int64_t)vmultiplier; |
| const int64_t vproduct43 = (int64_t)vacc43 * (int64_t)vmultiplier; |
| const int64_t vproduct44 = (int64_t)vacc44 * (int64_t)vmultiplier; |
| const int64_t vproduct45 = (int64_t)vacc45 * (int64_t)vmultiplier; |
| const int64_t vproduct46 = (int64_t)vacc46 * (int64_t)vmultiplier; |
| const int64_t vproduct47 = (int64_t)vacc47 * (int64_t)vmultiplier; |
| const int64_t vproduct50 = (int64_t)vacc50 * (int64_t)vmultiplier; |
| const int64_t vproduct51 = (int64_t)vacc51 * (int64_t)vmultiplier; |
| const int64_t vproduct52 = (int64_t)vacc52 * (int64_t)vmultiplier; |
| const int64_t vproduct53 = (int64_t)vacc53 * (int64_t)vmultiplier; |
| const int64_t vproduct54 = (int64_t)vacc54 * (int64_t)vmultiplier; |
| const int64_t vproduct55 = (int64_t)vacc55 * (int64_t)vmultiplier; |
| const int64_t vproduct56 = (int64_t)vacc56 * (int64_t)vmultiplier; |
| const int64_t vproduct57 = (int64_t)vacc57 * (int64_t)vmultiplier; |
| const int64_t vproduct60 = (int64_t)vacc60 * (int64_t)vmultiplier; |
| const int64_t vproduct61 = (int64_t)vacc61 * (int64_t)vmultiplier; |
| const int64_t vproduct62 = (int64_t)vacc62 * (int64_t)vmultiplier; |
| const int64_t vproduct63 = (int64_t)vacc63 * (int64_t)vmultiplier; |
| const int64_t vproduct64 = (int64_t)vacc64 * (int64_t)vmultiplier; |
| const int64_t vproduct65 = (int64_t)vacc65 * (int64_t)vmultiplier; |
| const int64_t vproduct66 = (int64_t)vacc66 * (int64_t)vmultiplier; |
| const int64_t vproduct67 = (int64_t)vacc67 * (int64_t)vmultiplier; |
| const int64_t vproduct70 = (int64_t)vacc70 * (int64_t)vmultiplier; |
| const int64_t vproduct71 = (int64_t)vacc71 * (int64_t)vmultiplier; |
| const int64_t vproduct72 = (int64_t)vacc72 * (int64_t)vmultiplier; |
| const int64_t vproduct73 = (int64_t)vacc73 * (int64_t)vmultiplier; |
| const int64_t vproduct74 = (int64_t)vacc74 * (int64_t)vmultiplier; |
| const int64_t vproduct75 = (int64_t)vacc75 * (int64_t)vmultiplier; |
| const int64_t vproduct76 = (int64_t)vacc76 * (int64_t)vmultiplier; |
| const int64_t vproduct77 = (int64_t)vacc77 * (int64_t)vmultiplier; |
| |
| const int32_t vq31product00 = (int32_t)(uint32_t)((uint64_t)(vproduct00 + vq31rounding) >> 31); |
| const int32_t vq31product01 = (int32_t)(uint32_t)((uint64_t)(vproduct01 + vq31rounding) >> 31); |
| const int32_t vq31product02 = (int32_t)(uint32_t)((uint64_t)(vproduct02 + vq31rounding) >> 31); |
| const int32_t vq31product03 = (int32_t)(uint32_t)((uint64_t)(vproduct03 + vq31rounding) >> 31); |
| const int32_t vq31product04 = (int32_t)(uint32_t)((uint64_t)(vproduct04 + vq31rounding) >> 31); |
| const int32_t vq31product05 = (int32_t)(uint32_t)((uint64_t)(vproduct05 + vq31rounding) >> 31); |
| const int32_t vq31product06 = (int32_t)(uint32_t)((uint64_t)(vproduct06 + vq31rounding) >> 31); |
| const int32_t vq31product07 = (int32_t)(uint32_t)((uint64_t)(vproduct07 + vq31rounding) >> 31); |
| const int32_t vq31product10 = (int32_t)(uint32_t)((uint64_t)(vproduct10 + vq31rounding) >> 31); |
| const int32_t vq31product11 = (int32_t)(uint32_t)((uint64_t)(vproduct11 + vq31rounding) >> 31); |
| const int32_t vq31product12 = (int32_t)(uint32_t)((uint64_t)(vproduct12 + vq31rounding) >> 31); |
| const int32_t vq31product13 = (int32_t)(uint32_t)((uint64_t)(vproduct13 + vq31rounding) >> 31); |
| const int32_t vq31product14 = (int32_t)(uint32_t)((uint64_t)(vproduct14 + vq31rounding) >> 31); |
| const int32_t vq31product15 = (int32_t)(uint32_t)((uint64_t)(vproduct15 + vq31rounding) >> 31); |
| const int32_t vq31product16 = (int32_t)(uint32_t)((uint64_t)(vproduct16 + vq31rounding) >> 31); |
| const int32_t vq31product17 = (int32_t)(uint32_t)((uint64_t)(vproduct17 + vq31rounding) >> 31); |
| const int32_t vq31product20 = (int32_t)(uint32_t)((uint64_t)(vproduct20 + vq31rounding) >> 31); |
| const int32_t vq31product21 = (int32_t)(uint32_t)((uint64_t)(vproduct21 + vq31rounding) >> 31); |
| const int32_t vq31product22 = (int32_t)(uint32_t)((uint64_t)(vproduct22 + vq31rounding) >> 31); |
| const int32_t vq31product23 = (int32_t)(uint32_t)((uint64_t)(vproduct23 + vq31rounding) >> 31); |
| const int32_t vq31product24 = (int32_t)(uint32_t)((uint64_t)(vproduct24 + vq31rounding) >> 31); |
| const int32_t vq31product25 = (int32_t)(uint32_t)((uint64_t)(vproduct25 + vq31rounding) >> 31); |
| const int32_t vq31product26 = (int32_t)(uint32_t)((uint64_t)(vproduct26 + vq31rounding) >> 31); |
| const int32_t vq31product27 = (int32_t)(uint32_t)((uint64_t)(vproduct27 + vq31rounding) >> 31); |
| const int32_t vq31product30 = (int32_t)(uint32_t)((uint64_t)(vproduct30 + vq31rounding) >> 31); |
| const int32_t vq31product31 = (int32_t)(uint32_t)((uint64_t)(vproduct31 + vq31rounding) >> 31); |
| const int32_t vq31product32 = (int32_t)(uint32_t)((uint64_t)(vproduct32 + vq31rounding) >> 31); |
| const int32_t vq31product33 = (int32_t)(uint32_t)((uint64_t)(vproduct33 + vq31rounding) >> 31); |
| const int32_t vq31product34 = (int32_t)(uint32_t)((uint64_t)(vproduct34 + vq31rounding) >> 31); |
| const int32_t vq31product35 = (int32_t)(uint32_t)((uint64_t)(vproduct35 + vq31rounding) >> 31); |
| const int32_t vq31product36 = (int32_t)(uint32_t)((uint64_t)(vproduct36 + vq31rounding) >> 31); |
| const int32_t vq31product37 = (int32_t)(uint32_t)((uint64_t)(vproduct37 + vq31rounding) >> 31); |
| const int32_t vq31product40 = (int32_t)(uint32_t)((uint64_t)(vproduct40 + vq31rounding) >> 31); |
| const int32_t vq31product41 = (int32_t)(uint32_t)((uint64_t)(vproduct41 + vq31rounding) >> 31); |
| const int32_t vq31product42 = (int32_t)(uint32_t)((uint64_t)(vproduct42 + vq31rounding) >> 31); |
| const int32_t vq31product43 = (int32_t)(uint32_t)((uint64_t)(vproduct43 + vq31rounding) >> 31); |
| const int32_t vq31product44 = (int32_t)(uint32_t)((uint64_t)(vproduct44 + vq31rounding) >> 31); |
| const int32_t vq31product45 = (int32_t)(uint32_t)((uint64_t)(vproduct45 + vq31rounding) >> 31); |
| const int32_t vq31product46 = (int32_t)(uint32_t)((uint64_t)(vproduct46 + vq31rounding) >> 31); |
| const int32_t vq31product47 = (int32_t)(uint32_t)((uint64_t)(vproduct47 + vq31rounding) >> 31); |
| const int32_t vq31product50 = (int32_t)(uint32_t)((uint64_t)(vproduct50 + vq31rounding) >> 31); |
| const int32_t vq31product51 = (int32_t)(uint32_t)((uint64_t)(vproduct51 + vq31rounding) >> 31); |
| const int32_t vq31product52 = (int32_t)(uint32_t)((uint64_t)(vproduct52 + vq31rounding) >> 31); |
| const int32_t vq31product53 = (int32_t)(uint32_t)((uint64_t)(vproduct53 + vq31rounding) >> 31); |
| const int32_t vq31product54 = (int32_t)(uint32_t)((uint64_t)(vproduct54 + vq31rounding) >> 31); |
| const int32_t vq31product55 = (int32_t)(uint32_t)((uint64_t)(vproduct55 + vq31rounding) >> 31); |
| const int32_t vq31product56 = (int32_t)(uint32_t)((uint64_t)(vproduct56 + vq31rounding) >> 31); |
| const int32_t vq31product57 = (int32_t)(uint32_t)((uint64_t)(vproduct57 + vq31rounding) >> 31); |
| const int32_t vq31product60 = (int32_t)(uint32_t)((uint64_t)(vproduct60 + vq31rounding) >> 31); |
| const int32_t vq31product61 = (int32_t)(uint32_t)((uint64_t)(vproduct61 + vq31rounding) >> 31); |
| const int32_t vq31product62 = (int32_t)(uint32_t)((uint64_t)(vproduct62 + vq31rounding) >> 31); |
| const int32_t vq31product63 = (int32_t)(uint32_t)((uint64_t)(vproduct63 + vq31rounding) >> 31); |
| const int32_t vq31product64 = (int32_t)(uint32_t)((uint64_t)(vproduct64 + vq31rounding) >> 31); |
| const int32_t vq31product65 = (int32_t)(uint32_t)((uint64_t)(vproduct65 + vq31rounding) >> 31); |
| const int32_t vq31product66 = (int32_t)(uint32_t)((uint64_t)(vproduct66 + vq31rounding) >> 31); |
| const int32_t vq31product67 = (int32_t)(uint32_t)((uint64_t)(vproduct67 + vq31rounding) >> 31); |
| const int32_t vq31product70 = (int32_t)(uint32_t)((uint64_t)(vproduct70 + vq31rounding) >> 31); |
| const int32_t vq31product71 = (int32_t)(uint32_t)((uint64_t)(vproduct71 + vq31rounding) >> 31); |
| const int32_t vq31product72 = (int32_t)(uint32_t)((uint64_t)(vproduct72 + vq31rounding) >> 31); |
| const int32_t vq31product73 = (int32_t)(uint32_t)((uint64_t)(vproduct73 + vq31rounding) >> 31); |
| const int32_t vq31product74 = (int32_t)(uint32_t)((uint64_t)(vproduct74 + vq31rounding) >> 31); |
| const int32_t vq31product75 = (int32_t)(uint32_t)((uint64_t)(vproduct75 + vq31rounding) >> 31); |
| const int32_t vq31product76 = (int32_t)(uint32_t)((uint64_t)(vproduct76 + vq31rounding) >> 31); |
| const int32_t vq31product77 = (int32_t)(uint32_t)((uint64_t)(vproduct77 + vq31rounding) >> 31); |
| |
| const int32_t vremainder00 = (vq31product00 & vremainder_mask) - (int32_t)(vq31product00 < 0); |
| const int32_t vremainder01 = (vq31product01 & vremainder_mask) - (int32_t)(vq31product01 < 0); |
| const int32_t vremainder02 = (vq31product02 & vremainder_mask) - (int32_t)(vq31product02 < 0); |
| const int32_t vremainder03 = (vq31product03 & vremainder_mask) - (int32_t)(vq31product03 < 0); |
| const int32_t vremainder04 = (vq31product04 & vremainder_mask) - (int32_t)(vq31product04 < 0); |
| const int32_t vremainder05 = (vq31product05 & vremainder_mask) - (int32_t)(vq31product05 < 0); |
| const int32_t vremainder06 = (vq31product06 & vremainder_mask) - (int32_t)(vq31product06 < 0); |
| const int32_t vremainder07 = (vq31product07 & vremainder_mask) - (int32_t)(vq31product07 < 0); |
| const int32_t vremainder10 = (vq31product10 & vremainder_mask) - (int32_t)(vq31product10 < 0); |
| const int32_t vremainder11 = (vq31product11 & vremainder_mask) - (int32_t)(vq31product11 < 0); |
| const int32_t vremainder12 = (vq31product12 & vremainder_mask) - (int32_t)(vq31product12 < 0); |
| const int32_t vremainder13 = (vq31product13 & vremainder_mask) - (int32_t)(vq31product13 < 0); |
| const int32_t vremainder14 = (vq31product14 & vremainder_mask) - (int32_t)(vq31product14 < 0); |
| const int32_t vremainder15 = (vq31product15 & vremainder_mask) - (int32_t)(vq31product15 < 0); |
| const int32_t vremainder16 = (vq31product16 & vremainder_mask) - (int32_t)(vq31product16 < 0); |
| const int32_t vremainder17 = (vq31product17 & vremainder_mask) - (int32_t)(vq31product17 < 0); |
| const int32_t vremainder20 = (vq31product20 & vremainder_mask) - (int32_t)(vq31product20 < 0); |
| const int32_t vremainder21 = (vq31product21 & vremainder_mask) - (int32_t)(vq31product21 < 0); |
| const int32_t vremainder22 = (vq31product22 & vremainder_mask) - (int32_t)(vq31product22 < 0); |
| const int32_t vremainder23 = (vq31product23 & vremainder_mask) - (int32_t)(vq31product23 < 0); |
| const int32_t vremainder24 = (vq31product24 & vremainder_mask) - (int32_t)(vq31product24 < 0); |
| const int32_t vremainder25 = (vq31product25 & vremainder_mask) - (int32_t)(vq31product25 < 0); |
| const int32_t vremainder26 = (vq31product26 & vremainder_mask) - (int32_t)(vq31product26 < 0); |
| const int32_t vremainder27 = (vq31product27 & vremainder_mask) - (int32_t)(vq31product27 < 0); |
| const int32_t vremainder30 = (vq31product30 & vremainder_mask) - (int32_t)(vq31product30 < 0); |
| const int32_t vremainder31 = (vq31product31 & vremainder_mask) - (int32_t)(vq31product31 < 0); |
| const int32_t vremainder32 = (vq31product32 & vremainder_mask) - (int32_t)(vq31product32 < 0); |
| const int32_t vremainder33 = (vq31product33 & vremainder_mask) - (int32_t)(vq31product33 < 0); |
| const int32_t vremainder34 = (vq31product34 & vremainder_mask) - (int32_t)(vq31product34 < 0); |
| const int32_t vremainder35 = (vq31product35 & vremainder_mask) - (int32_t)(vq31product35 < 0); |
| const int32_t vremainder36 = (vq31product36 & vremainder_mask) - (int32_t)(vq31product36 < 0); |
| const int32_t vremainder37 = (vq31product37 & vremainder_mask) - (int32_t)(vq31product37 < 0); |
| const int32_t vremainder40 = (vq31product40 & vremainder_mask) - (int32_t)(vq31product40 < 0); |
| const int32_t vremainder41 = (vq31product41 & vremainder_mask) - (int32_t)(vq31product41 < 0); |
| const int32_t vremainder42 = (vq31product42 & vremainder_mask) - (int32_t)(vq31product42 < 0); |
| const int32_t vremainder43 = (vq31product43 & vremainder_mask) - (int32_t)(vq31product43 < 0); |
| const int32_t vremainder44 = (vq31product44 & vremainder_mask) - (int32_t)(vq31product44 < 0); |
| const int32_t vremainder45 = (vq31product45 & vremainder_mask) - (int32_t)(vq31product45 < 0); |
| const int32_t vremainder46 = (vq31product46 & vremainder_mask) - (int32_t)(vq31product46 < 0); |
| const int32_t vremainder47 = (vq31product47 & vremainder_mask) - (int32_t)(vq31product47 < 0); |
| const int32_t vremainder50 = (vq31product50 & vremainder_mask) - (int32_t)(vq31product50 < 0); |
| const int32_t vremainder51 = (vq31product51 & vremainder_mask) - (int32_t)(vq31product51 < 0); |
| const int32_t vremainder52 = (vq31product52 & vremainder_mask) - (int32_t)(vq31product52 < 0); |
| const int32_t vremainder53 = (vq31product53 & vremainder_mask) - (int32_t)(vq31product53 < 0); |
| const int32_t vremainder54 = (vq31product54 & vremainder_mask) - (int32_t)(vq31product54 < 0); |
| const int32_t vremainder55 = (vq31product55 & vremainder_mask) - (int32_t)(vq31product55 < 0); |
| const int32_t vremainder56 = (vq31product56 & vremainder_mask) - (int32_t)(vq31product56 < 0); |
| const int32_t vremainder57 = (vq31product57 & vremainder_mask) - (int32_t)(vq31product57 < 0); |
| const int32_t vremainder60 = (vq31product60 & vremainder_mask) - (int32_t)(vq31product60 < 0); |
| const int32_t vremainder61 = (vq31product61 & vremainder_mask) - (int32_t)(vq31product61 < 0); |
| const int32_t vremainder62 = (vq31product62 & vremainder_mask) - (int32_t)(vq31product62 < 0); |
| const int32_t vremainder63 = (vq31product63 & vremainder_mask) - (int32_t)(vq31product63 < 0); |
| const int32_t vremainder64 = (vq31product64 & vremainder_mask) - (int32_t)(vq31product64 < 0); |
| const int32_t vremainder65 = (vq31product65 & vremainder_mask) - (int32_t)(vq31product65 < 0); |
| const int32_t vremainder66 = (vq31product66 & vremainder_mask) - (int32_t)(vq31product66 < 0); |
| const int32_t vremainder67 = (vq31product67 & vremainder_mask) - (int32_t)(vq31product67 < 0); |
| const int32_t vremainder70 = (vq31product70 & vremainder_mask) - (int32_t)(vq31product70 < 0); |
| const int32_t vremainder71 = (vq31product71 & vremainder_mask) - (int32_t)(vq31product71 < 0); |
| const int32_t vremainder72 = (vq31product72 & vremainder_mask) - (int32_t)(vq31product72 < 0); |
| const int32_t vremainder73 = (vq31product73 & vremainder_mask) - (int32_t)(vq31product73 < 0); |
| const int32_t vremainder74 = (vq31product74 & vremainder_mask) - (int32_t)(vq31product74 < 0); |
| const int32_t vremainder75 = (vq31product75 & vremainder_mask) - (int32_t)(vq31product75 < 0); |
| const int32_t vremainder76 = (vq31product76 & vremainder_mask) - (int32_t)(vq31product76 < 0); |
| const int32_t vremainder77 = (vq31product77 & vremainder_mask) - (int32_t)(vq31product77 < 0); |
| |
| int32_t vout00 = asr_s32(vq31product00, vshift) + (int32_t)(vremainder00 > vremainder_threshold); |
| int32_t vout01 = asr_s32(vq31product01, vshift) + (int32_t)(vremainder01 > vremainder_threshold); |
| int32_t vout02 = asr_s32(vq31product02, vshift) + (int32_t)(vremainder02 > vremainder_threshold); |
| int32_t vout03 = asr_s32(vq31product03, vshift) + (int32_t)(vremainder03 > vremainder_threshold); |
| int32_t vout04 = asr_s32(vq31product04, vshift) + (int32_t)(vremainder04 > vremainder_threshold); |
| int32_t vout05 = asr_s32(vq31product05, vshift) + (int32_t)(vremainder05 > vremainder_threshold); |
| int32_t vout06 = asr_s32(vq31product06, vshift) + (int32_t)(vremainder06 > vremainder_threshold); |
| int32_t vout07 = asr_s32(vq31product07, vshift) + (int32_t)(vremainder07 > vremainder_threshold); |
| int32_t vout10 = asr_s32(vq31product10, vshift) + (int32_t)(vremainder10 > vremainder_threshold); |
| int32_t vout11 = asr_s32(vq31product11, vshift) + (int32_t)(vremainder11 > vremainder_threshold); |
| int32_t vout12 = asr_s32(vq31product12, vshift) + (int32_t)(vremainder12 > vremainder_threshold); |
| int32_t vout13 = asr_s32(vq31product13, vshift) + (int32_t)(vremainder13 > vremainder_threshold); |
| int32_t vout14 = asr_s32(vq31product14, vshift) + (int32_t)(vremainder14 > vremainder_threshold); |
| int32_t vout15 = asr_s32(vq31product15, vshift) + (int32_t)(vremainder15 > vremainder_threshold); |
| int32_t vout16 = asr_s32(vq31product16, vshift) + (int32_t)(vremainder16 > vremainder_threshold); |
| int32_t vout17 = asr_s32(vq31product17, vshift) + (int32_t)(vremainder17 > vremainder_threshold); |
| int32_t vout20 = asr_s32(vq31product20, vshift) + (int32_t)(vremainder20 > vremainder_threshold); |
| int32_t vout21 = asr_s32(vq31product21, vshift) + (int32_t)(vremainder21 > vremainder_threshold); |
| int32_t vout22 = asr_s32(vq31product22, vshift) + (int32_t)(vremainder22 > vremainder_threshold); |
| int32_t vout23 = asr_s32(vq31product23, vshift) + (int32_t)(vremainder23 > vremainder_threshold); |
| int32_t vout24 = asr_s32(vq31product24, vshift) + (int32_t)(vremainder24 > vremainder_threshold); |
| int32_t vout25 = asr_s32(vq31product25, vshift) + (int32_t)(vremainder25 > vremainder_threshold); |
| int32_t vout26 = asr_s32(vq31product26, vshift) + (int32_t)(vremainder26 > vremainder_threshold); |
| int32_t vout27 = asr_s32(vq31product27, vshift) + (int32_t)(vremainder27 > vremainder_threshold); |
| int32_t vout30 = asr_s32(vq31product30, vshift) + (int32_t)(vremainder30 > vremainder_threshold); |
| int32_t vout31 = asr_s32(vq31product31, vshift) + (int32_t)(vremainder31 > vremainder_threshold); |
| int32_t vout32 = asr_s32(vq31product32, vshift) + (int32_t)(vremainder32 > vremainder_threshold); |
| int32_t vout33 = asr_s32(vq31product33, vshift) + (int32_t)(vremainder33 > vremainder_threshold); |
| int32_t vout34 = asr_s32(vq31product34, vshift) + (int32_t)(vremainder34 > vremainder_threshold); |
| int32_t vout35 = asr_s32(vq31product35, vshift) + (int32_t)(vremainder35 > vremainder_threshold); |
| int32_t vout36 = asr_s32(vq31product36, vshift) + (int32_t)(vremainder36 > vremainder_threshold); |
| int32_t vout37 = asr_s32(vq31product37, vshift) + (int32_t)(vremainder37 > vremainder_threshold); |
| int32_t vout40 = asr_s32(vq31product40, vshift) + (int32_t)(vremainder40 > vremainder_threshold); |
| int32_t vout41 = asr_s32(vq31product41, vshift) + (int32_t)(vremainder41 > vremainder_threshold); |
| int32_t vout42 = asr_s32(vq31product42, vshift) + (int32_t)(vremainder42 > vremainder_threshold); |
| int32_t vout43 = asr_s32(vq31product43, vshift) + (int32_t)(vremainder43 > vremainder_threshold); |
| int32_t vout44 = asr_s32(vq31product44, vshift) + (int32_t)(vremainder44 > vremainder_threshold); |
| int32_t vout45 = asr_s32(vq31product45, vshift) + (int32_t)(vremainder45 > vremainder_threshold); |
| int32_t vout46 = asr_s32(vq31product46, vshift) + (int32_t)(vremainder46 > vremainder_threshold); |
| int32_t vout47 = asr_s32(vq31product47, vshift) + (int32_t)(vremainder47 > vremainder_threshold); |
| int32_t vout50 = asr_s32(vq31product50, vshift) + (int32_t)(vremainder50 > vremainder_threshold); |
| int32_t vout51 = asr_s32(vq31product51, vshift) + (int32_t)(vremainder51 > vremainder_threshold); |
| int32_t vout52 = asr_s32(vq31product52, vshift) + (int32_t)(vremainder52 > vremainder_threshold); |
| int32_t vout53 = asr_s32(vq31product53, vshift) + (int32_t)(vremainder53 > vremainder_threshold); |
| int32_t vout54 = asr_s32(vq31product54, vshift) + (int32_t)(vremainder54 > vremainder_threshold); |
| int32_t vout55 = asr_s32(vq31product55, vshift) + (int32_t)(vremainder55 > vremainder_threshold); |
| int32_t vout56 = asr_s32(vq31product56, vshift) + (int32_t)(vremainder56 > vremainder_threshold); |
| int32_t vout57 = asr_s32(vq31product57, vshift) + (int32_t)(vremainder57 > vremainder_threshold); |
| int32_t vout60 = asr_s32(vq31product60, vshift) + (int32_t)(vremainder60 > vremainder_threshold); |
| int32_t vout61 = asr_s32(vq31product61, vshift) + (int32_t)(vremainder61 > vremainder_threshold); |
| int32_t vout62 = asr_s32(vq31product62, vshift) + (int32_t)(vremainder62 > vremainder_threshold); |
| int32_t vout63 = asr_s32(vq31product63, vshift) + (int32_t)(vremainder63 > vremainder_threshold); |
| int32_t vout64 = asr_s32(vq31product64, vshift) + (int32_t)(vremainder64 > vremainder_threshold); |
| int32_t vout65 = asr_s32(vq31product65, vshift) + (int32_t)(vremainder65 > vremainder_threshold); |
| int32_t vout66 = asr_s32(vq31product66, vshift) + (int32_t)(vremainder66 > vremainder_threshold); |
| int32_t vout67 = asr_s32(vq31product67, vshift) + (int32_t)(vremainder67 > vremainder_threshold); |
| int32_t vout70 = asr_s32(vq31product70, vshift) + (int32_t)(vremainder70 > vremainder_threshold); |
| int32_t vout71 = asr_s32(vq31product71, vshift) + (int32_t)(vremainder71 > vremainder_threshold); |
| int32_t vout72 = asr_s32(vq31product72, vshift) + (int32_t)(vremainder72 > vremainder_threshold); |
| int32_t vout73 = asr_s32(vq31product73, vshift) + (int32_t)(vremainder73 > vremainder_threshold); |
| int32_t vout74 = asr_s32(vq31product74, vshift) + (int32_t)(vremainder74 > vremainder_threshold); |
| int32_t vout75 = asr_s32(vq31product75, vshift) + (int32_t)(vremainder75 > vremainder_threshold); |
| int32_t vout76 = asr_s32(vq31product76, vshift) + (int32_t)(vremainder76 > vremainder_threshold); |
| int32_t vout77 = asr_s32(vq31product77, vshift) + (int32_t)(vremainder77 > vremainder_threshold); |
| |
| vout00 = vout00 < voutput_min ? voutput_min : vout00; |
| vout01 = vout01 < voutput_min ? voutput_min : vout01; |
| vout02 = vout02 < voutput_min ? voutput_min : vout02; |
| vout03 = vout03 < voutput_min ? voutput_min : vout03; |
| vout04 = vout04 < voutput_min ? voutput_min : vout04; |
| vout05 = vout05 < voutput_min ? voutput_min : vout05; |
| vout06 = vout06 < voutput_min ? voutput_min : vout06; |
| vout07 = vout07 < voutput_min ? voutput_min : vout07; |
| vout10 = vout10 < voutput_min ? voutput_min : vout10; |
| vout11 = vout11 < voutput_min ? voutput_min : vout11; |
| vout12 = vout12 < voutput_min ? voutput_min : vout12; |
| vout13 = vout13 < voutput_min ? voutput_min : vout13; |
| vout14 = vout14 < voutput_min ? voutput_min : vout14; |
| vout15 = vout15 < voutput_min ? voutput_min : vout15; |
| vout16 = vout16 < voutput_min ? voutput_min : vout16; |
| vout17 = vout17 < voutput_min ? voutput_min : vout17; |
| vout20 = vout20 < voutput_min ? voutput_min : vout20; |
| vout21 = vout21 < voutput_min ? voutput_min : vout21; |
| vout22 = vout22 < voutput_min ? voutput_min : vout22; |
| vout23 = vout23 < voutput_min ? voutput_min : vout23; |
| vout24 = vout24 < voutput_min ? voutput_min : vout24; |
| vout25 = vout25 < voutput_min ? voutput_min : vout25; |
| vout26 = vout26 < voutput_min ? voutput_min : vout26; |
| vout27 = vout27 < voutput_min ? voutput_min : vout27; |
| vout30 = vout30 < voutput_min ? voutput_min : vout30; |
| vout31 = vout31 < voutput_min ? voutput_min : vout31; |
| vout32 = vout32 < voutput_min ? voutput_min : vout32; |
| vout33 = vout33 < voutput_min ? voutput_min : vout33; |
| vout34 = vout34 < voutput_min ? voutput_min : vout34; |
| vout35 = vout35 < voutput_min ? voutput_min : vout35; |
| vout36 = vout36 < voutput_min ? voutput_min : vout36; |
| vout37 = vout37 < voutput_min ? voutput_min : vout37; |
| vout40 = vout40 < voutput_min ? voutput_min : vout40; |
| vout41 = vout41 < voutput_min ? voutput_min : vout41; |
| vout42 = vout42 < voutput_min ? voutput_min : vout42; |
| vout43 = vout43 < voutput_min ? voutput_min : vout43; |
| vout44 = vout44 < voutput_min ? voutput_min : vout44; |
| vout45 = vout45 < voutput_min ? voutput_min : vout45; |
| vout46 = vout46 < voutput_min ? voutput_min : vout46; |
| vout47 = vout47 < voutput_min ? voutput_min : vout47; |
| vout50 = vout50 < voutput_min ? voutput_min : vout50; |
| vout51 = vout51 < voutput_min ? voutput_min : vout51; |
| vout52 = vout52 < voutput_min ? voutput_min : vout52; |
| vout53 = vout53 < voutput_min ? voutput_min : vout53; |
| vout54 = vout54 < voutput_min ? voutput_min : vout54; |
| vout55 = vout55 < voutput_min ? voutput_min : vout55; |
| vout56 = vout56 < voutput_min ? voutput_min : vout56; |
| vout57 = vout57 < voutput_min ? voutput_min : vout57; |
| vout60 = vout60 < voutput_min ? voutput_min : vout60; |
| vout61 = vout61 < voutput_min ? voutput_min : vout61; |
| vout62 = vout62 < voutput_min ? voutput_min : vout62; |
| vout63 = vout63 < voutput_min ? voutput_min : vout63; |
| vout64 = vout64 < voutput_min ? voutput_min : vout64; |
| vout65 = vout65 < voutput_min ? voutput_min : vout65; |
| vout66 = vout66 < voutput_min ? voutput_min : vout66; |
| vout67 = vout67 < voutput_min ? voutput_min : vout67; |
| vout70 = vout70 < voutput_min ? voutput_min : vout70; |
| vout71 = vout71 < voutput_min ? voutput_min : vout71; |
| vout72 = vout72 < voutput_min ? voutput_min : vout72; |
| vout73 = vout73 < voutput_min ? voutput_min : vout73; |
| vout74 = vout74 < voutput_min ? voutput_min : vout74; |
| vout75 = vout75 < voutput_min ? voutput_min : vout75; |
| vout76 = vout76 < voutput_min ? voutput_min : vout76; |
| vout77 = vout77 < voutput_min ? voutput_min : vout77; |
| |
| vout00 = vout00 > voutput_max ? voutput_max : vout00; |
| vout01 = vout01 > voutput_max ? voutput_max : vout01; |
| vout02 = vout02 > voutput_max ? voutput_max : vout02; |
| vout03 = vout03 > voutput_max ? voutput_max : vout03; |
| vout04 = vout04 > voutput_max ? voutput_max : vout04; |
| vout05 = vout05 > voutput_max ? voutput_max : vout05; |
| vout06 = vout06 > voutput_max ? voutput_max : vout06; |
| vout07 = vout07 > voutput_max ? voutput_max : vout07; |
| vout10 = vout10 > voutput_max ? voutput_max : vout10; |
| vout11 = vout11 > voutput_max ? voutput_max : vout11; |
| vout12 = vout12 > voutput_max ? voutput_max : vout12; |
| vout13 = vout13 > voutput_max ? voutput_max : vout13; |
| vout14 = vout14 > voutput_max ? voutput_max : vout14; |
| vout15 = vout15 > voutput_max ? voutput_max : vout15; |
| vout16 = vout16 > voutput_max ? voutput_max : vout16; |
| vout17 = vout17 > voutput_max ? voutput_max : vout17; |
| vout20 = vout20 > voutput_max ? voutput_max : vout20; |
| vout21 = vout21 > voutput_max ? voutput_max : vout21; |
| vout22 = vout22 > voutput_max ? voutput_max : vout22; |
| vout23 = vout23 > voutput_max ? voutput_max : vout23; |
| vout24 = vout24 > voutput_max ? voutput_max : vout24; |
| vout25 = vout25 > voutput_max ? voutput_max : vout25; |
| vout26 = vout26 > voutput_max ? voutput_max : vout26; |
| vout27 = vout27 > voutput_max ? voutput_max : vout27; |
| vout30 = vout30 > voutput_max ? voutput_max : vout30; |
| vout31 = vout31 > voutput_max ? voutput_max : vout31; |
| vout32 = vout32 > voutput_max ? voutput_max : vout32; |
| vout33 = vout33 > voutput_max ? voutput_max : vout33; |
| vout34 = vout34 > voutput_max ? voutput_max : vout34; |
| vout35 = vout35 > voutput_max ? voutput_max : vout35; |
| vout36 = vout36 > voutput_max ? voutput_max : vout36; |
| vout37 = vout37 > voutput_max ? voutput_max : vout37; |
| vout40 = vout40 > voutput_max ? voutput_max : vout40; |
| vout41 = vout41 > voutput_max ? voutput_max : vout41; |
| vout42 = vout42 > voutput_max ? voutput_max : vout42; |
| vout43 = vout43 > voutput_max ? voutput_max : vout43; |
| vout44 = vout44 > voutput_max ? voutput_max : vout44; |
| vout45 = vout45 > voutput_max ? voutput_max : vout45; |
| vout46 = vout46 > voutput_max ? voutput_max : vout46; |
| vout47 = vout47 > voutput_max ? voutput_max : vout47; |
| vout50 = vout50 > voutput_max ? voutput_max : vout50; |
| vout51 = vout51 > voutput_max ? voutput_max : vout51; |
| vout52 = vout52 > voutput_max ? voutput_max : vout52; |
| vout53 = vout53 > voutput_max ? voutput_max : vout53; |
| vout54 = vout54 > voutput_max ? voutput_max : vout54; |
| vout55 = vout55 > voutput_max ? voutput_max : vout55; |
| vout56 = vout56 > voutput_max ? voutput_max : vout56; |
| vout57 = vout57 > voutput_max ? voutput_max : vout57; |
| vout60 = vout60 > voutput_max ? voutput_max : vout60; |
| vout61 = vout61 > voutput_max ? voutput_max : vout61; |
| vout62 = vout62 > voutput_max ? voutput_max : vout62; |
| vout63 = vout63 > voutput_max ? voutput_max : vout63; |
| vout64 = vout64 > voutput_max ? voutput_max : vout64; |
| vout65 = vout65 > voutput_max ? voutput_max : vout65; |
| vout66 = vout66 > voutput_max ? voutput_max : vout66; |
| vout67 = vout67 > voutput_max ? voutput_max : vout67; |
| vout70 = vout70 > voutput_max ? voutput_max : vout70; |
| vout71 = vout71 > voutput_max ? voutput_max : vout71; |
| vout72 = vout72 > voutput_max ? voutput_max : vout72; |
| vout73 = vout73 > voutput_max ? voutput_max : vout73; |
| vout74 = vout74 > voutput_max ? voutput_max : vout74; |
| vout75 = vout75 > voutput_max ? voutput_max : vout75; |
| vout76 = vout76 > voutput_max ? voutput_max : vout76; |
| vout77 = vout77 > voutput_max ? voutput_max : vout77; |
| |
| vout00 += voutput_zero_point; |
| vout01 += voutput_zero_point; |
| vout02 += voutput_zero_point; |
| vout03 += voutput_zero_point; |
| vout04 += voutput_zero_point; |
| vout05 += voutput_zero_point; |
| vout06 += voutput_zero_point; |
| vout07 += voutput_zero_point; |
| vout10 += voutput_zero_point; |
| vout11 += voutput_zero_point; |
| vout12 += voutput_zero_point; |
| vout13 += voutput_zero_point; |
| vout14 += voutput_zero_point; |
| vout15 += voutput_zero_point; |
| vout16 += voutput_zero_point; |
| vout17 += voutput_zero_point; |
| vout20 += voutput_zero_point; |
| vout21 += voutput_zero_point; |
| vout22 += voutput_zero_point; |
| vout23 += voutput_zero_point; |
| vout24 += voutput_zero_point; |
| vout25 += voutput_zero_point; |
| vout26 += voutput_zero_point; |
| vout27 += voutput_zero_point; |
| vout30 += voutput_zero_point; |
| vout31 += voutput_zero_point; |
| vout32 += voutput_zero_point; |
| vout33 += voutput_zero_point; |
| vout34 += voutput_zero_point; |
| vout35 += voutput_zero_point; |
| vout36 += voutput_zero_point; |
| vout37 += voutput_zero_point; |
| vout40 += voutput_zero_point; |
| vout41 += voutput_zero_point; |
| vout42 += voutput_zero_point; |
| vout43 += voutput_zero_point; |
| vout44 += voutput_zero_point; |
| vout45 += voutput_zero_point; |
| vout46 += voutput_zero_point; |
| vout47 += voutput_zero_point; |
| vout50 += voutput_zero_point; |
| vout51 += voutput_zero_point; |
| vout52 += voutput_zero_point; |
| vout53 += voutput_zero_point; |
| vout54 += voutput_zero_point; |
| vout55 += voutput_zero_point; |
| vout56 += voutput_zero_point; |
| vout57 += voutput_zero_point; |
| vout60 += voutput_zero_point; |
| vout61 += voutput_zero_point; |
| vout62 += voutput_zero_point; |
| vout63 += voutput_zero_point; |
| vout64 += voutput_zero_point; |
| vout65 += voutput_zero_point; |
| vout66 += voutput_zero_point; |
| vout67 += voutput_zero_point; |
| vout70 += voutput_zero_point; |
| vout71 += voutput_zero_point; |
| vout72 += voutput_zero_point; |
| vout73 += voutput_zero_point; |
| vout74 += voutput_zero_point; |
| vout75 += voutput_zero_point; |
| vout76 += voutput_zero_point; |
| vout77 += voutput_zero_point; |
| |
| if XNN_LIKELY (nc >= 8) { |
| // Main case where the 8 columns fit in the destination. |
| c0[0] = vout00; |
| c0[1] = vout01; |
| c0[2] = vout02; |
| c0[3] = vout03; |
| c0[4] = vout04; |
| c0[5] = vout05; |
| c0[6] = vout06; |
| c0[7] = vout07; |
| c1[0] = vout10; |
| c1[1] = vout11; |
| c1[2] = vout12; |
| c1[3] = vout13; |
| c1[4] = vout14; |
| c1[5] = vout15; |
| c1[6] = vout16; |
| c1[7] = vout17; |
| c2[0] = vout20; |
| c2[1] = vout21; |
| c2[2] = vout22; |
| c2[3] = vout23; |
| c2[4] = vout24; |
| c2[5] = vout25; |
| c2[6] = vout26; |
| c2[7] = vout27; |
| c3[0] = vout30; |
| c3[1] = vout31; |
| c3[2] = vout32; |
| c3[3] = vout33; |
| c3[4] = vout34; |
| c3[5] = vout35; |
| c3[6] = vout36; |
| c3[7] = vout37; |
| c4[0] = vout40; |
| c4[1] = vout41; |
| c4[2] = vout42; |
| c4[3] = vout43; |
| c4[4] = vout44; |
| c4[5] = vout45; |
| c4[6] = vout46; |
| c4[7] = vout47; |
| c5[0] = vout50; |
| c5[1] = vout51; |
| c5[2] = vout52; |
| c5[3] = vout53; |
| c5[4] = vout54; |
| c5[5] = vout55; |
| c5[6] = vout56; |
| c5[7] = vout57; |
| c6[0] = vout60; |
| c6[1] = vout61; |
| c6[2] = vout62; |
| c6[3] = vout63; |
| c6[4] = vout64; |
| c6[5] = vout65; |
| c6[6] = vout66; |
| c6[7] = vout67; |
| c7[0] = vout70; |
| c7[1] = vout71; |
| c7[2] = vout72; |
| c7[3] = vout73; |
| c7[4] = vout74; |
| c7[5] = vout75; |
| c7[6] = vout76; |
| c7[7] = vout77; |
| |
| // Advance to the next 8 columns. |
| c0 = (uint8_t*)((uintptr_t) c0 + cn_stride); |
| c1 = (uint8_t*)((uintptr_t) c1 + cn_stride); |
| c2 = (uint8_t*)((uintptr_t) c2 + cn_stride); |
| c3 = (uint8_t*)((uintptr_t) c3 + cn_stride); |
| c4 = (uint8_t*)((uintptr_t) c4 + cn_stride); |
| c5 = (uint8_t*)((uintptr_t) c5 + cn_stride); |
| c6 = (uint8_t*)((uintptr_t) c6 + cn_stride); |
| c7 = (uint8_t*)((uintptr_t) c7 + cn_stride); |
| |
| nc -= 8; |
| } else { |
| // Final case where not all of the 8 columns fit in the destination. |
| if (nc > 0) { |
| c0[0] = vout00; |
| c1[0] = vout10; |
| c2[0] = vout20; |
| c3[0] = vout30; |
| c4[0] = vout40; |
| c5[0] = vout50; |
| c6[0] = vout60; |
| c7[0] = vout70; |
| } |
| if (nc > 1) { |
| c0[1] = vout01; |
| c1[1] = vout11; |
| c2[1] = vout21; |
| c3[1] = vout31; |
| c4[1] = vout41; |
| c5[1] = vout51; |
| c6[1] = vout61; |
| c7[1] = vout71; |
| } |
| if (nc > 2) { |
| c0[2] = vout02; |
| c1[2] = vout12; |
| c2[2] = vout22; |
| c3[2] = vout32; |
| c4[2] = vout42; |
| c5[2] = vout52; |
| c6[2] = vout62; |
| c7[2] = vout72; |
| } |
| if (nc > 3) { |
| c0[3] = vout03; |
| c1[3] = vout13; |
| c2[3] = vout23; |
| c3[3] = vout33; |
| c4[3] = vout43; |
| c5[3] = vout53; |
| c6[3] = vout63; |
| c7[3] = vout73; |
| } |
| if (nc > 4) { |
| c0[4] = vout04; |
| c1[4] = vout14; |
| c2[4] = vout24; |
| c3[4] = vout34; |
| c4[4] = vout44; |
| c5[4] = vout54; |
| c6[4] = vout64; |
| c7[4] = vout74; |
| } |
| if (nc > 5) { |
| c0[5] = vout05; |
| c1[5] = vout15; |
| c2[5] = vout25; |
| c3[5] = vout35; |
| c4[5] = vout45; |
| c5[5] = vout55; |
| c6[5] = vout65; |
| c7[5] = vout75; |
| } |
| if (nc > 6) { |
| c0[6] = vout06; |
| c1[6] = vout16; |
| c2[6] = vout26; |
| c3[6] = vout36; |
| c4[6] = vout46; |
| c5[6] = vout56; |
| c6[6] = vout66; |
| c7[6] = vout76; |
| } |
| if (nc > 7) { |
| c0[7] = vout07; |
| c1[7] = vout17; |
| c2[7] = vout27; |
| c3[7] = vout37; |
| c4[7] = vout47; |
| c5[7] = vout57; |
| c6[7] = vout67; |
| c7[7] = vout77; |
| } |
| |
| nc = 0; |
| } |
| } while (nc != 0); |
| } |