/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// # What is "packing"?
//
// Before feeding data to the gemm kernels (the parts of Ruy that do lots
// of multiply-add operations), Ruy first performs a data transformation (which
// we call "packing") on the input matrices. This transformation has two main
// goals:
// - rearrange data into blocks that are a convenient size/layout for the gemm
//   kernels to consume. This helps make the memory access pattern of the gemm
//   kernel simpler and more contiguous, and puts the data in a layout most
//   convenient for specific arithmetic instructions in the gemm kernel.
// - compute row/column sums needed for handling quantization with non-symmetric
//   zero points.
//
// # Simplified algorithmic analysis of packing
//
// Packing is a relatively simple transformation which does a small constant
// amount of work on each element of an input matrix, and hence for an NxM
// matrix performs O(N*M) work. If N and M are of the same order, then this is
// O(N^2) work.
//
// An NxKxM matrix multiplication requires N*K*M multiply-accumulate
// operations. Note that if N, K, and M are all of the same order, then the
// number of multiply-accumulate operations is O(N^3).
//
// Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in
// the case of all dimensions being roughly the same order.
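//
// For example, with N = K = M = 512, packing both inputs touches on the order
// of 2 * 512 * 512 = 524288 elements, while the kernels perform
// 512 * 512 * 512 = 134217728 multiply-accumulate operations, so packing
// accounts for well under 1% of the arithmetic work.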
//
// # Packing cost can be significant
//
// When matrix * matrix multiplications begin to look more like matrix * vector
// multiplications, packing cost can become significant. We sometimes call these
// cases "gemv-like".
//
// Continuing the algorithmic analysis above, if we consider a case where an
// NxKxM matrix multiplication has either N = O(1) or M = O(1), then the
// situation is different. In this case, the multiply-accumulate work is only
// quadratic, so the quadratic cost of packing can become significant.
//
// Another way to say this is that the cost of packing an input matrix (either
// the LHS or RHS) is amortized across the non-depth dimension of the opposite
// input matrix. Thus, when the LHS has very few rows or the RHS has very few
// columns, the cost of packing the opposite input matrix can become
// significant.
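//
// For example, multiplying a 1000x1000 LHS by a 1000x4 RHS requires
// 1000 * 1000 * 4 = 4000000 multiply-accumulate operations, but packing the
// LHS alone touches 1000 * 1000 = 1000000 elements, already a quarter of the
// multiply-accumulate count, amortized over only 4 destination columns.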
//
// As a rough rule of thumb, the cost of packing starts to become significant
// when either N or M is below 32 (and other dimensions are hundreds), with very
// significant packing costs at 8 or below. This varies by data type, Path, and
// tuning, so these numbers are only rough guides.
//
// One practical use case that is affected by this is inference of
// fully connected neural network layers with a low batch size. The weight
// matrix (which is a constant for inference) is the one affected by significant
// packing cost.
//
// Ruy provides an API in ruy_advanced.h for advanced users to pre-pack
// input matrices that are affected by significant packing costs.
//
// # Implementation notes
//
// Ruy's packing routines always operate on a range of columns and can be
// applied to either the LHS or RHS. This is possible because Ruy internally
// implements a TrMul, so the accumulation along depth is done along columns of
// both the LHS and RHS (whereas for a normal Mul the accumulation along depth
// for the LHS is along rows). As another example, we are always computing
// column sums for quantization (and never row sums, since the LHS is
// transposed).
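//
// Concretely, the user-facing product Destination = LHS * RHS is performed
// internally as TrMul(Transpose(LHS), RHS), i.e.
//   Destination(r, c) = sum over k of Transpose(LHS)(k, r) * RHS(k, c),
// so the depth index k walks down a column of both stored operands, and a
// column sum of the transposed LHS corresponds to a row sum of the original
// LHS.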

#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_X86_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_X86_H_

#include <cstdint>
#include <cstring>
#include <type_traits>

#include "check_macros.h"
#include "common.h"
#include "internal_matrix.h"
#include "matrix.h"
#include "opt_set.h"
#include "pack_common.h"
#include "path.h"
#include "platform.h"
#include "profiler/instrumentation.h"
#include "tune.h"

namespace ruy {

#if RUY_PLATFORM(X86)
// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
// Optimization is not finished. In particular the dimensions of the kernel
// blocks can be changed as desired.
//
// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
void Pack8bitSse42(const std::int8_t* src_ptr, std::int8_t input_xor,
                   const std::int8_t* zerobuf, int src_stride,
                   int remaining_src_cols, int src_rows,
                   std::int8_t* packed_ptr, std::int32_t* sums_ptr);

template <typename Scalar>
struct PackImpl<Path::kSse42, FixedKernelLayout<Order::kColMajor, 4, 8>, Scalar,
                std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  using Layout = FixedKernelLayout<Order::kColMajor, 4, 8>;
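  // For uint8 inputs, XOR-ing each value with 0x80 flips the sign bit, which
  // reinterprets the value as an int8 shifted down by 128; int8 inputs are
  // XOR-ed with 0, a no-op.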
  static constexpr std::int8_t kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;

  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (SSE 4.2 8-bit)");

    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    std::int32_t* sums = packed_matrix->sums;
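    // zerobuf provides zero_point-valued padding data (pre-XOR-ed with
    // kInputXor to compensate for the XOR applied inside the packing kernel);
    // the kernel reads from it in place of source data when a block extends
    // past the last source column.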
    Scalar zerobuf[Layout::kCols * Layout::kRows];
    memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
           Layout::kCols * Layout::kRows * sizeof(Scalar));
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      std::int8_t* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      Pack8bitSse42(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
                    reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
                    remaining_src_cols, src_matrix.layout.rows, packed_ptr,
                    sums_ptr);
    }
  }
};

// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
// Optimization is not finished. In particular the dimensions of the kernel
// blocks can be changed as desired.
//
void PackFloatSse42(const float* src_ptr, const float* zerobuf, int src_stride,
                    int remaining_src_cols, int src_rows, float* packed_ptr);

template <>
struct PackImpl<Path::kSse42, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
                float, float> {
  using Layout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
  static void Run(Tuning, const Matrix<float>& src_matrix,
                  PackedMatrix<float>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (SSE 4.2 float)");

    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    const float zerobuf[Layout::kCols] = {
        0.0f};  // Remainder default inits to 0.0f.
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      int src_stride = src_matrix.layout.stride;
      const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      float* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      PackFloatSse42(src_ptr, zerobuf, src_stride, remaining_src_cols,
                     src_matrix.layout.rows, packed_ptr);
    }
  }
};

// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
void Pack8bitAvx2(const std::int8_t* src_ptr, std::int8_t input_xor,
                  const std::int8_t* zerobuf, int src_stride,
                  int remaining_src_cols, int src_rows, std::int8_t* packed_ptr,
                  std::int32_t* sums_ptr);

template <typename Scalar>
struct PackImpl<Path::kAvx2, FixedKernelLayout<Order::kColMajor, 4, 8>, Scalar,
                std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  using Layout = FixedKernelLayout<Order::kColMajor, 4, 8>;
  static constexpr std::int8_t kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;

  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (AVX2 8-bit)");

    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    std::int32_t* sums = packed_matrix->sums;
    Scalar zerobuf[Layout::kCols * Layout::kRows];
    memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
           Layout::kCols * Layout::kRows * sizeof(Scalar));
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      std::int8_t* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      Pack8bitAvx2(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
                   reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
                   remaining_src_cols, src_matrix.layout.rows, packed_ptr,
                   sums_ptr);
    }
  }
};

void PackFloatAvx2(const float* src_ptr, const float* zerobuf, int src_stride,
                   int remaining_src_cols, int src_rows, float* packed_ptr);

template <>
struct PackImpl<Path::kAvx2, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
                float, float> {
  using Layout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
  static void Run(Tuning, const Matrix<float>& src_matrix,
                  PackedMatrix<float>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (AVX2 float)");

    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    const float zerobuf[Layout::kCols] = {
        0.0f};  // Remainder default inits to 0.0f.
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      int src_stride = src_matrix.layout.stride;
      const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      float* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      PackFloatAvx2(src_ptr, zerobuf, src_stride, remaining_src_cols,
                    src_matrix.layout.rows, packed_ptr);
    }
  }
};

// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
void Pack8bitAvx512(const std::int8_t* src_ptr, std::int8_t input_xor,
                    const std::int8_t* zerobuf, int src_stride,
                    int remaining_src_cols, int src_rows,
                    std::int8_t* packed_ptr, std::int32_t* sums_ptr);

template <typename Scalar>
struct PackImpl<Path::kAvx512, FixedKernelLayout<Order::kColMajor, 4, 16>,
                Scalar, std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  using Layout = FixedKernelLayout<Order::kColMajor, 4, 16>;
  static constexpr int kHalfLayoutCols =
      8;  // Half the number of cols in a block.
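  // The zero buffer used for padding (see Run below) only spans
  // kHalfLayoutCols columns: the AVX-512 packing kernel works through a
  // 16-column block as two 8-column halves, so it never needs padding data
  // for more than one half at a time.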
  static constexpr std::int8_t kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;

  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (AVX-512 8-bit)");

    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    RUY_DCHECK_EQ(kHalfLayoutCols * 2, Layout::kCols);
    std::int32_t* sums = packed_matrix->sums;
    Scalar zerobuf[kHalfLayoutCols * Layout::kRows];
    memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
           kHalfLayoutCols * Layout::kRows * sizeof(Scalar));
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      std::int8_t* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      Pack8bitAvx512(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
                     reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
                     remaining_src_cols, src_matrix.layout.rows, packed_ptr,
                     sums_ptr);
    }
  }
};

void PackFloatAvx512(const float* src_ptr, const float* zerobuf, int src_stride,
                     int remaining_src_cols, int src_rows, float* packed_ptr);

template <>
struct PackImpl<Path::kAvx512, FixedKernelLayout<Order::kRowMajor, 1, 16>,
                float, float, float> {
  static void Run(Tuning, const Matrix<float>& src_matrix,
                  PackedMatrix<float>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (AVX-512 float)");
    using Layout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    const float zerobuf[Layout::kCols] = {
        0.0f};  // Remainder default inits to 0.0f.
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      int src_stride = src_matrix.layout.stride;
      const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      float* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      PackFloatAvx512(src_ptr, zerobuf, src_stride, remaining_src_cols,
                      src_matrix.layout.rows, packed_ptr);
    }
  }
};

// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
// Optimization is not finished. In particular the dimensions of the kernel
// blocks can be changed as desired.
//
// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
void Pack8bitAvxVnni(const std::int8_t* src_ptr, std::int8_t input_xor,
                     const std::int8_t* zerobuf, int src_stride,
                     int remaining_src_cols, int src_rows,
                     std::int8_t* packed_ptr, std::int32_t* sums_ptr);

template <typename Scalar>
struct PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kColMajor, 4, 16>,
                Scalar, std::int8_t, std::int32_t> {
  static_assert(std::is_same<Scalar, std::int8_t>::value ||
                    std::is_same<Scalar, std::uint8_t>::value,
                "");
  using Layout = FixedKernelLayout<Order::kColMajor, 4, 16>;
  static constexpr int kHalfLayoutCols =
      8;  // Half the number of cols in a block.
  static constexpr std::int8_t kInputXor =
      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;

  static void Run(Tuning tuning, const Matrix<Scalar>& src_matrix,
                  PackedMatrix<std::int8_t>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (AVX-512 VNNI 8-bit)");

    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    RUY_DCHECK_EQ(kHalfLayoutCols * 2, Layout::kCols);
    std::int32_t* sums = packed_matrix->sums;
    Scalar zerobuf[kHalfLayoutCols * Layout::kRows];
    memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
           kHalfLayoutCols * Layout::kRows * sizeof(Scalar));
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
      int src_stride = src_matrix.layout.stride;
      const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      std::int8_t* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      Pack8bitAvxVnni(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
                      reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
                      remaining_src_cols, src_matrix.layout.rows, packed_ptr,
                      sums_ptr);
    }
  }
};

// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
// Optimization is not finished. In particular the dimensions of the kernel
// blocks can be changed as desired.
//
void PackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
                      int src_stride, int remaining_src_cols, int src_rows,
                      float* packed_ptr);

template <>
struct PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kRowMajor, 1, 16>,
                float, float, float> {
  static void Run(Tuning, const Matrix<float>& src_matrix,
                  PackedMatrix<float>* packed_matrix, int start_col,
                  int end_col) {
    profiler::ScopeLabel label("Pack (AVX-512 VNNI float)");

    using Layout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
    RUY_DCHECK(IsColMajor(src_matrix.layout));
    RUY_DCHECK(IsColMajor(packed_matrix->layout));
    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
    const float zerobuf[Layout::kCols] = {
        0.0f};  // Remainder default inits to 0.0f.
    for (int block_col = start_col; block_col < end_col;
         block_col += Layout::kCols) {
      int src_stride = src_matrix.layout.stride;
      const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
      int remaining_src_cols = src_matrix.layout.cols - block_col;

      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
      float* packed_ptr =
          packed_matrix->data +
          packed_matrix->layout.stride * (block_col & block_col_mask);
      PackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
                       src_matrix.layout.rows, packed_ptr);
    }
  }
};
#endif  // RUY_PLATFORM(X86)

}  // namespace ruy

#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_X86_H_