blob: 790ad61820f73e02082bccc099c1f11ce34eadb9 [file] [log] [blame]
/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_BLOCK_MAP_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_BLOCK_MAP_H_
#include "path.h"
#include "side_pair.h"
namespace ruy {
enum class BlockMapTraversalOrder {
// Plain old row-by-row or column-by-column traversal.
kLinear,
// Fractal Z-order curve, https://en.wikipedia.org/wiki/Z-order_curve
kFractalZ,
// Variant of Z-order doing a U instead of a Z.
kFractalU
// TODO(benoitjacob) add Hilbert curve order. More complex decoding might be
// worth it.
};
// A BlockMap describes a tiling of a matrix, typically the destination matrix
// of a matrix multiplication computation. As is standard in matrix
// multiplication, a tile is called a "block".
//
// Ruy subdivides work by blocks of the destination matrix: each thread fully
// computes a block at once, then moves on to another block; each block is
// produced by a single thread.
//
// This ensures that the workloads for each block are mutually independent,
// which reduces synchronization requirements.
//
// Typically, a matrix multiplication will early on create a BlockMap by
// calling MakeBlockMap. It will then query the number of blocks in that
// BlockMap by calling NumBlocks. It will then create a single atomic integer
// counter indexing these blocks, called the 'index', and will distribute
// work to its N threads by ensuring that each thread works on disjoint sets
// of index values. For a given index value, the thread will call
// GetBlockByIndex to get the corresponding block, then GetBlockMatrixCoords
// to find the actual row and column numbers of this block.
//
// There are two nested levels of subdivision. On a local level, the matrix is
// tiled into a square NxN grid where N is a power of two, specifically:
// N = 2^num_blocks_base_log2.
//
// At a larger scale, around these blocks, there may be one further
// level of subdivision, in only one dimension: either along rows or along
// columns. That is used to handle arbitrarily rectangular matrices. The
// aforementioned high-level block grid is square, so it does not readily fit
// well very rectangular matrices.
//
// Taking together these two nested levels of subdivision, the effective
// tiling is by
// 2^(num_blocks_base_log2 + rows_rectangularness_log2)
// blocks in the row dimension, and by
// 2^(num_blocks_base_log2 + cols_rectangularness_log2)
// blocks in the column dimension. See NumBlocksOfRows, NumBlocksOfCols.
//
// Either rows_rectangularness_log2 or cols_rectangularness_log2 must be zero.
//
// Finally, this BlockMap is designed to operate under alignment constraints:
// two fields, kernel_rows and kernel_cols, describe the requested alignment
// of the effective grid in both dimensions. The idea is to feed matrix
// multiplication kernels with tiles that fit their width as much as possible.
// Of course, if rows (resp. cols) is not a multiple of kernel_rows (resp.
// kernel_cols) then some tile will have to have unaligned size. BlockMap
// will only allow that to happen in the last position along each axis, so
// as to minimize the overhead incurred onto the matrix multiplication kernels.
struct BlockMap {
// The number of threads to use (to distribute the blocks to).
int thread_count;
// The order in which to traverse the matrix of which this BlockMap represents
// a tiling (hereafter "the matrix").
BlockMapTraversalOrder traversal_order;
// The dimensions of the block_map, that is, of the destination
// matrix rounded up to next multiples of kernel_dims.
SidePair<int> dims;
// Log2 of the minimum number of subdivisions of the grid along either axis.
int num_blocks_base_log2;
// Log2 of the additional subdivision of the rows/columns axis.
SidePair<int> rectangularness_log2;
// Requested alignment of the subdivisions of the grid along the rows/columns
// axis.
SidePair<int> kernel_dims;
// Internal helper. Minimum number of rows/columns in each block.
SidePair<int> small_block_dims;
// Internal helper. Number of blocks along each dimension that need to have
// their size in that dimension be given by (small_block_dims + kernel_dims)
// instead of just small_block_dims.
SidePair<int> large_blocks;
};
// Create a BlockMap suitable for tiling the destination matrix in a
// matrix multiplication with the given parameters.
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
int tentative_thread_count, Path path,
int cache_friendly_traversal_threshold, BlockMap* block_map);
// Maps an integer index to a block position in the grid.
void GetBlockByIndex(const BlockMap& block_map, int index,
SidePair<int>* block);
// Given a block position in the grid, returns its actual
// position in the matrix that the BlockMap refers to in the dimension
// referred to by `side`: along rows if side==kLhs, along columns if
// side==kRhs.
void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
int* start, int* end);
// Given a block position in the grid, returns its actual
// position in the matrix that the BlockMap refers to in terms of
// actual row/column indices.
void GetBlockMatrixCoords(const BlockMap& block_map, const SidePair<int>& block,
SidePair<int>* start, SidePair<int>* end);
// Returns the number of grid subdivisions along the rows dimension (if
// side == kLhs) or columns dimension (if side == kRhs).
inline int NumBlocksPerSide(Side side, const BlockMap& block_map) {
return 1 << (block_map.num_blocks_base_log2 +
block_map.rectangularness_log2[side]);
}
// Returns the overall number of blocks in
// the BlockMap. The valid index values to pass to GetBlockByIndex are the
// integers from 0 to N-1 where N is the value returned here.
//
// Note that it is always true that
// NumBlocks == NumBlocksOfRows * NumBlocksOfCols
// because either rows_rectangularness_log2 or cols_rectangularness_log2 is 0.
inline int NumBlocks(const BlockMap& block_map) {
return 1 << (2 * block_map.num_blocks_base_log2 +
block_map.rectangularness_log2[Side::kLhs] +
block_map.rectangularness_log2[Side::kRhs]);
}
} // namespace ruy
#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_BLOCK_MAP_H_