| /* |
| * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be> |
| * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io> |
| * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com> |
| * Copyright (c) 2019 Collabora, Ltd. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sub license, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the |
| * next paragraph) shall be included in all copies or substantial portions |
| * of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
| * DEALINGS IN THE SOFTWARE. |
| * |
| */ |
| |
| #include "pan_tiling.h" |
| #include <stdbool.h> |
| #include "util/macros.h" |
| |
| /* This file implements software encode/decode of the tiling format used for |
| * textures and framebuffers primarily on Utgard GPUs. Names for this format |
| * include "Utgard-style tiling", "(Mali) swizzled textures", and |
| * "U-interleaved" (the former two names being used in the community |
| * Lima/Panfrost drivers; the latter name used internally at Arm). |
 * Conceptually, like any tiling scheme, the pixel reordering exploits 2D
 * spatial locality, improving cache behaviour in both the horizontal and
 * vertical directions.
| * |
| * This format is tiled: first, the image dimensions must be aligned to 16 |
| * pixels in each axis. Once aligned, the image is divided into 16x16 tiles. |
| * This size harmonizes with other properties of the GPU; on Midgard, |
| * framebuffer tiles are logically 16x16 (this is the tile size used in |
| * Transaction Elimination and the minimum tile size used in Hierarchical |
 * Tiling). Likewise, for a standard 4 bytes-per-pixel format (like
| * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line |
| * size. |
| * |
| * Within each 16x16 block, the bits are reordered according to this pattern: |
| * |
 * | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
| * |
 * Basically, the X and Y bits are interleaved, with each X bit XORed against
 * the Y bit alongside it.
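 *
 * As a worked example of the pattern (numbers only, nothing new): within a
 * tile, the pixel at x = 3 (0b0011), y = 5 (0b0101) gets the index
 *
 *   | 0 | 0^0 | 1 | 1^0 | 0 | 0^1 | 1 | 1^1 | = 0b00110110 = 54
 *
 * so for a 32-bit format it lives at byte offset 54 * 4 = 216 within its
 * tile.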
| * |
 * This pattern is cheap to encode and decode in both hardware and software.
 * In hardware, the address lines are simply rerouted to reorder the bits and
 * some XOR gates are thrown in. Software has to be a bit more clever.
| * |
| * In software, the trick is to divide the pattern into two lines: |
| * |
| * | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 | |
| * ^ | 0 | x3 | 0 | x2 | 0 | x1 | 0 | x0 | |
| * |
 * That is, duplicate the bits of Y and space out the bits of X. The top line
 * is a function only of Y, so it can be calculated once per row and stored
 * in a register. The bottom line is simply X with its bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting and ANDing
 * with the mask pattern (abusing carry bits), as sketched below.
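 *
 * As a sketch of the carry trick (illustrative only; the code below uses
 * the space_4 LUT instead), the spaced-out X for successive pixels can be
 * generated incrementally:
 *
 *    uint32_t spaced_x = 0;
 *    for (unsigned x = 0; x < 16; ++x) {
 *       use(spaced_x);                        // hypothetical consumer
 *       spaced_x = (spaced_x - 0x55) & 0x55;  // 0x55 = 0b01010101
 *    }
 *
 * Subtracting the mask from a value whose set bits all lie within the mask
 * borrows across the zero gaps, which amounts to an increment in the spaced
 * domain.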
| * |
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression (AFBC),
 * which is significantly more efficient than Utgard-style tiling and
 * preferred for both textures and framebuffers where possible. For cases
 * AFBC cannot handle, such as sRGB textures and framebuffers, this tiling
 * scheme is used instead, at a performance penalty.
| */ |
| |
/* Given the lower 4 bits of the Y coordinate, we would like to duplicate
 * every bit. So instead of 0b1010, we would like 0b11001100. The idea is
 * that the bits in the pure-Y positions get a copy of Y, and the bits in
 * the XOR positions *also* get a copy of Y (to be XORed against X). */
| |
| const uint32_t bit_duplication[16] = { |
| 0b00000000, |
| 0b00000011, |
| 0b00001100, |
| 0b00001111, |
| 0b00110000, |
| 0b00110011, |
| 0b00111100, |
| 0b00111111, |
| 0b11000000, |
| 0b11000011, |
| 0b11001100, |
| 0b11001111, |
| 0b11110000, |
| 0b11110011, |
| 0b11111100, |
| 0b11111111, |
| }; |
| |
/* Space out the bits of a 4-bit nibble: bit n of the input moves to bit 2n */
| |
| const unsigned space_4[16] = { |
| 0b0000000, |
| 0b0000001, |
| 0b0000100, |
| 0b0000101, |
| 0b0010000, |
| 0b0010001, |
| 0b0010100, |
| 0b0010101, |
| 0b1000000, |
| 0b1000001, |
| 0b1000100, |
| 0b1000101, |
| 0b1010000, |
| 0b1010001, |
| 0b1010100, |
| 0b1010101 |
| }; |
| |
| /* The scheme uses 16x16 tiles */ |
| |
| #define TILE_WIDTH 16 |
| #define TILE_HEIGHT 16 |
| #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT) |
| |
/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * only needs to support copies and sizeof, so emulating it with a packed
 * structure works well enough, but if there's a native 128-bit type we may
 * as well prefer that. */
| |
| #ifdef __SIZEOF_INT128__ |
| typedef __uint128_t pan_uint128_t; |
| #else |
| typedef struct { |
| uint64_t lo; |
| uint64_t hi; |
| } __attribute__((packed)) pan_uint128_t; |
| #endif |
| |
| typedef struct { |
| uint16_t lo; |
| uint8_t hi; |
| } __attribute__((packed)) pan_uint24_t; |
| |
/* Optimized routine to tile an aligned ((w & 0xF) == 0) texture. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that sx is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then
 * multiplying by the bytes per tile.
| * |
 * We iterate across the pixels we're trying to store in source order. For
 * each row in the destination image, we figure out which row of 16x16
 * blocks we're in by clearing the lower 4 bits of y (block_y).
| * |
 * dest then precomputes the location of the top-left corner of the block
 * the row starts in. In pixel coordinates (where the origin is the
 * top-left), the leftmost tile in this row has its top-left corner at
 * x = 0, y = block_y. While pixels are reordered within a block, the blocks
 * themselves are stored linearly, so multiplying block_y by the byte stride
 * of a destination row gives the byte offset of that corner.
| * |
 * The source, on the other hand, is linear, so we compute the locations of
 * the start and end of the row in the source with simple linear addressing.
| * |
| * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0 |
| * y0] value. Since this is constant across a row, we look it up per-row and |
| * store in expanded_y. |
| * |
 * Finally, we iterate each row in source order. The outer loop walks the
 * row one 16-pixel-wide tile at a time; within each tile, the inner loop
 * visits the 16 pixels (this should be unrolled), calculating the index
 * within the tile and copying.
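 *
 * As a concrete example (numbers only): for a 32-bit format (shift = 2),
 * the pixel at tile-local coordinates (i = 3, y & 0xF = 5) lands at byte
 * offset
 *
 *    (bit_duplication[5] ^ space_4[3]) << 2
 *  = (0b00110011 ^ 0b00000101) << 2
 *  = 0b00110110 << 2 = 54 * 4 = 216
 *
 * within its tile, matching the interleaving pattern described at the top
 * of this file.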
| */ |
| |
| #define TILED_ACCESS_TYPE(pixel_t, shift) \ |
| static ALWAYS_INLINE void \ |
| panfrost_access_tiled_image_##pixel_t \ |
| (void *dst, void *src, \ |
| uint16_t sx, uint16_t sy, \ |
| uint16_t w, uint16_t h, \ |
| uint32_t dst_stride, \ |
| uint32_t src_stride, \ |
| bool is_store) \ |
| { \ |
| uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \ |
| for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ |
| uint16_t block_y = y & ~0x0f; \ |
| uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \ |
| pixel_t *source = src + (src_y * src_stride); \ |
| pixel_t *source_end = source + w; \ |
| unsigned expanded_y = bit_duplication[y & 0xF] << shift; \ |
| for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \ |
| for (uint8_t i = 0; i < 16; ++i) { \ |
| unsigned index = expanded_y ^ (space_4[i] << shift); \ |
| if (is_store) \ |
| *((pixel_t *) (dest + index)) = *(source++); \ |
| else \ |
| *(source++) = *((pixel_t *) (dest + index)); \ |
| } \ |
| } \ |
| } \ |
| } \ |
| |
| TILED_ACCESS_TYPE(uint8_t, 0); |
| TILED_ACCESS_TYPE(uint16_t, 1); |
| TILED_ACCESS_TYPE(uint32_t, 2); |
| TILED_ACCESS_TYPE(uint64_t, 3); |
| TILED_ACCESS_TYPE(pan_uint128_t, 4); |
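
/* Slow, generic path: one pixel at a time, for regions that are not
 * 16-pixel-aligned and for formats the fast path cannot handle. tile_shift
 * is 4 for 16x16 pixel tiles; block-compressed formats are addressed in
 * units of blocks, so their tiles are 4x4 blocks and tile_shift is 2. */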
| |
| #define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \ |
| const unsigned mask = (1 << tile_shift) - 1; \ |
| for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ |
| unsigned block_y = y & ~mask; \ |
| unsigned block_start_s = block_y * dst_stride; \ |
| unsigned source_start = src_y * src_stride; \ |
| unsigned expanded_y = bit_duplication[y & mask]; \ |
| \ |
| for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \ |
| unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \ |
| unsigned index = expanded_y ^ space_4[x & mask]; \ |
| uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \ |
| uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \ |
| \ |
| pixel_t *outp = (pixel_t *) (is_store ? dest : source); \ |
| pixel_t *inp = (pixel_t *) (is_store ? source : dest); \ |
| *outp = *inp; \ |
| } \ |
| } \ |
| } |
| |
| #define TILED_UNALIGNED_TYPES(store, shift) { \ |
| if (bpp == 8) \ |
| TILED_UNALIGNED_TYPE(uint8_t, store, shift) \ |
| else if (bpp == 16) \ |
| TILED_UNALIGNED_TYPE(uint16_t, store, shift) \ |
| else if (bpp == 24) \ |
| TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \ |
| else if (bpp == 32) \ |
| TILED_UNALIGNED_TYPE(uint32_t, store, shift) \ |
| else if (bpp == 64) \ |
| TILED_UNALIGNED_TYPE(uint64_t, store, shift) \ |
| else if (bpp == 128) \ |
| TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \ |
| } |
| |
| static void |
| panfrost_access_tiled_image_generic(void *dst, void *src, |
| unsigned sx, unsigned sy, |
| unsigned w, unsigned h, |
| uint32_t dst_stride, |
| uint32_t src_stride, |
| const struct util_format_description *desc, |
| bool _is_store) |
| { |
| unsigned bpp = desc->block.bits; |
| |
| if (desc->block.width > 1) { |
| w = DIV_ROUND_UP(w, desc->block.width); |
| h = DIV_ROUND_UP(h, desc->block.height); |
| |
| if (_is_store) |
| TILED_UNALIGNED_TYPES(true, 2) |
| else |
| TILED_UNALIGNED_TYPES(false, 2) |
| } else { |
| if (_is_store) |
| TILED_UNALIGNED_TYPES(true, 4) |
| else |
| TILED_UNALIGNED_TYPES(false, 4) |
| } |
| } |
| |
| #define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8))) |
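
/* Access (load or store) a tiled image by splitting the requested rectangle
 * into up to five regions: unaligned top and bottom strips, unaligned left
 * and right strips (all handled by the generic path), and a fully
 * tile-aligned interior, which takes the optimized path. */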
| |
| static ALWAYS_INLINE void |
| panfrost_access_tiled_image(void *dst, void *src, |
| unsigned x, unsigned y, |
| unsigned w, unsigned h, |
| uint32_t dst_stride, |
| uint32_t src_stride, |
| enum pipe_format format, |
| bool is_store) |
| { |
| const struct util_format_description *desc = util_format_description(format); |
| |
| if (desc->block.width > 1 || desc->block.bits == 24) { |
| panfrost_access_tiled_image_generic(dst, (void *) src, |
| x, y, w, h, |
| dst_stride, src_stride, desc, is_store); |
| |
| return; |
| } |
| |
| unsigned bpp = desc->block.bits; |
| unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH; |
| unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT; |
| unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH; |
| unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT; |
| |
| /* First, tile the top portion */ |
| |
| unsigned orig_x = x, orig_y = y; |
| |
| if (first_full_tile_y != y) { |
| unsigned dist = MIN2(first_full_tile_y - y, h); |
| |
| panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), |
| x, y, w, dist, |
| dst_stride, src_stride, desc, is_store); |
| |
| if (dist == h) |
| return; |
| |
| y += dist; |
| h -= dist; |
| } |
| |
| /* Next, the bottom portion */ |
| if (last_full_tile_y != (y + h)) { |
| unsigned dist = (y + h) - last_full_tile_y; |
| |
| panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y), |
| x, last_full_tile_y, w, dist, |
| dst_stride, src_stride, desc, is_store); |
| |
| h -= dist; |
| } |
| |
| /* The left portion */ |
| if (first_full_tile_x != x) { |
| unsigned dist = MIN2(first_full_tile_x - x, w); |
| |
| panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), |
| x, y, dist, h, |
| dst_stride, src_stride, desc, is_store); |
| |
| if (dist == w) |
| return; |
| |
| x += dist; |
| w -= dist; |
| } |
| |
| /* Finally, the right portion */ |
| if (last_full_tile_x != (x + w)) { |
| unsigned dist = (x + w) - last_full_tile_x; |
| |
| panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y), |
| last_full_tile_x, y, dist, h, |
| dst_stride, src_stride, desc, is_store); |
| |
| w -= dist; |
| } |
| |
| if (bpp == 8) |
| panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); |
| else if (bpp == 16) |
| panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); |
| else if (bpp == 32) |
| panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); |
| else if (bpp == 64) |
| panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); |
| else if (bpp == 128) |
| panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); |
| } |
| |
| void |
| panfrost_store_tiled_image(void *dst, const void *src, |
| unsigned x, unsigned y, |
| unsigned w, unsigned h, |
| uint32_t dst_stride, |
| uint32_t src_stride, |
| enum pipe_format format) |
| { |
| panfrost_access_tiled_image(dst, (void *) src, |
| x, y, w, h, |
| dst_stride, src_stride, format, true); |
| } |
| |
| void |
| panfrost_load_tiled_image(void *dst, const void *src, |
| unsigned x, unsigned y, |
| unsigned w, unsigned h, |
| uint32_t dst_stride, |
| uint32_t src_stride, |
| enum pipe_format format) |
| { |
| panfrost_access_tiled_image((void *) src, dst, |
| x, y, w, h, |
| src_stride, dst_stride, format, false); |
| } |