/*
 * Copyright (C) 2012 Red Hat
 * based in parts on udlfb.c:
 * Copyright (C) 2009 Roberto De Ioris <roberto@unbit.it>
 * Copyright (C) 2009 Jaya Kumar <jayakumar.lkml@gmail.com>
 * Copyright (C) 2009 Bernie Thompson <bernie@plugable.com>
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License v2. See the file COPYING in the main directory of this archive for
 * more details.
 */
| 12 | |
| 13 | #include <linux/module.h> |
| 14 | #include <linux/slab.h> |
| 15 | #include <linux/fb.h> |
| 16 | #include <linux/prefetch.h> |
| 17 | |
David Howells | 760285e | 2012-10-02 18:01:07 +0100 | [diff] [blame] | 18 | #include <drm/drmP.h> |
Dave Airlie | 5320918 | 2010-12-15 07:14:24 +1000 | [diff] [blame] | 19 | #include "udl_drv.h" |
| 20 | |
/*
 * Largest value that fits in a one-byte pixel count.  The encoder caps a
 * single command at MAX_CMD_PIXELS + 1 pixels.
 */
#define MAX_CMD_PIXELS		255

/*
 * "rlx" commands (alternating raw and run-length spans).  The header is the
 * 5 opcode/address bytes plus the command pixel count byte and the first
 * raw span count byte.  MIN_RLX_CMD_BYTES is the smallest amount of free
 * buffer space for which the encoder will start another command.
 */
#define RLX_HEADER_BYTES	7
#define MIN_RLX_PIX_BYTES	4
#define MIN_RLX_CMD_BYTES	(RLX_HEADER_BYTES + MIN_RLX_PIX_BYTES)

/* Run-length-only commands; not referenced in the visible part of this file. */
#define RLE_HEADER_BYTES	6
#define MIN_RLE_PIX_BYTES	3
#define MIN_RLE_CMD_BYTES	(RLE_HEADER_BYTES + MIN_RLE_PIX_BYTES)

/* Raw-only commands; not referenced in the visible part of this file. */
#define RAW_HEADER_BYTES	6
#define MIN_RAW_PIX_BYTES	2
#define MIN_RAW_CMD_BYTES	(RAW_HEADER_BYTES + MIN_RAW_PIX_BYTES)
| 34 | |
/*
 * Trim identical data from the front and back of a line.
 *
 * Compares the back (shadow) line @bback with the front line *@bfront one
 * unsigned long at a time.  On return *@bfront points at the first word that
 * differs and *@width_bytes holds the byte length of the span running from
 * that word up to and including the last word that differs.  If the two
 * lines are identical, *@bfront points one word past the end of the line and
 * *@width_bytes is 0.
 *
 * Returns the number of identical bytes that were trimmed (front + back).
 *
 * Assumes CPU natural alignment (unsigned long) for both buffer pointers and
 * for the width; any bytes beyond a whole number of words are not examined.
 *
 * Currently compiled out.
 */
#if 0
static int udl_trim_hline(const u8 *bback, const u8 **bfront, int *width_bytes)
{
	const unsigned long *back = (const unsigned long *) bback;
	const unsigned long *front = (const unsigned long *) *bfront;
	const int width = *width_bytes / sizeof(unsigned long);
	int start = width;
	int end = width;
	int identical;
	int j, k;

	prefetch((void *) front);
	prefetch((void *) back);

	/* First differing word, scanning forward. */
	for (j = 0; j < width; j++) {
		if (back[j] != front[j]) {
			start = j;
			break;
		}
	}

	if (start < width) {
		/*
		 * At least word 'start' differs.  Begin with a one-word span
		 * so that, if nothing behind it differs, the identical tail
		 * is still trimmed, then widen it to the last differing word.
		 */
		end = start + 1;
		for (k = width - 1; k > start; k--) {
			if (back[k] != front[k]) {
				end = k + 1;
				break;
			}
		}
	}

	identical = start + (width - end);
	*bfront = (const u8 *) &front[start];
	*width_bytes = (end - start) * sizeof(unsigned long);

	return identical * sizeof(unsigned long);
}
#endif
| 77 | |
| 78 | static inline u16 pixel32_to_be16p(const uint8_t *pixel) |
| 79 | { |
| 80 | uint32_t pix = *(uint32_t *)pixel; |
| 81 | u16 retval; |
| 82 | |
| 83 | retval = (((pix >> 3) & 0x001f) | |
| 84 | ((pix >> 5) & 0x07e0) | |
| 85 | ((pix >> 8) & 0xf800)); |
| 86 | return retval; |
| 87 | } |
| 88 | |
| 89 | /* |
| 90 | * Render a command stream for an encoded horizontal line segment of pixels. |
| 91 | * |
| 92 | * A command buffer holds several commands. |
| 93 | * It always begins with a fresh command header |
| 94 | * (the protocol doesn't require this, but we enforce it to allow |
| 95 | * multiple buffers to be potentially encoded and sent in parallel). |
| 96 | * A single command encodes one contiguous horizontal line of pixels |
| 97 | * |
| 98 | * The function relies on the client to do all allocation, so that |
| 99 | * rendering can be done directly to output buffers (e.g. USB URBs). |
| 100 | * The function fills the supplied command buffer, providing information |
| 101 | * on where it left off, so the client may call in again with additional |
| 102 | * buffers if the line will take several buffers to complete. |
| 103 | * |
| 104 | * A single command can transmit a maximum of 256 pixels, |
| 105 | * regardless of the compression ratio (protocol design limit). |
| 106 | * To the hardware, 0 for a size byte means 256 |
| 107 | * |
| 108 | * Rather than 256 pixel commands which are either rl or raw encoded, |
| 109 | * the rlx command simply assumes alternating raw and rl spans within one cmd. |
| 110 | * This has a slightly larger header overhead, but produces more even results. |
| 111 | * It also processes all data (read and write) in a single pass. |
| 112 | * Performance benchmarks of common cases show it having just slightly better |
| 113 | * compression than 256 pixel raw or rle commands, with similar CPU consumpion. |
| 114 | * But for very rl friendly data, will compress not quite as well. |
| 115 | */ |
| 116 | static void udl_compress_hline16( |
| 117 | const u8 **pixel_start_ptr, |
| 118 | const u8 *const pixel_end, |
| 119 | uint32_t *device_address_ptr, |
| 120 | uint8_t **command_buffer_ptr, |
| 121 | const uint8_t *const cmd_buffer_end, int bpp) |
| 122 | { |
| 123 | const u8 *pixel = *pixel_start_ptr; |
| 124 | uint32_t dev_addr = *device_address_ptr; |
| 125 | uint8_t *cmd = *command_buffer_ptr; |
| 126 | |
| 127 | while ((pixel_end > pixel) && |
| 128 | (cmd_buffer_end - MIN_RLX_CMD_BYTES > cmd)) { |
Sachin Kamat | 74401b1 | 2012-09-22 06:22:17 +0000 | [diff] [blame] | 129 | uint8_t *raw_pixels_count_byte = NULL; |
| 130 | uint8_t *cmd_pixels_count_byte = NULL; |
| 131 | const u8 *raw_pixel_start = NULL; |
| 132 | const u8 *cmd_pixel_start, *cmd_pixel_end = NULL; |
Dave Airlie | 5320918 | 2010-12-15 07:14:24 +1000 | [diff] [blame] | 133 | |
| 134 | prefetchw((void *) cmd); /* pull in one cache line at least */ |
| 135 | |
| 136 | *cmd++ = 0xaf; |
| 137 | *cmd++ = 0x6b; |
| 138 | *cmd++ = (uint8_t) ((dev_addr >> 16) & 0xFF); |
| 139 | *cmd++ = (uint8_t) ((dev_addr >> 8) & 0xFF); |
| 140 | *cmd++ = (uint8_t) ((dev_addr) & 0xFF); |
| 141 | |
| 142 | cmd_pixels_count_byte = cmd++; /* we'll know this later */ |
| 143 | cmd_pixel_start = pixel; |
| 144 | |
| 145 | raw_pixels_count_byte = cmd++; /* we'll know this later */ |
| 146 | raw_pixel_start = pixel; |
| 147 | |
| 148 | cmd_pixel_end = pixel + (min(MAX_CMD_PIXELS + 1, |
| 149 | min((int)(pixel_end - pixel) / bpp, |
| 150 | (int)(cmd_buffer_end - cmd) / 2))) * bpp; |
| 151 | |
| 152 | prefetch_range((void *) pixel, (cmd_pixel_end - pixel) * bpp); |
| 153 | |
| 154 | while (pixel < cmd_pixel_end) { |
| 155 | const u8 * const repeating_pixel = pixel; |
| 156 | |
| 157 | if (bpp == 2) |
| 158 | *(uint16_t *)cmd = cpu_to_be16p((uint16_t *)pixel); |
| 159 | else if (bpp == 4) |
| 160 | *(uint16_t *)cmd = cpu_to_be16(pixel32_to_be16p(pixel)); |
| 161 | |
| 162 | cmd += 2; |
| 163 | pixel += bpp; |
| 164 | |
| 165 | if (unlikely((pixel < cmd_pixel_end) && |
| 166 | (!memcmp(pixel, repeating_pixel, bpp)))) { |
| 167 | /* go back and fill in raw pixel count */ |
| 168 | *raw_pixels_count_byte = (((repeating_pixel - |
| 169 | raw_pixel_start) / bpp) + 1) & 0xFF; |
| 170 | |
| 171 | while ((pixel < cmd_pixel_end) |
| 172 | && (!memcmp(pixel, repeating_pixel, bpp))) { |
| 173 | pixel += bpp; |
| 174 | } |
| 175 | |
| 176 | /* immediately after raw data is repeat byte */ |
| 177 | *cmd++ = (((pixel - repeating_pixel) / bpp) - 1) & 0xFF; |
| 178 | |
| 179 | /* Then start another raw pixel span */ |
| 180 | raw_pixel_start = pixel; |
| 181 | raw_pixels_count_byte = cmd++; |
| 182 | } |
| 183 | } |
| 184 | |
| 185 | if (pixel > raw_pixel_start) { |
| 186 | /* finalize last RAW span */ |
| 187 | *raw_pixels_count_byte = ((pixel-raw_pixel_start) / bpp) & 0xFF; |
| 188 | } |
| 189 | |
| 190 | *cmd_pixels_count_byte = ((pixel - cmd_pixel_start) / bpp) & 0xFF; |
| 191 | dev_addr += ((pixel - cmd_pixel_start) / bpp) * 2; |
| 192 | } |
| 193 | |
| 194 | if (cmd_buffer_end <= MIN_RLX_CMD_BYTES + cmd) { |
| 195 | /* Fill leftover bytes with no-ops */ |
| 196 | if (cmd_buffer_end > cmd) |
| 197 | memset(cmd, 0xAF, cmd_buffer_end - cmd); |
| 198 | cmd = (uint8_t *) cmd_buffer_end; |
| 199 | } |
| 200 | |
| 201 | *command_buffer_ptr = cmd; |
| 202 | *pixel_start_ptr = pixel; |
| 203 | *device_address_ptr = dev_addr; |
| 204 | |
| 205 | return; |
| 206 | } |
| 207 | |
| 208 | /* |
| 209 | * There are 3 copies of every pixel: The front buffer that the fbdev |
| 210 | * client renders to, the actual framebuffer across the USB bus in hardware |
| 211 | * (that we can only write to, slowly, and can never read), and (optionally) |
| 212 | * our shadow copy that tracks what's been sent to that hardware buffer. |
| 213 | */ |
| 214 | int udl_render_hline(struct drm_device *dev, int bpp, struct urb **urb_ptr, |
| 215 | const char *front, char **urb_buf_ptr, |
| 216 | u32 byte_offset, u32 byte_width, |
| 217 | int *ident_ptr, int *sent_ptr) |
| 218 | { |
| 219 | const u8 *line_start, *line_end, *next_pixel; |
| 220 | u32 base16 = 0 + (byte_offset / bpp) * 2; |
| 221 | struct urb *urb = *urb_ptr; |
| 222 | u8 *cmd = *urb_buf_ptr; |
| 223 | u8 *cmd_end = (u8 *) urb->transfer_buffer + urb->transfer_buffer_length; |
| 224 | |
| 225 | line_start = (u8 *) (front + byte_offset); |
| 226 | next_pixel = line_start; |
| 227 | line_end = next_pixel + byte_width; |
| 228 | |
| 229 | while (next_pixel < line_end) { |
| 230 | |
| 231 | udl_compress_hline16(&next_pixel, |
| 232 | line_end, &base16, |
| 233 | (u8 **) &cmd, (u8 *) cmd_end, bpp); |
| 234 | |
| 235 | if (cmd >= cmd_end) { |
| 236 | int len = cmd - (u8 *) urb->transfer_buffer; |
| 237 | if (udl_submit_urb(dev, urb, len)) |
| 238 | return 1; /* lost pixels is set */ |
| 239 | *sent_ptr += len; |
| 240 | urb = udl_get_urb(dev); |
| 241 | if (!urb) |
| 242 | return 1; /* lost_pixels is set */ |
| 243 | *urb_ptr = urb; |
| 244 | cmd = urb->transfer_buffer; |
| 245 | cmd_end = &cmd[urb->transfer_buffer_length]; |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | *urb_buf_ptr = cmd; |
| 250 | |
| 251 | return 0; |
| 252 | } |
| 253 | |