Michael Schmitz | a100501 | 2007-05-01 22:32:39 +0200 | [diff] [blame] | 1 | #ifndef _VIDEO_ATAFB_UTILS_H |
| 2 | #define _VIDEO_ATAFB_UTILS_H |
| 3 | |
| 4 | /* ================================================================= */ |
| 5 | /* Utility Assembler Functions */ |
| 6 | /* ================================================================= */ |
| 7 | |
| 8 | /* ====================================================================== */ |
| 9 | |
| 10 | /* Those of a delicate disposition might like to skip the next couple of |
| 11 | * pages. |
| 12 | * |
| 13 | * These functions are drop in replacements for memmove and |
| 14 | * memset(_, 0, _). However their five instances add at least a kilobyte |
| 15 | * to the object file. You have been warned. |
| 16 | * |
| 17 | * Not a great fan of assembler for the sake of it, but I think |
| 18 | * that these routines are at least 10 times faster than their C |
| 19 | * equivalents for large blits, and that's important to the lowest level of |
| 20 | * a graphics driver. Question is whether some scheme with the blitter |
| 21 | * would be faster. I suspect not for simple text system - not much |
| 22 | * asynchrony. |
| 23 | * |
| 24 | * Code is very simple, just gruesome expansion. Basic strategy is to |
| 25 | * increase data moved/cleared at each step to 16 bytes to reduce |
| 26 | * instruction per data move overhead. movem might be faster still |
| 27 | * For more than 15 bytes, we try to align the write direction on a |
| 28 | * longword boundary to get maximum speed. This is even more gruesome. |
| 29 | * Unaligned read/write used requires 68020+ - think this is a problem? |
| 30 | * |
| 31 | * Sorry! |
| 32 | */ |
| 33 | |
| 34 | |
| 35 | /* ++roman: I've optimized Robert's original versions in some minor |
| 36 | * aspects, e.g. moveq instead of movel, let gcc choose the registers, |
| 37 | * use movem in some places... |
| 38 | * For other modes than 1 plane, lots of more such assembler functions |
| 39 | * were needed (e.g. the ones using movep or expanding color values). |
| 40 | */ |
| 41 | |
| 42 | /* ++andreas: more optimizations: |
| 43 | subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc |
| 44 | addal is faster than addaw |
| 45 | movep is rather expensive compared to ordinary move's |
| 46 | some functions rewritten in C for clarity, no speed loss */ |
| 47 | |
| 48 | static inline void *fb_memclear_small(void *s, size_t count) |
| 49 | { |
| 50 | if (!count) |
| 51 | return 0; |
| 52 | |
| 53 | asm volatile ("\n" |
| 54 | " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n" |
| 55 | "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n" |
| 56 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n" |
| 57 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n" |
| 58 | "1:" |
| 59 | : "=a" (s), "=d" (count) |
| 60 | : "d" (0), "0" ((char *)s + count), "1" (count)); |
| 61 | asm volatile ("\n" |
| 62 | " subq.l #1,%1\n" |
| 63 | " jcs 3f\n" |
| 64 | " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n" |
| 65 | "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n" |
| 66 | " dbra %1,2b\n" |
| 67 | "3:" |
| 68 | : "=a" (s), "=d" (count) |
| 69 | : "d" (0), "0" (s), "1" (count) |
| 70 | : "d4", "d5", "d6" |
| 71 | ); |
| 72 | |
| 73 | return 0; |
| 74 | } |
| 75 | |
| 76 | |
| 77 | static inline void *fb_memclear(void *s, size_t count) |
| 78 | { |
| 79 | if (!count) |
| 80 | return 0; |
| 81 | |
| 82 | if (count < 16) { |
| 83 | asm volatile ("\n" |
| 84 | " lsr.l #1,%1 ; jcc 1f ; clr.b (%0)+\n" |
| 85 | "1: lsr.l #1,%1 ; jcc 1f ; clr.w (%0)+\n" |
| 86 | "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+\n" |
| 87 | "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+ ; clr.l (%0)+\n" |
| 88 | "1:" |
| 89 | : "=a" (s), "=d" (count) |
| 90 | : "0" (s), "1" (count)); |
| 91 | } else { |
| 92 | long tmp; |
| 93 | asm volatile ("\n" |
| 94 | " move.l %1,%2\n" |
| 95 | " lsr.l #1,%2 ; jcc 1f ; clr.b (%0)+ ; subq.w #1,%1\n" |
| 96 | " lsr.l #1,%2 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ |
| 97 | " clr.w (%0)+ ; subq.w #2,%1 ; jra 2f\n" |
| 98 | "1: lsr.l #1,%2 ; jcc 2f\n" |
| 99 | " clr.w (%0)+ ; subq.w #2,%1\n" |
| 100 | "2: move.w %1,%2; lsr.l #2,%1 ; jeq 6f\n" |
| 101 | " lsr.l #1,%1 ; jcc 3f ; clr.l (%0)+\n" |
| 102 | "3: lsr.l #1,%1 ; jcc 4f ; clr.l (%0)+ ; clr.l (%0)+\n" |
| 103 | "4: subq.l #1,%1 ; jcs 6f\n" |
| 104 | "5: clr.l (%0)+; clr.l (%0)+ ; clr.l (%0)+ ; clr.l (%0)+\n" |
| 105 | " dbra %1,5b ; clr.w %1; subq.l #1,%1; jcc 5b\n" |
| 106 | "6: move.w %2,%1; btst #1,%1 ; jeq 7f ; clr.w (%0)+\n" |
| 107 | "7: btst #0,%1 ; jeq 8f ; clr.b (%0)+\n" |
| 108 | "8:" |
| 109 | : "=a" (s), "=d" (count), "=d" (tmp) |
| 110 | : "0" (s), "1" (count)); |
| 111 | } |
| 112 | |
| 113 | return 0; |
| 114 | } |
| 115 | |
| 116 | |
| 117 | static inline void *fb_memset255(void *s, size_t count) |
| 118 | { |
| 119 | if (!count) |
| 120 | return 0; |
| 121 | |
| 122 | asm volatile ("\n" |
| 123 | " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n" |
| 124 | "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n" |
| 125 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n" |
| 126 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n" |
| 127 | "1:" |
| 128 | : "=a" (s), "=d" (count) |
| 129 | : "d" (-1), "0" ((char *)s+count), "1" (count)); |
| 130 | asm volatile ("\n" |
| 131 | " subq.l #1,%1 ; jcs 3f\n" |
| 132 | " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n" |
| 133 | "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n" |
| 134 | " dbra %1,2b\n" |
| 135 | "3:" |
| 136 | : "=a" (s), "=d" (count) |
| 137 | : "d" (-1), "0" (s), "1" (count) |
| 138 | : "d4", "d5", "d6"); |
| 139 | |
| 140 | return 0; |
| 141 | } |
| 142 | |
| 143 | |
| 144 | static inline void *fb_memmove(void *d, const void *s, size_t count) |
| 145 | { |
| 146 | if (d < s) { |
| 147 | if (count < 16) { |
| 148 | asm volatile ("\n" |
| 149 | " lsr.l #1,%2 ; jcc 1f ; move.b (%1)+,(%0)+\n" |
| 150 | "1: lsr.l #1,%2 ; jcc 1f ; move.w (%1)+,(%0)+\n" |
| 151 | "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+\n" |
| 152 | "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n" |
| 153 | "1:" |
| 154 | : "=a" (d), "=a" (s), "=d" (count) |
| 155 | : "0" (d), "1" (s), "2" (count)); |
| 156 | } else { |
| 157 | long tmp; |
| 158 | asm volatile ("\n" |
| 159 | " move.l %0,%3\n" |
| 160 | " lsr.l #1,%3 ; jcc 1f ; move.b (%1)+,(%0)+ ; subqw #1,%2\n" |
| 161 | " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ |
| 162 | " move.w (%1)+,(%0)+ ; subqw #2,%2 ; jra 2f\n" |
| 163 | "1: lsr.l #1,%3 ; jcc 2f\n" |
| 164 | " move.w (%1)+,(%0)+ ; subqw #2,%2\n" |
| 165 | "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n" |
| 166 | " lsr.l #1,%2 ; jcc 3f ; move.l (%1)+,(%0)+\n" |
| 167 | "3: lsr.l #1,%2 ; jcc 4f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n" |
| 168 | "4: subq.l #1,%2 ; jcs 6f\n" |
| 169 | "5: move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n" |
| 170 | " move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n" |
| 171 | " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n" |
| 172 | "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w (%1)+,(%0)+\n" |
| 173 | "7: btst #0,%2 ; jeq 8f ; move.b (%1)+,(%0)+\n" |
| 174 | "8:" |
| 175 | : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp) |
| 176 | : "0" (d), "1" (s), "2" (count)); |
| 177 | } |
| 178 | } else { |
| 179 | if (count < 16) { |
| 180 | asm volatile ("\n" |
| 181 | " lsr.l #1,%2 ; jcc 1f ; move.b -(%1),-(%0)\n" |
| 182 | "1: lsr.l #1,%2 ; jcc 1f ; move.w -(%1),-(%0)\n" |
| 183 | "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0)\n" |
| 184 | "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n" |
| 185 | "1:" |
| 186 | : "=a" (d), "=a" (s), "=d" (count) |
| 187 | : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)); |
| 188 | } else { |
| 189 | long tmp; |
| 190 | |
| 191 | asm volatile ("\n" |
| 192 | " move.l %0,%3\n" |
| 193 | " lsr.l #1,%3 ; jcc 1f ; move.b -(%1),-(%0) ; subqw #1,%2\n" |
| 194 | " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ |
| 195 | " move.w -(%1),-(%0) ; subqw #2,%2 ; jra 2f\n" |
| 196 | "1: lsr.l #1,%3 ; jcc 2f\n" |
| 197 | " move.w -(%1),-(%0) ; subqw #2,%2\n" |
| 198 | "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n" |
| 199 | " lsr.l #1,%2 ; jcc 3f ; move.l -(%1),-(%0)\n" |
| 200 | "3: lsr.l #1,%2 ; jcc 4f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n" |
| 201 | "4: subq.l #1,%2 ; jcs 6f\n" |
| 202 | "5: move.l -(%1),-(%0); move.l -(%1),-(%0)\n" |
| 203 | " move.l -(%1),-(%0); move.l -(%1),-(%0)\n" |
| 204 | " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n" |
| 205 | "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w -(%1),-(%0)\n" |
| 206 | "7: btst #0,%2 ; jeq 8f ; move.b -(%1),-(%0)\n" |
| 207 | "8:" |
| 208 | : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp) |
| 209 | : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)); |
| 210 | } |
| 211 | } |
| 212 | |
| 213 | return 0; |
| 214 | } |
| 215 | |
| 216 | |
| 217 | /* ++andreas: Simple and fast version of memmove, assumes size is |
| 218 | divisible by 16, suitable for moving the whole screen bitplane */ |
| 219 | static inline void fast_memmove(char *dst, const char *src, size_t size) |
| 220 | { |
| 221 | if (!size) |
| 222 | return; |
| 223 | if (dst < src) |
| 224 | asm volatile ("\n" |
| 225 | "1: movem.l (%0)+,%%d0/%%d1/%%a0/%%a1\n" |
| 226 | " movem.l %%d0/%%d1/%%a0/%%a1,%1@\n" |
| 227 | " addq.l #8,%1; addq.l #8,%1\n" |
| 228 | " dbra %2,1b\n" |
| 229 | " clr.w %2; subq.l #1,%2\n" |
| 230 | " jcc 1b" |
| 231 | : "=a" (src), "=a" (dst), "=d" (size) |
| 232 | : "0" (src), "1" (dst), "2" (size / 16 - 1) |
| 233 | : "d0", "d1", "a0", "a1", "memory"); |
| 234 | else |
| 235 | asm volatile ("\n" |
| 236 | "1: subq.l #8,%0; subq.l #8,%0\n" |
| 237 | " movem.l %0@,%%d0/%%d1/%%a0/%%a1\n" |
| 238 | " movem.l %%d0/%%d1/%%a0/%%a1,-(%1)\n" |
| 239 | " dbra %2,1b\n" |
| 240 | " clr.w %2; subq.l #1,%2\n" |
| 241 | " jcc 1b" |
| 242 | : "=a" (src), "=a" (dst), "=d" (size) |
| 243 | : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1) |
| 244 | : "d0", "d1", "a0", "a1", "memory"); |
| 245 | } |
| 246 | |
| 247 | #ifdef BPL |
| 248 | |
| 249 | /* |
| 250 | * This expands a up to 8 bit color into two longs |
| 251 | * for movel operations. |
| 252 | */ |
| 253 | static const u32 four2long[] = { |
| 254 | 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff, |
| 255 | 0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff, |
| 256 | 0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff, |
| 257 | 0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff, |
| 258 | }; |
| 259 | |
| 260 | static inline void expand8_col2mask(u8 c, u32 m[]) |
| 261 | { |
| 262 | m[0] = four2long[c & 15]; |
| 263 | #if BPL > 4 |
| 264 | m[1] = four2long[c >> 4]; |
| 265 | #endif |
| 266 | } |
| 267 | |
| 268 | static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[]) |
| 269 | { |
| 270 | fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]); |
| 271 | #if BPL > 4 |
| 272 | fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]); |
| 273 | #endif |
| 274 | } |
| 275 | |
| 276 | /* |
| 277 | * set an 8bit value to a color |
| 278 | */ |
| 279 | static inline void fill8_col(u8 *dst, u32 m[]) |
| 280 | { |
| 281 | u32 tmp = m[0]; |
| 282 | dst[0] = tmp; |
| 283 | dst[2] = (tmp >>= 8); |
| 284 | #if BPL > 2 |
| 285 | dst[4] = (tmp >>= 8); |
| 286 | dst[6] = tmp >> 8; |
| 287 | #endif |
| 288 | #if BPL > 4 |
| 289 | tmp = m[1]; |
| 290 | dst[8] = tmp; |
| 291 | dst[10] = (tmp >>= 8); |
| 292 | dst[12] = (tmp >>= 8); |
| 293 | dst[14] = tmp >> 8; |
| 294 | #endif |
| 295 | } |
| 296 | |
| 297 | /* |
| 298 | * set an 8bit value according to foreground/background color |
| 299 | */ |
| 300 | static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask) |
| 301 | { |
| 302 | u32 fgm[2], bgm[2], tmp; |
| 303 | |
| 304 | expand8_2col2mask(fg, bg, fgm, bgm); |
| 305 | |
| 306 | mask |= mask << 8; |
| 307 | #if BPL > 2 |
| 308 | mask |= mask << 16; |
| 309 | #endif |
| 310 | tmp = (mask & fgm[0]) ^ bgm[0]; |
| 311 | dst[0] = tmp; |
| 312 | dst[2] = (tmp >>= 8); |
| 313 | #if BPL > 2 |
| 314 | dst[4] = (tmp >>= 8); |
| 315 | dst[6] = tmp >> 8; |
| 316 | #endif |
| 317 | #if BPL > 4 |
| 318 | tmp = (mask & fgm[1]) ^ bgm[1]; |
| 319 | dst[8] = tmp; |
| 320 | dst[10] = (tmp >>= 8); |
| 321 | dst[12] = (tmp >>= 8); |
| 322 | dst[14] = tmp >> 8; |
| 323 | #endif |
| 324 | } |
| 325 | |
| 326 | static const u32 two2word[] = { |
| 327 | 0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff |
| 328 | }; |
| 329 | |
| 330 | static inline void expand16_col2mask(u8 c, u32 m[]) |
| 331 | { |
| 332 | m[0] = two2word[c & 3]; |
| 333 | #if BPL > 2 |
| 334 | m[1] = two2word[(c >> 2) & 3]; |
| 335 | #endif |
| 336 | #if BPL > 4 |
| 337 | m[2] = two2word[(c >> 4) & 3]; |
| 338 | m[3] = two2word[c >> 6]; |
| 339 | #endif |
| 340 | } |
| 341 | |
| 342 | static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[]) |
| 343 | { |
| 344 | bgm[0] = two2word[bg & 3]; |
| 345 | fgm[0] = two2word[fg & 3] ^ bgm[0]; |
| 346 | #if BPL > 2 |
| 347 | bgm[1] = two2word[(bg >> 2) & 3]; |
| 348 | fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1]; |
| 349 | #endif |
| 350 | #if BPL > 4 |
| 351 | bgm[2] = two2word[(bg >> 4) & 3]; |
| 352 | fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2]; |
| 353 | bgm[3] = two2word[bg >> 6]; |
| 354 | fgm[3] = two2word[fg >> 6] ^ bgm[3]; |
| 355 | #endif |
| 356 | } |
| 357 | |
| 358 | static inline u32 *fill16_col(u32 *dst, int rows, u32 m[]) |
| 359 | { |
| 360 | while (rows) { |
| 361 | *dst++ = m[0]; |
| 362 | #if BPL > 2 |
| 363 | *dst++ = m[1]; |
| 364 | #endif |
| 365 | #if BPL > 4 |
| 366 | *dst++ = m[2]; |
| 367 | *dst++ = m[3]; |
| 368 | #endif |
| 369 | rows--; |
| 370 | } |
| 371 | return dst; |
| 372 | } |
| 373 | |
| 374 | static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes) |
| 375 | { |
| 376 | u32 *s, *d, v; |
| 377 | |
| 378 | s = src; |
| 379 | d = dst; |
| 380 | do { |
| 381 | v = (*s++ & mask) | (*d & ~mask); |
| 382 | *d++ = v; |
| 383 | #if BPL > 2 |
| 384 | v = (*s++ & mask) | (*d & ~mask); |
| 385 | *d++ = v; |
| 386 | #endif |
| 387 | #if BPL > 4 |
| 388 | v = (*s++ & mask) | (*d & ~mask); |
| 389 | *d++ = v; |
| 390 | v = (*s++ & mask) | (*d & ~mask); |
| 391 | *d++ = v; |
| 392 | #endif |
| 393 | d = (u32 *)((u8 *)d + bytes); |
| 394 | s = (u32 *)((u8 *)s + bytes); |
| 395 | } while (--h); |
| 396 | } |
| 397 | |
| 398 | #endif |
| 399 | |
| 400 | #endif /* _VIDEO_ATAFB_UTILS_H */ |