blob: 9b48ebd72715fc8d3f2ffef086a3f26af654dfde [file] [log] [blame]
// SwiftShader Software Renderer
//
// Copyright(c) 2005-2013 TransGaming Inc.
//
// All rights reserved. No part of this software may be copied, distributed, transmitted,
// transcribed, stored in a retrieval system, translated into any human or computer
// language by any means, or disclosed to third parties without the explicit written
// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
// or implied, including but not limited to any patent rights, are granted to you.
//
11
12#include "Surface.hpp"
13
14#include "Color.hpp"
15#include "Context.hpp"
John Bauman19bac1e2014-05-06 15:23:49 -040016#include "Renderer.hpp"
John Bauman89401822014-05-06 15:04:28 -040017#include "Common/Half.hpp"
18#include "Common/Memory.hpp"
19#include "Common/CPUID.hpp"
20#include "Common/Resource.hpp"
21#include "Common/Debug.hpp"
John Bauman19bac1e2014-05-06 15:23:49 -040022#include "Reactor/Reactor.hpp"
John Bauman89401822014-05-06 15:04:28 -040023
24#include <xmmintrin.h>
25#include <emmintrin.h>
26
27#undef min
28#undef max
29
30namespace sw
31{
32 extern bool quadLayoutEnabled;
33 extern bool complementaryDepthBuffer;
34 extern TranscendentalPrecision logPrecision;
35
36 unsigned int *Surface::palette = 0;
37 unsigned int Surface::paletteID = 0;
38
John Bauman19bac1e2014-05-06 15:23:49 -040039 void Rect::clip(int minX, int minY, int maxX, int maxY)
40 {
Nicolas Capens22658242014-11-29 00:31:41 -050041 x0 = clamp(x0, minX, maxX);
42 y0 = clamp(y0, minY, maxY);
43 x1 = clamp(x1, minX, maxX);
44 y1 = clamp(y1, minY, maxY);
John Bauman19bac1e2014-05-06 15:23:49 -040045 }
46
John Bauman89401822014-05-06 15:04:28 -040047 void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
48 {
49 void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
50
51 write(element, color);
52 }
53
54 void Surface::Buffer::write(int x, int y, const Color<float> &color)
55 {
56 void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
57
58 write(element, color);
59 }
60
61 inline void Surface::Buffer::write(void *element, const Color<float> &color)
62 {
63 switch(format)
64 {
65 case FORMAT_A8:
66 *(unsigned char*)element = unorm<8>(color.a);
67 break;
68 case FORMAT_R8:
69 *(unsigned char*)element = unorm<8>(color.r);
70 break;
71 case FORMAT_R3G3B2:
72 *(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
73 break;
74 case FORMAT_A8R3G3B2:
75 *(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
76 break;
77 case FORMAT_X4R4G4B4:
78 *(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
79 break;
80 case FORMAT_A4R4G4B4:
81 *(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
82 break;
Nicolas Capens80594422015-06-09 16:42:56 -040083 case FORMAT_R4G4B4A4:
84 *(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
85 break;
John Bauman89401822014-05-06 15:04:28 -040086 case FORMAT_R5G6B5:
87 *(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
88 break;
89 case FORMAT_A1R5G5B5:
90 *(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
91 break;
Nicolas Capens80594422015-06-09 16:42:56 -040092 case FORMAT_R5G5B5A1:
93 *(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<5>(color.a) << 0);
94 break;
John Bauman89401822014-05-06 15:04:28 -040095 case FORMAT_X1R5G5B5:
96 *(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
97 break;
98 case FORMAT_A8R8G8B8:
99 *(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
100 break;
101 case FORMAT_X8R8G8B8:
102 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
103 break;
104 case FORMAT_A8B8G8R8:
105 *(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
106 break;
107 case FORMAT_X8B8G8R8:
108 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
109 break;
110 case FORMAT_A2R10G10B10:
111 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
112 break;
113 case FORMAT_A2B10G10R10:
114 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
115 break;
116 case FORMAT_G8R8:
117 *(unsigned int*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
118 break;
119 case FORMAT_G16R16:
120 *(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
121 break;
122 case FORMAT_A16B16G16R16:
123 ((unsigned short*)element)[0] = unorm<16>(color.r);
124 ((unsigned short*)element)[1] = unorm<16>(color.g);
125 ((unsigned short*)element)[2] = unorm<16>(color.b);
126 ((unsigned short*)element)[3] = unorm<16>(color.a);
127 break;
128 case FORMAT_V8U8:
129 *(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
130 break;
131 case FORMAT_L6V5U5:
132 *(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
133 break;
134 case FORMAT_Q8W8V8U8:
135 *(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
136 break;
137 case FORMAT_X8L8V8U8:
138 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
139 break;
140 case FORMAT_V16U16:
141 *(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
142 break;
143 case FORMAT_A2W10V10U10:
144 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
145 break;
146 case FORMAT_A16W16V16U16:
147 ((unsigned short*)element)[0] = snorm<16>(color.r);
148 ((unsigned short*)element)[1] = snorm<16>(color.g);
149 ((unsigned short*)element)[2] = snorm<16>(color.b);
150 ((unsigned short*)element)[3] = unorm<16>(color.a);
151 break;
152 case FORMAT_Q16W16V16U16:
153 ((unsigned short*)element)[0] = snorm<16>(color.r);
154 ((unsigned short*)element)[1] = snorm<16>(color.g);
155 ((unsigned short*)element)[2] = snorm<16>(color.b);
156 ((unsigned short*)element)[3] = snorm<16>(color.a);
157 break;
158 case FORMAT_R8G8B8:
159 ((unsigned char*)element)[0] = unorm<8>(color.b);
160 ((unsigned char*)element)[1] = unorm<8>(color.g);
161 ((unsigned char*)element)[2] = unorm<8>(color.r);
162 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400163 case FORMAT_B8G8R8:
164 ((unsigned char*)element)[0] = unorm<8>(color.r);
165 ((unsigned char*)element)[1] = unorm<8>(color.g);
166 ((unsigned char*)element)[2] = unorm<8>(color.b);
167 break;
John Bauman89401822014-05-06 15:04:28 -0400168 case FORMAT_R16F:
169 *(half*)element = (half)color.r;
170 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400171 case FORMAT_A16F:
172 *(half*)element = (half)color.a;
173 break;
John Bauman89401822014-05-06 15:04:28 -0400174 case FORMAT_G16R16F:
175 ((half*)element)[0] = (half)color.r;
176 ((half*)element)[1] = (half)color.g;
177 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400178 case FORMAT_B16G16R16F:
179 ((half*)element)[0] = (half)color.r;
180 ((half*)element)[1] = (half)color.g;
181 ((half*)element)[2] = (half)color.b;
182 break;
John Bauman89401822014-05-06 15:04:28 -0400183 case FORMAT_A16B16G16R16F:
184 ((half*)element)[0] = (half)color.r;
185 ((half*)element)[1] = (half)color.g;
186 ((half*)element)[2] = (half)color.b;
187 ((half*)element)[3] = (half)color.a;
188 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400189 case FORMAT_A32F:
190 *(float*)element = color.a;
191 break;
John Bauman89401822014-05-06 15:04:28 -0400192 case FORMAT_R32F:
193 *(float*)element = color.r;
194 break;
195 case FORMAT_G32R32F:
196 ((float*)element)[0] = color.r;
197 ((float*)element)[1] = color.g;
198 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400199 case FORMAT_B32G32R32F:
200 ((float*)element)[0] = color.r;
201 ((float*)element)[1] = color.g;
202 ((float*)element)[2] = color.b;
203 break;
John Bauman89401822014-05-06 15:04:28 -0400204 case FORMAT_A32B32G32R32F:
205 ((float*)element)[0] = color.r;
206 ((float*)element)[1] = color.g;
207 ((float*)element)[2] = color.b;
208 ((float*)element)[3] = color.a;
209 break;
210 case FORMAT_D32F:
211 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -0400212 case FORMAT_D32FS8_TEXTURE:
213 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -0400214 *((float*)element) = color.r;
215 break;
216 case FORMAT_D32F_COMPLEMENTARY:
217 *((float*)element) = 1 - color.r;
218 break;
219 case FORMAT_S8:
220 *((unsigned char*)element) = unorm<8>(color.r);
221 break;
222 case FORMAT_L8:
223 *(unsigned char*)element = unorm<8>(color.r);
224 break;
225 case FORMAT_A4L4:
226 *(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
227 break;
228 case FORMAT_L16:
229 *(unsigned short*)element = unorm<16>(color.r);
230 break;
231 case FORMAT_A8L8:
232 *(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
233 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400234 case FORMAT_L16F:
235 *(half*)element = (half)color.r;
236 break;
237 case FORMAT_A16L16F:
238 ((half*)element)[0] = (half)color.r;
239 ((half*)element)[1] = (half)color.a;
240 break;
241 case FORMAT_L32F:
242 *(float*)element = color.r;
243 break;
244 case FORMAT_A32L32F:
245 ((float*)element)[0] = color.r;
246 ((float*)element)[1] = color.a;
247 break;
John Bauman89401822014-05-06 15:04:28 -0400248 default:
249 ASSERT(false);
250 }
251 }
252
253 Color<float> Surface::Buffer::read(int x, int y, int z) const
254 {
255 void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
256
257 return read(element);
258 }
259
260 Color<float> Surface::Buffer::read(int x, int y) const
261 {
262 void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
263
264 return read(element);
265 }
266
267 inline Color<float> Surface::Buffer::read(void *element) const
268 {
Nicolas Capens3f439242015-06-09 16:33:50 -0400269 float r = 0.0f;
270 float g = 0.0f;
271 float b = 0.0f;
272 float a = 1.0f;
John Bauman89401822014-05-06 15:04:28 -0400273
274 switch(format)
275 {
276 case FORMAT_P8:
277 {
278 ASSERT(palette);
279
280 unsigned int abgr = palette[*(unsigned char*)element];
281
282 r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
283 g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
284 b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
285 a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
286 }
287 break;
288 case FORMAT_A8P8:
289 {
290 ASSERT(palette);
291
292 unsigned int bgr = palette[((unsigned char*)element)[0]];
293
294 r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
295 g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
296 b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
297 a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
298 }
299 break;
300 case FORMAT_A8:
301 r = 0;
302 g = 0;
303 b = 0;
304 a = *(unsigned char*)element * (1.0f / 0xFF);
305 break;
306 case FORMAT_R8:
307 r = *(unsigned char*)element * (1.0f / 0xFF);
308 break;
309 case FORMAT_R3G3B2:
310 {
311 unsigned char rgb = *(unsigned char*)element;
312
313 r = (rgb & 0xE0) * (1.0f / 0xE0);
314 g = (rgb & 0x1C) * (1.0f / 0x1C);
315 b = (rgb & 0x03) * (1.0f / 0x03);
316 }
317 break;
318 case FORMAT_A8R3G3B2:
319 {
320 unsigned short argb = *(unsigned short*)element;
321
322 a = (argb & 0xFF00) * (1.0f / 0xFF00);
323 r = (argb & 0x00E0) * (1.0f / 0x00E0);
324 g = (argb & 0x001C) * (1.0f / 0x001C);
325 b = (argb & 0x0003) * (1.0f / 0x0003);
326 }
327 break;
328 case FORMAT_X4R4G4B4:
329 {
330 unsigned short rgb = *(unsigned short*)element;
331
332 r = (rgb & 0x0F00) * (1.0f / 0x0F00);
333 g = (rgb & 0x00F0) * (1.0f / 0x00F0);
334 b = (rgb & 0x000F) * (1.0f / 0x000F);
335 }
336 break;
337 case FORMAT_A4R4G4B4:
338 {
339 unsigned short argb = *(unsigned short*)element;
340
341 a = (argb & 0xF000) * (1.0f / 0xF000);
342 r = (argb & 0x0F00) * (1.0f / 0x0F00);
343 g = (argb & 0x00F0) * (1.0f / 0x00F0);
344 b = (argb & 0x000F) * (1.0f / 0x000F);
345 }
346 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400347 case FORMAT_R4G4B4A4:
348 {
349 unsigned short rgba = *(unsigned short*)element;
350
351 r = (rgba & 0xF000) * (1.0f / 0xF000);
352 g = (rgba & 0x0F00) * (1.0f / 0x0F00);
353 b = (rgba & 0x00F0) * (1.0f / 0x00F0);
354 a = (rgba & 0x000F) * (1.0f / 0x000F);
355 }
356 break;
John Bauman89401822014-05-06 15:04:28 -0400357 case FORMAT_R5G6B5:
358 {
359 unsigned short rgb = *(unsigned short*)element;
360
361 r = (rgb & 0xF800) * (1.0f / 0xF800);
362 g = (rgb & 0x07E0) * (1.0f / 0x07E0);
363 b = (rgb & 0x001F) * (1.0f / 0x001F);
364 }
365 break;
366 case FORMAT_A1R5G5B5:
367 {
368 unsigned short argb = *(unsigned short*)element;
369
370 a = (argb & 0x8000) * (1.0f / 0x8000);
371 r = (argb & 0x7C00) * (1.0f / 0x7C00);
372 g = (argb & 0x03E0) * (1.0f / 0x03E0);
373 b = (argb & 0x001F) * (1.0f / 0x001F);
374 }
375 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400376 case FORMAT_R5G5B5A1:
377 {
378 unsigned short rgba = *(unsigned short*)element;
379
380 r = (rgba & 0xF800) * (1.0f / 0xF800);
381 g = (rgba & 0x07C0) * (1.0f / 0x07C0);
382 b = (rgba & 0x003E) * (1.0f / 0x003E);
383 a = (rgba & 0x0001) * (1.0f / 0x0001);
384 }
385 break;
John Bauman89401822014-05-06 15:04:28 -0400386 case FORMAT_X1R5G5B5:
387 {
388 unsigned short xrgb = *(unsigned short*)element;
389
390 r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
391 g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
392 b = (xrgb & 0x001F) * (1.0f / 0x001F);
393 }
394 break;
395 case FORMAT_A8R8G8B8:
396 {
397 unsigned int argb = *(unsigned int*)element;
398
399 a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
400 r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
401 g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
402 b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
403 }
404 break;
405 case FORMAT_X8R8G8B8:
406 {
407 unsigned int xrgb = *(unsigned int*)element;
408
409 r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
410 g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
411 b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
412 }
413 break;
414 case FORMAT_A8B8G8R8:
415 {
416 unsigned int abgr = *(unsigned int*)element;
417
418 a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
419 b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
420 g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
421 r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
422 }
423 break;
424 case FORMAT_X8B8G8R8:
425 {
426 unsigned int xbgr = *(unsigned int*)element;
427
428 b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
429 g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
430 r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
431 }
432 break;
433 case FORMAT_G8R8:
434 {
435 unsigned short gr = *(unsigned short*)element;
436
437 g = (gr & 0xFF00) * (1.0f / 0xFF00);
438 r = (gr & 0x00FF) * (1.0f / 0x00FF);
439 }
440 break;
441 case FORMAT_G16R16:
442 {
443 unsigned int gr = *(unsigned int*)element;
444
445 g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
446 r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
447 }
448 break;
449 case FORMAT_A2R10G10B10:
450 {
451 unsigned int argb = *(unsigned int*)element;
452
453 a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
454 r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
455 g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
456 b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
457 }
458 break;
459 case FORMAT_A2B10G10R10:
460 {
461 unsigned int abgr = *(unsigned int*)element;
462
463 a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
464 b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
465 g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
466 r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
467 }
468 break;
469 case FORMAT_A16B16G16R16:
470 r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
471 g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
472 b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
473 a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
474 break;
475 case FORMAT_V8U8:
476 {
477 unsigned short vu = *(unsigned short*)element;
478
479 r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
480 g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
481 }
482 break;
483 case FORMAT_L6V5U5:
484 {
485 unsigned short lvu = *(unsigned short*)element;
486
487 r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
488 g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
489 b = (lvu & 0xFC00) * (1.0f / 0xFC00);
490 }
491 break;
492 case FORMAT_Q8W8V8U8:
493 {
494 unsigned int qwvu = *(unsigned int*)element;
495
496 r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
497 g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
498 b = ((int)(qwvu & 0x00FF0000) << 8) * (1.0f / 0x7F000000);
499 a = ((int)(qwvu & 0xFF000000) << 0) * (1.0f / 0x7F000000);
500 }
501 break;
502 case FORMAT_X8L8V8U8:
503 {
504 unsigned int xlvu = *(unsigned int*)element;
505
506 r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
507 g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
508 b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
509 }
510 break;
511 case FORMAT_R8G8B8:
512 r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
513 g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
514 b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
515 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400516 case FORMAT_B8G8R8:
517 r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
518 g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
519 b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
520 break;
John Bauman89401822014-05-06 15:04:28 -0400521 case FORMAT_V16U16:
522 {
523 unsigned int vu = *(unsigned int*)element;
524
525 r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
526 g = ((int)(vu & 0xFFFF0000) << 0) * (1.0f / 0x7FFF0000);
527 }
528 break;
529 case FORMAT_A2W10V10U10:
530 {
531 unsigned int awvu = *(unsigned int*)element;
532
533 r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
534 g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
535 b = ((int)(awvu & 0x3FF00000) << 2) * (1.0f / 0x7FC00000);
536 a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
537 }
538 break;
539 case FORMAT_A16W16V16U16:
540 r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
541 g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
542 b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
543 a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
544 break;
545 case FORMAT_Q16W16V16U16:
546 r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
547 g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
548 b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
549 a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
550 break;
551 case FORMAT_L8:
552 r =
553 g =
554 b = *(unsigned char*)element * (1.0f / 0xFF);
555 break;
556 case FORMAT_A4L4:
557 {
558 unsigned char al = *(unsigned char*)element;
559
560 r =
561 g =
562 b = (al & 0x0F) * (1.0f / 0x0F);
563 a = (al & 0xF0) * (1.0f / 0xF0);
564 }
565 break;
566 case FORMAT_L16:
567 r =
568 g =
569 b = *(unsigned short*)element * (1.0f / 0xFFFF);
570 break;
571 case FORMAT_A8L8:
572 r =
573 g =
574 b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
575 a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
576 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400577 case FORMAT_L16F:
578 r =
579 g =
580 b = *(half*)element;
581 break;
582 case FORMAT_A16L16F:
583 r =
584 g =
585 b = ((half*)element)[0];
586 a = ((half*)element)[1];
587 break;
588 case FORMAT_L32F:
589 r =
590 g =
591 b = *(float*)element;
592 break;
593 case FORMAT_A32L32F:
594 r =
595 g =
596 b = ((float*)element)[0];
597 a = ((float*)element)[1];
598 break;
599 case FORMAT_A16F:
600 a = *(half*)element;
601 break;
John Bauman89401822014-05-06 15:04:28 -0400602 case FORMAT_R16F:
603 r = *(half*)element;
604 break;
605 case FORMAT_G16R16F:
606 r = ((half*)element)[0];
607 g = ((half*)element)[1];
608 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400609 case FORMAT_B16G16R16F:
610 r = ((half*)element)[0];
611 g = ((half*)element)[1];
612 b = ((half*)element)[2];
613 break;
John Bauman89401822014-05-06 15:04:28 -0400614 case FORMAT_A16B16G16R16F:
615 r = ((half*)element)[0];
616 g = ((half*)element)[1];
617 b = ((half*)element)[2];
618 a = ((half*)element)[3];
619 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400620 case FORMAT_A32F:
621 a = *(float*)element;
622 break;
John Bauman89401822014-05-06 15:04:28 -0400623 case FORMAT_R32F:
624 r = *(float*)element;
625 break;
626 case FORMAT_G32R32F:
627 r = ((float*)element)[0];
628 g = ((float*)element)[1];
629 break;
Nicolas Capens80594422015-06-09 16:42:56 -0400630 case FORMAT_B32G32R32F:
631 r = ((float*)element)[0];
632 g = ((float*)element)[1];
633 b = ((float*)element)[2];
634 break;
John Bauman89401822014-05-06 15:04:28 -0400635 case FORMAT_A32B32G32R32F:
636 r = ((float*)element)[0];
637 g = ((float*)element)[1];
638 b = ((float*)element)[2];
639 a = ((float*)element)[3];
640 break;
641 case FORMAT_D32F:
642 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -0400643 case FORMAT_D32FS8_TEXTURE:
644 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -0400645 r = *(float*)element;
646 g = r;
647 b = r;
648 a = r;
649 break;
650 case FORMAT_D32F_COMPLEMENTARY:
John Bauman66b8ab22014-05-06 15:57:45 -0400651 r = 1.0f - *(float*)element;
John Bauman89401822014-05-06 15:04:28 -0400652 g = r;
653 b = r;
654 a = r;
655 break;
656 case FORMAT_S8:
657 r = *(unsigned char*)element * (1.0f / 0xFF);
658 break;
659 default:
660 ASSERT(false);
661 }
662
663 // if(sRGB)
664 // {
665 // r = sRGBtoLinear(r);
666 // g = sRGBtoLinear(g);
667 // b = sRGBtoLinear(b);
668 // }
669
670 return Color<float>(r, g, b, a);
671 }
672
673 Color<float> Surface::Buffer::sample(float x, float y, float z) const
674 {
675 x -= 0.5f;
676 y -= 0.5f;
677 z -= 0.5f;
678
679 int x0 = clamp((int)x, 0, width - 1);
680 int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
681
682 int y0 = clamp((int)y, 0, height - 1);
683 int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
684
685 int z0 = clamp((int)z, 0, depth - 1);
686 int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
687
688 Color<float> c000 = read(x0, y0, z0);
689 Color<float> c100 = read(x1, y0, z0);
690 Color<float> c010 = read(x0, y1, z0);
691 Color<float> c110 = read(x1, y1, z0);
692 Color<float> c001 = read(x0, y0, z1);
693 Color<float> c101 = read(x1, y0, z1);
694 Color<float> c011 = read(x0, y1, z1);
695 Color<float> c111 = read(x1, y1, z1);
696
697 float fx = x - x0;
698 float fy = y - y0;
699 float fz = z - z0;
700
701 c000 *= (1 - fx) * (1 - fy) * (1 - fz);
702 c100 *= fx * (1 - fy) * (1 - fz);
703 c010 *= (1 - fx) * fy * (1 - fz);
704 c110 *= fx * fy * (1 - fz);
705 c001 *= (1 - fx) * (1 - fy) * fz;
706 c101 *= fx * (1 - fy) * fz;
707 c011 *= (1 - fx) * fy * fz;
708 c111 *= fx * fy * fz;
709
710 return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
711 }
712
713 Color<float> Surface::Buffer::sample(float x, float y) const
714 {
715 x -= 0.5f;
716 y -= 0.5f;
717
718 int x0 = clamp((int)x, 0, width - 1);
719 int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
720
721 int y0 = clamp((int)y, 0, height - 1);
722 int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
723
724 Color<float> c00 = read(x0, y0);
725 Color<float> c10 = read(x1, y0);
726 Color<float> c01 = read(x0, y1);
727 Color<float> c11 = read(x1, y1);
728
729 float fx = x - x0;
730 float fy = y - y0;
731
732 c00 *= (1 - fx) * (1 - fy);
733 c10 *= fx * (1 - fy);
734 c01 *= (1 - fx) * fy;
735 c11 *= fx * fy;
736
737 return c00 + c10 + c01 + c11;
738 }
739
John Bauman19bac1e2014-05-06 15:23:49 -0400740 void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
John Bauman89401822014-05-06 15:04:28 -0400741 {
742 this->lock = lock;
743
744 switch(lock)
745 {
746 case LOCK_UNLOCKED:
747 case LOCK_READONLY:
748 break;
749 case LOCK_WRITEONLY:
750 case LOCK_READWRITE:
751 case LOCK_DISCARD:
752 dirty = true;
753 break;
754 default:
755 ASSERT(false);
756 }
757
John Baumand4ae8632014-05-06 16:18:33 -0400758 if(buffer)
John Bauman89401822014-05-06 15:04:28 -0400759 {
John Baumand4ae8632014-05-06 16:18:33 -0400760 switch(format)
761 {
762 #if S3TC_SUPPORT
763 case FORMAT_DXT1:
764 #endif
765 case FORMAT_ATI1:
Nicolas Capens22658242014-11-29 00:31:41 -0500766 case FORMAT_ETC1:
John Baumand4ae8632014-05-06 16:18:33 -0400767 return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
768 #if S3TC_SUPPORT
769 case FORMAT_DXT3:
770 case FORMAT_DXT5:
771 #endif
772 case FORMAT_ATI2:
773 return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
774 default:
775 return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
776 }
John Bauman89401822014-05-06 15:04:28 -0400777 }
778
779 return 0;
780 }
781
782 void Surface::Buffer::unlockRect()
783 {
784 lock = LOCK_UNLOCKED;
785 }
786
Nicolas Capens477314b2015-06-09 16:47:29 -0400787 Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
788 {
789 resource = new Resource(0);
790 hasParent = false;
791 ownExternal = false;
792 depth = max(1, depth);
793
794 external.buffer = pixels;
795 external.width = width;
796 external.height = height;
797 external.depth = depth;
798 external.format = format;
799 external.bytes = bytes(external.format);
800 external.pitchB = pitch;
801 external.pitchP = pitch / external.bytes;
802 external.sliceB = slice;
803 external.sliceP = slice / external.bytes;
804 external.lock = LOCK_UNLOCKED;
805 external.dirty = true;
806
807 internal.buffer = 0;
808 internal.width = width;
809 internal.height = height;
810 internal.depth = depth;
811 internal.format = selectInternalFormat(format);
812 internal.bytes = bytes(internal.format);
813 internal.pitchB = pitchB(internal.width, internal.format, false);
814 internal.pitchP = pitchP(internal.width, internal.format, false);
815 internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
816 internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
817 internal.lock = LOCK_UNLOCKED;
818 internal.dirty = false;
819
820 stencil.buffer = 0;
821 stencil.width = width;
822 stencil.height = height;
823 stencil.depth = depth;
824 stencil.format = FORMAT_S8;
825 stencil.bytes = bytes(stencil.format);
826 stencil.pitchB = pitchB(stencil.width, stencil.format, false);
827 stencil.pitchP = pitchP(stencil.width, stencil.format, false);
828 stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
829 stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
830 stencil.lock = LOCK_UNLOCKED;
831 stencil.dirty = false;
832
833 dirtyMipmaps = true;
834 paletteUsed = 0;
835 }
836
John Bauman89401822014-05-06 15:04:28 -0400837 Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget)
838 {
839 resource = texture ? texture : new Resource(0);
John Bauman19bac1e2014-05-06 15:23:49 -0400840 hasParent = texture != 0;
Nicolas Capens477314b2015-06-09 16:47:29 -0400841 ownExternal = true;
John Bauman89401822014-05-06 15:04:28 -0400842 depth = max(1, depth);
843
844 external.buffer = 0;
845 external.width = width;
846 external.height = height;
847 external.depth = depth;
848 external.format = format;
849 external.bytes = bytes(external.format);
850 external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
851 external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
852 external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
853 external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
854 external.lock = LOCK_UNLOCKED;
855 external.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400856
857 internal.buffer = 0;
858 internal.width = width;
859 internal.height = height;
860 internal.depth = depth;
861 internal.format = selectInternalFormat(format);
862 internal.bytes = bytes(internal.format);
863 internal.pitchB = pitchB(internal.width, internal.format, renderTarget);
864 internal.pitchP = pitchP(internal.width, internal.format, renderTarget);
865 internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
866 internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
867 internal.lock = LOCK_UNLOCKED;
868 internal.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400869
870 stencil.buffer = 0;
871 stencil.width = width;
872 stencil.height = height;
873 stencil.depth = depth;
874 stencil.format = FORMAT_S8;
875 stencil.bytes = bytes(stencil.format);
876 stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
877 stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
878 stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
879 stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
880 stencil.lock = LOCK_UNLOCKED;
881 stencil.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400882
883 dirtyMipmaps = true;
John Bauman66b8ab22014-05-06 15:57:45 -0400884 paletteUsed = 0;
John Bauman89401822014-05-06 15:04:28 -0400885 }
886
887 Surface::~Surface()
888 {
John Bauman8a4f6fc2014-05-06 15:26:18 -0400889 // Synchronize so we can deallocate the buffers below
890 resource->lock(DESTRUCT);
891 resource->unlock();
892
John Bauman89401822014-05-06 15:04:28 -0400893 if(!hasParent)
894 {
895 resource->destruct();
896 }
897
Nicolas Capens477314b2015-06-09 16:47:29 -0400898 if(ownExternal)
899 {
900 deallocate(external.buffer);
901 }
John Bauman89401822014-05-06 15:04:28 -0400902
903 if(internal.buffer != external.buffer)
904 {
905 deallocate(internal.buffer);
906 }
907
908 deallocate(stencil.buffer);
909
910 external.buffer = 0;
911 internal.buffer = 0;
912 stencil.buffer = 0;
913 }
914
John Bauman19bac1e2014-05-06 15:23:49 -0400915 void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
John Bauman89401822014-05-06 15:04:28 -0400916 {
917 resource->lock(client);
918
919 if(!external.buffer)
920 {
921 if(internal.buffer && identicalFormats())
922 {
923 external.buffer = internal.buffer;
924 }
925 else
926 {
927 external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
928 }
929 }
930
931 if(internal.dirty)
932 {
933 if(lock != LOCK_DISCARD)
934 {
935 update(external, internal);
936 }
John Bauman66b8ab22014-05-06 15:57:45 -0400937
938 internal.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400939 }
940
941 switch(lock)
942 {
943 case LOCK_READONLY:
944 break;
945 case LOCK_WRITEONLY:
946 case LOCK_READWRITE:
947 case LOCK_DISCARD:
948 dirtyMipmaps = true;
949 break;
950 default:
951 ASSERT(false);
952 }
953
John Bauman19bac1e2014-05-06 15:23:49 -0400954 return external.lockRect(x, y, z, lock);
John Bauman89401822014-05-06 15:04:28 -0400955 }
956
957 void Surface::unlockExternal()
958 {
959 resource->unlock();
960
961 external.unlockRect();
962 }
963
John Bauman19bac1e2014-05-06 15:23:49 -0400964 void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
John Bauman89401822014-05-06 15:04:28 -0400965 {
966 if(lock != LOCK_UNLOCKED)
967 {
968 resource->lock(client);
969 }
970
971 if(!internal.buffer)
972 {
973 if(external.buffer && identicalFormats())
974 {
975 internal.buffer = external.buffer;
976 }
977 else
978 {
979 internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
980 }
981 }
982
983 // FIXME: WHQL requires conversion to lower external precision and back
984 if(logPrecision >= WHQL)
985 {
986 if(internal.dirty && renderTarget && internal.format != external.format)
987 {
988 if(lock != LOCK_DISCARD)
989 {
990 switch(external.format)
991 {
992 case FORMAT_R3G3B2:
993 case FORMAT_A8R3G3B2:
994 case FORMAT_A1R5G5B5:
995 case FORMAT_A2R10G10B10:
996 case FORMAT_A2B10G10R10:
997 lockExternal(0, 0, 0, LOCK_READWRITE, client);
998 unlockExternal();
999 break;
1000 default:
1001 // Difference passes WHQL
1002 break;
1003 }
1004 }
1005 }
1006 }
1007
John Bauman66b8ab22014-05-06 15:57:45 -04001008 if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
John Bauman89401822014-05-06 15:04:28 -04001009 {
1010 if(lock != LOCK_DISCARD)
1011 {
1012 update(internal, external);
1013 }
John Bauman89401822014-05-06 15:04:28 -04001014
John Bauman66b8ab22014-05-06 15:57:45 -04001015 external.dirty = false;
1016 paletteUsed = Surface::paletteID;
John Bauman89401822014-05-06 15:04:28 -04001017 }
1018
1019 switch(lock)
1020 {
1021 case LOCK_UNLOCKED:
1022 case LOCK_READONLY:
1023 break;
1024 case LOCK_WRITEONLY:
1025 case LOCK_READWRITE:
1026 case LOCK_DISCARD:
1027 dirtyMipmaps = true;
1028 break;
1029 default:
1030 ASSERT(false);
1031 }
1032
1033 if(lock == LOCK_READONLY && client == PUBLIC)
1034 {
1035 resolve();
1036 }
1037
John Bauman19bac1e2014-05-06 15:23:49 -04001038 return internal.lockRect(x, y, z, lock);
John Bauman89401822014-05-06 15:04:28 -04001039 }
1040
1041 void Surface::unlockInternal()
1042 {
1043 resource->unlock();
1044
1045 internal.unlockRect();
1046 }
1047
1048 void *Surface::lockStencil(int front, Accessor client)
1049 {
1050 resource->lock(client);
1051
1052 if(!stencil.buffer)
1053 {
1054 stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1055 }
1056
John Bauman89401822014-05-06 15:04:28 -04001057 return stencil.lockRect(0, 0, front, LOCK_READWRITE); // FIXME
1058 }
1059
1060 void Surface::unlockStencil()
1061 {
1062 resource->unlock();
1063
1064 stencil.unlockRect();
1065 }
1066
1067 int Surface::bytes(Format format)
1068 {
1069 switch(format)
1070 {
1071 case FORMAT_NULL: return 0;
1072 case FORMAT_P8: return 1;
1073 case FORMAT_A8P8: return 2;
1074 case FORMAT_A8: return 1;
1075 case FORMAT_R8: return 1;
1076 case FORMAT_R3G3B2: return 1;
1077 case FORMAT_A8R3G3B2: return 2;
1078 case FORMAT_R5G6B5: return 2;
1079 case FORMAT_A1R5G5B5: return 2;
1080 case FORMAT_X1R5G5B5: return 2;
Nicolas Capens80594422015-06-09 16:42:56 -04001081 case FORMAT_R5G5B5A1: return 2;
John Bauman89401822014-05-06 15:04:28 -04001082 case FORMAT_X4R4G4B4: return 2;
1083 case FORMAT_A4R4G4B4: return 2;
Nicolas Capens80594422015-06-09 16:42:56 -04001084 case FORMAT_R4G4B4A4: return 2;
John Bauman89401822014-05-06 15:04:28 -04001085 case FORMAT_R8G8B8: return 3;
Nicolas Capens80594422015-06-09 16:42:56 -04001086 case FORMAT_B8G8R8: return 3;
John Bauman89401822014-05-06 15:04:28 -04001087 case FORMAT_X8R8G8B8: return 4;
1088 // case FORMAT_X8G8R8B8Q: return 4;
1089 case FORMAT_A8R8G8B8: return 4;
1090 // case FORMAT_A8G8R8B8Q: return 4;
1091 case FORMAT_X8B8G8R8: return 4;
1092 case FORMAT_A8B8G8R8: return 4;
1093 case FORMAT_A2R10G10B10: return 4;
1094 case FORMAT_A2B10G10R10: return 4;
1095 case FORMAT_G8R8: return 2;
1096 case FORMAT_G16R16: return 4;
1097 case FORMAT_A16B16G16R16: return 8;
1098 // Compressed formats
1099 #if S3TC_SUPPORT
1100 case FORMAT_DXT1: return 2; // Column of four pixels
1101 case FORMAT_DXT3: return 4; // Column of four pixels
1102 case FORMAT_DXT5: return 4; // Column of four pixels
John Bauman66b8ab22014-05-06 15:57:45 -04001103 #endif
John Bauman89401822014-05-06 15:04:28 -04001104 case FORMAT_ATI1: return 2; // Column of four pixels
1105 case FORMAT_ATI2: return 4; // Column of four pixels
Nicolas Capens22658242014-11-29 00:31:41 -05001106 case FORMAT_ETC1: return 2; // Column of four pixels
John Bauman89401822014-05-06 15:04:28 -04001107 // Bumpmap formats
1108 case FORMAT_V8U8: return 2;
1109 case FORMAT_L6V5U5: return 2;
1110 case FORMAT_Q8W8V8U8: return 4;
1111 case FORMAT_X8L8V8U8: return 4;
1112 case FORMAT_A2W10V10U10: return 4;
1113 case FORMAT_V16U16: return 4;
1114 case FORMAT_A16W16V16U16: return 8;
1115 case FORMAT_Q16W16V16U16: return 8;
1116 // Luminance formats
1117 case FORMAT_L8: return 1;
1118 case FORMAT_A4L4: return 1;
1119 case FORMAT_L16: return 2;
1120 case FORMAT_A8L8: return 2;
Nicolas Capens80594422015-06-09 16:42:56 -04001121 case FORMAT_L16F: return 2;
1122 case FORMAT_A16L16F: return 4;
1123 case FORMAT_L32F: return 4;
1124 case FORMAT_A32L32F: return 8;
John Bauman89401822014-05-06 15:04:28 -04001125 // Floating-point formats
Nicolas Capens80594422015-06-09 16:42:56 -04001126 case FORMAT_A16F: return 2;
John Bauman89401822014-05-06 15:04:28 -04001127 case FORMAT_R16F: return 2;
1128 case FORMAT_G16R16F: return 4;
Nicolas Capens80594422015-06-09 16:42:56 -04001129 case FORMAT_B16G16R16F: return 6;
John Bauman89401822014-05-06 15:04:28 -04001130 case FORMAT_A16B16G16R16F: return 8;
Nicolas Capens80594422015-06-09 16:42:56 -04001131 case FORMAT_A32F: return 4;
John Bauman89401822014-05-06 15:04:28 -04001132 case FORMAT_R32F: return 4;
1133 case FORMAT_G32R32F: return 8;
Nicolas Capens80594422015-06-09 16:42:56 -04001134 case FORMAT_B32G32R32F: return 12;
John Bauman89401822014-05-06 15:04:28 -04001135 case FORMAT_A32B32G32R32F: return 16;
1136 // Depth/stencil formats
1137 case FORMAT_D16: return 2;
1138 case FORMAT_D32: return 4;
1139 case FORMAT_D24X8: return 4;
1140 case FORMAT_D24S8: return 4;
1141 case FORMAT_D24FS8: return 4;
1142 case FORMAT_D32F: return 4;
1143 case FORMAT_D32F_COMPLEMENTARY: return 4;
1144 case FORMAT_D32F_LOCKABLE: return 4;
John Bauman66b8ab22014-05-06 15:57:45 -04001145 case FORMAT_D32FS8_TEXTURE: return 4;
1146 case FORMAT_D32FS8_SHADOW: return 4;
1147 case FORMAT_DF24S8: return 4;
1148 case FORMAT_DF16S8: return 2;
John Bauman89401822014-05-06 15:04:28 -04001149 case FORMAT_INTZ: return 4;
1150 case FORMAT_S8: return 1;
1151 default:
1152 ASSERT(false);
1153 }
1154
1155 return 0;
1156 }
1157
1158 int Surface::pitchB(int width, Format format, bool target)
1159 {
1160 if(target || isDepth(format) || isStencil(format))
1161 {
1162 width = ((width + 1) & ~1);
1163 }
1164
1165 switch(format)
1166 {
1167 #if S3TC_SUPPORT
1168 case FORMAT_DXT1:
Nicolas Capens22658242014-11-29 00:31:41 -05001169 #endif
1170 case FORMAT_ETC1:
John Bauman89401822014-05-06 15:04:28 -04001171 return 8 * ((width + 3) / 4); // 64 bit per 4x4 block, computed per 4 rows
Nicolas Capens22658242014-11-29 00:31:41 -05001172 #if S3TC_SUPPORT
John Bauman89401822014-05-06 15:04:28 -04001173 case FORMAT_DXT3:
1174 case FORMAT_DXT5:
1175 return 16 * ((width + 3) / 4); // 128 bit per 4x4 block, computed per 4 rows
John Bauman66b8ab22014-05-06 15:57:45 -04001176 #endif
John Bauman89401822014-05-06 15:04:28 -04001177 case FORMAT_ATI1:
1178 return 2 * ((width + 3) / 4); // 64 bit per 4x4 block, computed per row
1179 case FORMAT_ATI2:
1180 return 4 * ((width + 3) / 4); // 128 bit per 4x4 block, computed per row
John Bauman89401822014-05-06 15:04:28 -04001181 default:
1182 return bytes(format) * width;
1183 }
1184 }
1185
1186 int Surface::pitchP(int width, Format format, bool target)
1187 {
1188 int B = bytes(format);
1189
1190 return B > 0 ? pitchB(width, format, target) / B : 0;
1191 }
1192
1193 int Surface::sliceB(int width, int height, Format format, bool target)
1194 {
1195 if(target || isDepth(format) || isStencil(format))
1196 {
1197 height = ((height + 1) & ~1);
1198 }
1199
1200 switch(format)
1201 {
1202 #if S3TC_SUPPORT
1203 case FORMAT_DXT1:
1204 case FORMAT_DXT3:
1205 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04001206 #endif
Nicolas Capens22658242014-11-29 00:31:41 -05001207 case FORMAT_ETC1:
1208 return pitchB(width, format, target) * ((height + 3) / 4); // Pitch computed per 4 rows
1209 case FORMAT_ATI1:
1210 case FORMAT_ATI2:
John Bauman89401822014-05-06 15:04:28 -04001211 default:
Nicolas Capens22658242014-11-29 00:31:41 -05001212 return pitchB(width, format, target) * height; // Pitch computed per row
John Bauman89401822014-05-06 15:04:28 -04001213 }
1214 }
1215
1216 int Surface::sliceP(int width, int height, Format format, bool target)
1217 {
1218 int B = bytes(format);
1219
1220 return B > 0 ? sliceB(width, height, format, target) / B : 0;
1221 }
1222
1223 void Surface::update(Buffer &destination, Buffer &source)
1224 {
1225 // ASSERT(source.lock != LOCK_UNLOCKED);
1226 // ASSERT(destination.lock != LOCK_UNLOCKED);
1227
1228 if(destination.buffer != source.buffer)
1229 {
1230 ASSERT(source.dirty && !destination.dirty);
1231
1232 switch(source.format)
1233 {
1234 case FORMAT_R8G8B8: decodeR8G8B8(destination, source); break; // FIXME: Check destination format
John Bauman89401822014-05-06 15:04:28 -04001235 case FORMAT_R5G6B5: decodeR5G6B5(destination, source); break; // FIXME: Check destination format
1236 case FORMAT_X1R5G5B5: decodeX1R5G5B5(destination, source); break; // FIXME: Check destination format
1237 case FORMAT_A1R5G5B5: decodeA1R5G5B5(destination, source); break; // FIXME: Check destination format
1238 case FORMAT_X4R4G4B4: decodeX4R4G4B4(destination, source); break; // FIXME: Check destination format
1239 case FORMAT_A4R4G4B4: decodeA4R4G4B4(destination, source); break; // FIXME: Check destination format
1240 case FORMAT_P8: decodeP8(destination, source); break; // FIXME: Check destination format
1241 #if S3TC_SUPPORT
1242 case FORMAT_DXT1: decodeDXT1(destination, source); break; // FIXME: Check destination format
1243 case FORMAT_DXT3: decodeDXT3(destination, source); break; // FIXME: Check destination format
1244 case FORMAT_DXT5: decodeDXT5(destination, source); break; // FIXME: Check destination format
Nicolas Capens22658242014-11-29 00:31:41 -05001245 #endif
John Bauman89401822014-05-06 15:04:28 -04001246 case FORMAT_ATI1: decodeATI1(destination, source); break; // FIXME: Check destination format
1247 case FORMAT_ATI2: decodeATI2(destination, source); break; // FIXME: Check destination format
Nicolas Capens22658242014-11-29 00:31:41 -05001248 case FORMAT_ETC1: decodeETC1(destination, source); break; // FIXME: Check destination format
John Bauman89401822014-05-06 15:04:28 -04001249 default: genericUpdate(destination, source); break;
1250 }
1251 }
John Bauman89401822014-05-06 15:04:28 -04001252 }
1253
1254 void Surface::genericUpdate(Buffer &destination, Buffer &source)
1255 {
1256 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1257 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1258
1259 int depth = min(destination.depth, source.depth);
1260 int height = min(destination.height, source.height);
1261 int width = min(destination.width, source.width);
1262 int rowBytes = width * source.bytes;
1263
1264 for(int z = 0; z < depth; z++)
1265 {
1266 unsigned char *sourceRow = sourceSlice;
1267 unsigned char *destinationRow = destinationSlice;
1268
1269 for(int y = 0; y < height; y++)
1270 {
1271 if(source.format == destination.format)
1272 {
1273 memcpy(destinationRow, sourceRow, rowBytes);
1274 }
1275 else
1276 {
1277 unsigned char *sourceElement = sourceRow;
1278 unsigned char *destinationElement = destinationRow;
1279
1280 for(int x = 0; x < width; x++)
1281 {
1282 Color<float> color = source.read(sourceElement);
1283 destination.write(destinationElement, color);
1284
1285 sourceElement += source.bytes;
1286 destinationElement += destination.bytes;
1287 }
1288 }
1289
1290 sourceRow += source.pitchB;
1291 destinationRow += destination.pitchB;
1292 }
1293
1294 sourceSlice += source.sliceB;
1295 destinationSlice += destination.sliceB;
1296 }
1297 }
1298
1299 void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1300 {
1301 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1302 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1303
1304 for(int z = 0; z < destination.depth && z < source.depth; z++)
1305 {
1306 unsigned char *sourceRow = sourceSlice;
1307 unsigned char *destinationRow = destinationSlice;
1308
1309 for(int y = 0; y < destination.height && y < source.height; y++)
1310 {
1311 unsigned char *sourceElement = sourceRow;
1312 unsigned char *destinationElement = destinationRow;
1313
1314 for(int x = 0; x < destination.width && x < source.width; x++)
1315 {
1316 unsigned int b = sourceElement[0];
1317 unsigned int g = sourceElement[1];
1318 unsigned int r = sourceElement[2];
1319
1320 *(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1321
1322 sourceElement += source.bytes;
1323 destinationElement += destination.bytes;
1324 }
1325
1326 sourceRow += source.pitchB;
1327 destinationRow += destination.pitchB;
1328 }
1329
1330 sourceSlice += source.sliceB;
1331 destinationSlice += destination.sliceB;
1332 }
1333 }
1334
John Bauman89401822014-05-06 15:04:28 -04001335 void Surface::decodeR5G6B5(Buffer &destination, const Buffer &source)
1336 {
1337 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1338 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1339
1340 for(int z = 0; z < destination.depth && z < source.depth; z++)
1341 {
1342 unsigned char *sourceRow = sourceSlice;
1343 unsigned char *destinationRow = destinationSlice;
1344
1345 for(int y = 0; y < destination.height && y < source.height; y++)
1346 {
1347 unsigned char *sourceElement = sourceRow;
1348 unsigned char *destinationElement = destinationRow;
1349
1350 for(int x = 0; x < destination.width && x < source.width; x++)
1351 {
1352 unsigned int rgb = *(unsigned short*)sourceElement;
1353
1354 unsigned int r = (((rgb & 0xF800) * 67385 + 0x800000) >> 8) & 0x00FF0000;
1355 unsigned int g = (((rgb & 0x07E0) * 8289 + 0x8000) >> 8) & 0x0000FF00;
1356 unsigned int b = (((rgb & 0x001F) * 2106 + 0x80) >> 8);
1357
1358 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1359
1360 sourceElement += source.bytes;
1361 destinationElement += destination.bytes;
1362 }
1363
1364 sourceRow += source.pitchB;
1365 destinationRow += destination.pitchB;
1366 }
1367
1368 sourceSlice += source.sliceB;
1369 destinationSlice += destination.sliceB;
1370 }
1371 }
1372
1373 void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1374 {
1375 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1376 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1377
1378 for(int z = 0; z < destination.depth && z < source.depth; z++)
1379 {
1380 unsigned char *sourceRow = sourceSlice;
1381 unsigned char *destinationRow = destinationSlice;
1382
1383 for(int y = 0; y < destination.height && y < source.height; y++)
1384 {
1385 unsigned char *sourceElement = sourceRow;
1386 unsigned char *destinationElement = destinationRow;
1387
1388 for(int x = 0; x < destination.width && x < source.width; x++)
1389 {
1390 unsigned int xrgb = *(unsigned short*)sourceElement;
1391
1392 unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1393 unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1394 unsigned int b = (((xrgb & 0x001F) * 2106 + 0x80) >> 8);
1395
1396 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1397
1398 sourceElement += source.bytes;
1399 destinationElement += destination.bytes;
1400 }
1401
1402 sourceRow += source.pitchB;
1403 destinationRow += destination.pitchB;
1404 }
1405
1406 sourceSlice += source.sliceB;
1407 destinationSlice += destination.sliceB;
1408 }
1409 }
1410
1411 void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1412 {
1413 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1414 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1415
1416 for(int z = 0; z < destination.depth && z < source.depth; z++)
1417 {
1418 unsigned char *sourceRow = sourceSlice;
1419 unsigned char *destinationRow = destinationSlice;
1420
1421 for(int y = 0; y < destination.height && y < source.height; y++)
1422 {
1423 unsigned char *sourceElement = sourceRow;
1424 unsigned char *destinationElement = destinationRow;
1425
1426 for(int x = 0; x < destination.width && x < source.width; x++)
1427 {
1428 unsigned int argb = *(unsigned short*)sourceElement;
1429
1430 unsigned int a = (argb & 0x8000) * 130560;
1431 unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1432 unsigned int g = (((argb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1433 unsigned int b = (((argb & 0x001F) * 2106 + 0x80) >> 8);
1434
1435 *(unsigned int*)destinationElement = a | r | g | b;
1436
1437 sourceElement += source.bytes;
1438 destinationElement += destination.bytes;
1439 }
1440
1441 sourceRow += source.pitchB;
1442 destinationRow += destination.pitchB;
1443 }
1444
1445 sourceSlice += source.sliceB;
1446 destinationSlice += destination.sliceB;
1447 }
1448 }
1449
1450 void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1451 {
1452 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1453 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1454
1455 for(int z = 0; z < destination.depth && z < source.depth; z++)
1456 {
1457 unsigned char *sourceRow = sourceSlice;
1458 unsigned char *destinationRow = destinationSlice;
1459
1460 for(int y = 0; y < destination.height && y < source.height; y++)
1461 {
1462 unsigned char *sourceElement = sourceRow;
1463 unsigned char *destinationElement = destinationRow;
1464
1465 for(int x = 0; x < destination.width && x < source.width; x++)
1466 {
1467 unsigned int xrgb = *(unsigned short*)sourceElement;
1468
1469 unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
1470 unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
1471 unsigned int b = (xrgb & 0x000F) * 0x00000011;
1472
1473 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1474
1475 sourceElement += source.bytes;
1476 destinationElement += destination.bytes;
1477 }
1478
1479 sourceRow += source.pitchB;
1480 destinationRow += destination.pitchB;
1481 }
1482
1483 sourceSlice += source.sliceB;
1484 destinationSlice += destination.sliceB;
1485 }
1486 }
1487
1488 void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
1489 {
1490 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1491 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1492
1493 for(int z = 0; z < destination.depth && z < source.depth; z++)
1494 {
1495 unsigned char *sourceRow = sourceSlice;
1496 unsigned char *destinationRow = destinationSlice;
1497
1498 for(int y = 0; y < destination.height && y < source.height; y++)
1499 {
1500 unsigned char *sourceElement = sourceRow;
1501 unsigned char *destinationElement = destinationRow;
1502
1503 for(int x = 0; x < destination.width && x < source.width; x++)
1504 {
1505 unsigned int argb = *(unsigned short*)sourceElement;
1506
1507 unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
1508 unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
1509 unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
1510 unsigned int b = (argb & 0x000F) * 0x00000011;
1511
1512 *(unsigned int*)destinationElement = a | r | g | b;
1513
1514 sourceElement += source.bytes;
1515 destinationElement += destination.bytes;
1516 }
1517
1518 sourceRow += source.pitchB;
1519 destinationRow += destination.pitchB;
1520 }
1521
1522 sourceSlice += source.sliceB;
1523 destinationSlice += destination.sliceB;
1524 }
1525 }
1526
1527 void Surface::decodeP8(Buffer &destination, const Buffer &source)
1528 {
1529 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1530 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1531
1532 for(int z = 0; z < destination.depth && z < source.depth; z++)
1533 {
1534 unsigned char *sourceRow = sourceSlice;
1535 unsigned char *destinationRow = destinationSlice;
1536
1537 for(int y = 0; y < destination.height && y < source.height; y++)
1538 {
1539 unsigned char *sourceElement = sourceRow;
1540 unsigned char *destinationElement = destinationRow;
1541
1542 for(int x = 0; x < destination.width && x < source.width; x++)
1543 {
1544 unsigned int abgr = palette[*(unsigned char*)sourceElement];
1545
1546 unsigned int r = (abgr & 0x000000FF) << 16;
1547 unsigned int g = (abgr & 0x0000FF00) << 0;
1548 unsigned int b = (abgr & 0x00FF0000) >> 16;
1549 unsigned int a = (abgr & 0xFF000000) >> 0;
1550
1551 *(unsigned int*)destinationElement = a | r | g | b;
1552
1553 sourceElement += source.bytes;
1554 destinationElement += destination.bytes;
1555 }
1556
1557 sourceRow += source.pitchB;
1558 destinationRow += destination.pitchB;
1559 }
1560
1561 sourceSlice += source.sliceB;
1562 destinationSlice += destination.sliceB;
1563 }
1564 }
1565
1566#if S3TC_SUPPORT
1567 void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
1568 {
1569 unsigned int *destSlice = (unsigned int*)internal.buffer;
Nicolas Capens22658242014-11-29 00:31:41 -05001570 const DXT1 *source = (const DXT1*)external.buffer;
John Bauman89401822014-05-06 15:04:28 -04001571
1572 for(int z = 0; z < external.depth; z++)
1573 {
1574 unsigned int *dest = destSlice;
1575
1576 for(int y = 0; y < external.height; y += 4)
1577 {
1578 for(int x = 0; x < external.width; x += 4)
1579 {
1580 Color<byte> c[4];
1581
1582 c[0] = source->c0;
1583 c[1] = source->c1;
1584
1585 if(source->c0 > source->c1) // No transparency
1586 {
1587 // c2 = 2 / 3 * c0 + 1 / 3 * c1
1588 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1589 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1590 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1591 c[2].a = 0xFF;
1592
1593 // c3 = 1 / 3 * c0 + 2 / 3 * c1
1594 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1595 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1596 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1597 c[3].a = 0xFF;
1598 }
1599 else // c3 transparent
1600 {
1601 // c2 = 1 / 2 * c0 + 1 / 2 * c1
1602 c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
1603 c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
1604 c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
1605 c[2].a = 0xFF;
1606
1607 c[3].r = 0;
1608 c[3].g = 0;
1609 c[3].b = 0;
1610 c[3].a = 0;
1611 }
1612
1613 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1614 {
1615 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1616 {
1617 dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
1618 }
1619 }
1620
1621 source++;
1622 }
1623 }
1624
1625 (byte*&)destSlice += internal.sliceB;
1626 }
1627 }
1628
1629 void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
1630 {
1631 unsigned int *destSlice = (unsigned int*)internal.buffer;
Nicolas Capens22658242014-11-29 00:31:41 -05001632 const DXT3 *source = (const DXT3*)external.buffer;
John Bauman89401822014-05-06 15:04:28 -04001633
1634 for(int z = 0; z < external.depth; z++)
1635 {
1636 unsigned int *dest = destSlice;
1637
1638 for(int y = 0; y < external.height; y += 4)
1639 {
1640 for(int x = 0; x < external.width; x += 4)
1641 {
1642 Color<byte> c[4];
1643
1644 c[0] = source->c0;
1645 c[1] = source->c1;
1646
1647 // c2 = 2 / 3 * c0 + 1 / 3 * c1
1648 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1649 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1650 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1651
1652 // c3 = 1 / 3 * c0 + 2 / 3 * c1
1653 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1654 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1655 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1656
1657 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1658 {
1659 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1660 {
1661 unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
1662 unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
1663
1664 dest[(x + i) + (y + j) * internal.width] = color;
1665 }
1666 }
1667
1668 source++;
1669 }
1670 }
1671
1672 (byte*&)destSlice += internal.sliceB;
1673 }
1674 }
1675
1676 void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
1677 {
1678 unsigned int *destSlice = (unsigned int*)internal.buffer;
Nicolas Capens22658242014-11-29 00:31:41 -05001679 const DXT5 *source = (const DXT5*)external.buffer;
John Bauman89401822014-05-06 15:04:28 -04001680
1681 for(int z = 0; z < external.depth; z++)
1682 {
1683 unsigned int *dest = destSlice;
1684
1685 for(int y = 0; y < external.height; y += 4)
1686 {
1687 for(int x = 0; x < external.width; x += 4)
1688 {
1689 Color<byte> c[4];
1690
1691 c[0] = source->c0;
1692 c[1] = source->c1;
1693
1694 // c2 = 2 / 3 * c0 + 1 / 3 * c1
1695 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1696 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1697 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1698
1699 // c3 = 1 / 3 * c0 + 2 / 3 * c1
1700 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1701 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1702 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1703
1704 byte a[8];
1705
1706 a[0] = source->a0;
1707 a[1] = source->a1;
1708
1709 if(a[0] > a[1])
1710 {
1711 a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
1712 a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
1713 a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
1714 a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
1715 a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
1716 a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
1717 }
1718 else
1719 {
1720 a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
1721 a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
1722 a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
1723 a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
1724 a[6] = 0;
1725 a[7] = 0xFF;
1726 }
1727
1728 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1729 {
1730 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1731 {
1732 unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
1733 unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
1734
1735 dest[(x + i) + (y + j) * internal.width] = color;
1736 }
1737 }
1738
1739 source++;
1740 }
1741 }
1742
1743 (byte*&)destSlice += internal.sliceB;
1744 }
1745 }
Nicolas Capens22658242014-11-29 00:31:41 -05001746#endif
John Bauman89401822014-05-06 15:04:28 -04001747
1748 void Surface::decodeATI1(Buffer &internal, const Buffer &external)
1749 {
1750 byte *destSlice = (byte*)internal.buffer;
Nicolas Capens22658242014-11-29 00:31:41 -05001751 const ATI1 *source = (const ATI1*)external.buffer;
John Bauman89401822014-05-06 15:04:28 -04001752
1753 for(int z = 0; z < external.depth; z++)
1754 {
1755 byte *dest = destSlice;
1756
1757 for(int y = 0; y < external.height; y += 4)
1758 {
1759 for(int x = 0; x < external.width; x += 4)
1760 {
1761 byte r[8];
1762
1763 r[0] = source->r0;
1764 r[1] = source->r1;
1765
1766 if(r[0] > r[1])
1767 {
1768 r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
1769 r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
1770 r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
1771 r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
1772 r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
1773 r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
1774 }
1775 else
1776 {
1777 r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
1778 r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
1779 r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
1780 r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
1781 r[6] = 0;
1782 r[7] = 0xFF;
1783 }
1784
1785 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1786 {
1787 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1788 {
1789 dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
1790 }
1791 }
1792
1793 source++;
1794 }
1795 }
1796
1797 destSlice += internal.sliceB;
1798 }
1799 }
1800
1801 void Surface::decodeATI2(Buffer &internal, const Buffer &external)
1802 {
1803 word *destSlice = (word*)internal.buffer;
Nicolas Capens22658242014-11-29 00:31:41 -05001804 const ATI2 *source = (const ATI2*)external.buffer;
John Bauman89401822014-05-06 15:04:28 -04001805
1806 for(int z = 0; z < external.depth; z++)
1807 {
1808 word *dest = destSlice;
1809
1810 for(int y = 0; y < external.height; y += 4)
1811 {
1812 for(int x = 0; x < external.width; x += 4)
1813 {
1814 byte X[8];
1815
1816 X[0] = source->x0;
1817 X[1] = source->x1;
1818
1819 if(X[0] > X[1])
1820 {
1821 X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
1822 X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
1823 X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
1824 X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
1825 X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
1826 X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
1827 }
1828 else
1829 {
1830 X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
1831 X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
1832 X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
1833 X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
1834 X[6] = 0;
1835 X[7] = 0xFF;
1836 }
1837
1838 byte Y[8];
1839
1840 Y[0] = source->y0;
1841 Y[1] = source->y1;
1842
1843 if(Y[0] > Y[1])
1844 {
1845 Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
1846 Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
1847 Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
1848 Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
1849 Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
1850 Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
1851 }
1852 else
1853 {
1854 Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
1855 Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
1856 Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
1857 Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
1858 Y[6] = 0;
1859 Y[7] = 0xFF;
1860 }
1861
1862 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1863 {
1864 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1865 {
1866 word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
1867 word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
1868
1869 dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
1870 }
1871 }
1872
1873 source++;
1874 }
1875 }
1876
1877 (byte*&)destSlice += internal.sliceB;
1878 }
1879 }
Nicolas Capens22658242014-11-29 00:31:41 -05001880
1881 struct bgrx8
1882 {
1883 byte b;
1884 byte g;
1885 byte r;
1886 byte x;
1887
1888 inline bgrx8()
1889 {
1890 }
1891
1892 inline void set(int red, int green, int blue)
1893 {
1894 r = static_cast<byte>(clamp(red, 0, 255));
1895 g = static_cast<byte>(clamp(green, 0, 255));
1896 b = static_cast<byte>(clamp(blue, 0, 255));
1897 x = 255;
1898 }
1899 };
1900
1901 struct ETC1
1902 {
1903 struct
1904 {
1905 union
1906 {
1907 struct // Individual colors
1908 {
1909 byte R2 : 4;
1910 byte R1 : 4;
1911 byte G2 : 4;
1912 byte G1 : 4;
1913 byte B2 : 4;
1914 byte B1 : 4;
1915 };
1916
1917 struct // Differential colors
1918 {
1919 sbyte dR : 3;
1920 byte R : 5;
1921 sbyte dG : 3;
1922 byte G : 5;
1923 sbyte dB : 3;
1924 byte B : 5;
1925 };
1926 };
1927
1928 bool flipbit : 1;
1929 bool diffbit : 1;
1930 byte cw2 : 3;
1931 byte cw1 : 3;
1932
1933 byte pixelIndexMSB[2];
1934 byte pixelIndexLSB[2];
1935 };
1936
1937 inline int getIndex(int x, int y) const
1938 {
1939 int bitIndex = x * 4 + y;
1940 int bitOffset = bitIndex & 7;
1941 int lsb = (pixelIndexLSB[1 - (bitIndex >> 3)] >> bitOffset) & 1;
1942 int msb = (pixelIndexMSB[1 - (bitIndex >> 3)] >> bitOffset) & 1;
1943
1944 return (msb << 1) | lsb;
1945 }
1946 };
1947
// Expands a 4-bit channel value to 8 bits by copying the nibble into both halves of the byte.
inline int extend_4to8bits(int x)
{
	const int highNibble = x << 4;

	return highNibble | x;
}
1952
// Expands a 5-bit channel value to 8 bits: the value fills the top five bits and its
// three most significant bits are repeated in the bottom three.
inline int extend_5to8bits(int x)
{
	const int upper = x << 3;
	const int lower = x >> 2;

	return upper | lower;
}
1957
1958 void Surface::decodeETC1(Buffer &internal, const Buffer &external)
1959 {
1960 unsigned int *destSlice = (unsigned int*)internal.buffer;
1961 const ETC1 *source = (const ETC1*)external.buffer;
1962
1963 for(int z = 0; z < external.depth; z++)
1964 {
1965 unsigned int *dest = destSlice;
1966
1967 for(int y = 0; y < external.height; y += 4)
1968 {
1969 for(int x = 0; x < external.width; x += 4)
1970 {
1971 bgrx8 *color = reinterpret_cast<bgrx8*>(&dest[x + y * internal.width]);
1972
1973 int r1, g1, b1;
1974 int r2, g2, b2;
1975
1976 if(source->diffbit)
1977 {
1978 b1 = extend_5to8bits(source->B);
1979 g1 = extend_5to8bits(source->G);
1980 r1 = extend_5to8bits(source->R);
1981
1982 r2 = extend_5to8bits(source->R + source->dR);
1983 g2 = extend_5to8bits(source->G + source->dG);
1984 b2 = extend_5to8bits(source->B + source->dB);
1985 }
1986 else
1987 {
1988 r1 = extend_4to8bits(source->R1);
1989 g1 = extend_4to8bits(source->G1);
1990 b1 = extend_4to8bits(source->B1);
1991
1992 r2 = extend_4to8bits(source->R2);
1993 g2 = extend_4to8bits(source->G2);
1994 b2 = extend_4to8bits(source->B2);
1995 }
1996
1997 bgrx8 subblockColors0[4];
1998 bgrx8 subblockColors1[4];
1999
2000 // Table 3.17.2 sorted according to table 3.17.3
2001 static const int intensityModifier[8][4] =
2002 {
2003 {2, 8, -2, -8},
2004 {5, 17, -5, -17},
2005 {9, 29, -9, -29},
2006 {13, 42, -13, -42},
2007 {18, 60, -18, -60},
2008 {24, 80, -24, -80},
2009 {33, 106, -33, -106},
2010 {47, 183, -47, -183}
2011 };
2012
2013 const int i10 = intensityModifier[source->cw1][0];
2014 const int i11 = intensityModifier[source->cw1][1];
2015 const int i12 = intensityModifier[source->cw1][2];
2016 const int i13 = intensityModifier[source->cw1][3];
2017
2018 subblockColors0[0].set(r1 + i10, g1 + i10, b1 + i10);
2019 subblockColors0[1].set(r1 + i11, g1 + i11, b1 + i11);
2020 subblockColors0[2].set(r1 + i12, g1 + i12, b1 + i12);
2021 subblockColors0[3].set(r1 + i13, g1 + i13, b1 + i13);
2022
2023 const int i20 = intensityModifier[source->cw2][0];
2024 const int i21 = intensityModifier[source->cw2][1];
2025 const int i22 = intensityModifier[source->cw2][2];
2026 const int i23 = intensityModifier[source->cw2][3];
2027
2028 subblockColors1[0].set(r2 + i20, g2 + i20, b2 + i20);
2029 subblockColors1[1].set(r2 + i21, g2 + i21, b2 + i21);
2030 subblockColors1[2].set(r2 + i22, g2 + i22, b2 + i22);
2031 subblockColors1[3].set(r2 + i23, g2 + i23, b2 + i23);
2032
2033 if(source->flipbit)
2034 {
2035 for(int y = 0; y < 2; y++)
2036 {
2037 color[0] = subblockColors0[source->getIndex(0, y)];
2038 color[1] = subblockColors0[source->getIndex(1, y)];
2039 color[2] = subblockColors0[source->getIndex(2, y)];
2040 color[3] = subblockColors0[source->getIndex(3, y)];
2041 color += internal.width;
2042 }
2043
2044 for(int y = 2; y < 4; y++)
2045 {
2046 color[0] = subblockColors1[source->getIndex(0, y)];
2047 color[1] = subblockColors1[source->getIndex(1, y)];
2048 color[2] = subblockColors1[source->getIndex(2, y)];
2049 color[3] = subblockColors1[source->getIndex(3, y)];
2050 color += internal.width;
2051 }
2052 }
2053 else
2054 {
2055 for(int y = 0; y < 4; y++)
2056 {
2057 color[0] = subblockColors0[source->getIndex(0, y)];
2058 color[1] = subblockColors0[source->getIndex(1, y)];
2059 color[2] = subblockColors1[source->getIndex(2, y)];
2060 color[3] = subblockColors1[source->getIndex(3, y)];
2061 color += internal.width;
2062 }
2063 }
2064
2065 source++;
2066 }
2067 }
2068
2069 (byte*&)destSlice += internal.sliceB;
2070 }
2071 }
John Bauman89401822014-05-06 15:04:28 -04002072
2073 unsigned int Surface::size(int width, int height, int depth, Format format)
2074 {
2075 // Dimensions rounded up to multiples of 4, used for DXTC formats
2076 int width4 = (width + 3) & ~3;
2077 int height4 = (height + 3) & ~3;
2078
2079 switch(format)
2080 {
2081 #if S3TC_SUPPORT
2082 case FORMAT_DXT1:
John Bauman66b8ab22014-05-06 15:57:45 -04002083 #endif
John Bauman89401822014-05-06 15:04:28 -04002084 case FORMAT_ATI1:
Nicolas Capens22658242014-11-29 00:31:41 -05002085 case FORMAT_ETC1:
John Bauman89401822014-05-06 15:04:28 -04002086 return width4 * height4 * depth / 2;
John Bauman66b8ab22014-05-06 15:57:45 -04002087 #if S3TC_SUPPORT
John Bauman89401822014-05-06 15:04:28 -04002088 case FORMAT_DXT3:
2089 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04002090 #endif
John Bauman89401822014-05-06 15:04:28 -04002091 case FORMAT_ATI2:
2092 return width4 * height4 * depth;
John Bauman89401822014-05-06 15:04:28 -04002093 default:
2094 return bytes(format) * width * height * depth;
2095 }
2096
2097 return 0;
2098 }
2099
2100 bool Surface::isStencil(Format format)
2101 {
2102 switch(format)
2103 {
2104 case FORMAT_D32:
2105 case FORMAT_D16:
2106 case FORMAT_D24X8:
2107 case FORMAT_D32F:
2108 case FORMAT_D32F_COMPLEMENTARY:
2109 case FORMAT_D32F_LOCKABLE:
2110 return false;
2111 case FORMAT_D24S8:
2112 case FORMAT_D24FS8:
2113 case FORMAT_S8:
John Bauman66b8ab22014-05-06 15:57:45 -04002114 case FORMAT_DF24S8:
2115 case FORMAT_DF16S8:
2116 case FORMAT_D32FS8_TEXTURE:
2117 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04002118 case FORMAT_INTZ:
2119 return true;
2120 default:
2121 return false;
2122 }
2123 }
2124
2125 bool Surface::isDepth(Format format)
2126 {
2127 switch(format)
2128 {
2129 case FORMAT_D32:
2130 case FORMAT_D16:
2131 case FORMAT_D24X8:
2132 case FORMAT_D24S8:
2133 case FORMAT_D24FS8:
2134 case FORMAT_D32F:
2135 case FORMAT_D32F_COMPLEMENTARY:
2136 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -04002137 case FORMAT_DF24S8:
2138 case FORMAT_DF16S8:
2139 case FORMAT_D32FS8_TEXTURE:
2140 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04002141 case FORMAT_INTZ:
2142 return true;
2143 case FORMAT_S8:
2144 return false;
2145 default:
2146 return false;
2147 }
2148 }
2149
2150 bool Surface::isPalette(Format format)
2151 {
2152 switch(format)
2153 {
2154 case FORMAT_P8:
2155 case FORMAT_A8P8:
2156 return true;
2157 default:
2158 return false;
2159 }
2160 }
2161
2162 bool Surface::isFloatFormat(Format format)
2163 {
2164 switch(format)
2165 {
2166 case FORMAT_X8R8G8B8:
Nicolas Capensef77ac12015-03-28 21:48:51 -04002167 case FORMAT_X8B8G8R8:
John Bauman89401822014-05-06 15:04:28 -04002168 case FORMAT_A8R8G8B8:
Nicolas Capensef77ac12015-03-28 21:48:51 -04002169 case FORMAT_A8B8G8R8:
John Bauman89401822014-05-06 15:04:28 -04002170 case FORMAT_G8R8:
2171 case FORMAT_G16R16:
2172 case FORMAT_A16B16G16R16:
2173 case FORMAT_V8U8:
2174 case FORMAT_Q8W8V8U8:
2175 case FORMAT_X8L8V8U8:
2176 case FORMAT_V16U16:
2177 case FORMAT_A16W16V16U16:
2178 case FORMAT_Q16W16V16U16:
2179 case FORMAT_A8:
2180 case FORMAT_R8:
2181 case FORMAT_L8:
2182 case FORMAT_L16:
2183 case FORMAT_A8L8:
2184 return false;
2185 case FORMAT_R32F:
2186 case FORMAT_G32R32F:
2187 case FORMAT_A32B32G32R32F:
2188 case FORMAT_D32F:
2189 case FORMAT_D32F_COMPLEMENTARY:
2190 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -04002191 case FORMAT_D32FS8_TEXTURE:
2192 case FORMAT_D32FS8_SHADOW:
Nicolas Capens80594422015-06-09 16:42:56 -04002193 case FORMAT_L16F:
2194 case FORMAT_A16L16F:
2195 case FORMAT_L32F:
2196 case FORMAT_A32L32F:
John Bauman89401822014-05-06 15:04:28 -04002197 return true;
2198 default:
2199 ASSERT(false);
2200 }
2201
2202 return false;
2203 }
2204
2205 bool Surface::isUnsignedComponent(Format format, int component)
2206 {
2207 switch(format)
2208 {
2209 case FORMAT_NULL:
2210 case FORMAT_X8R8G8B8:
Nicolas Capensef77ac12015-03-28 21:48:51 -04002211 case FORMAT_X8B8G8R8:
John Bauman89401822014-05-06 15:04:28 -04002212 case FORMAT_A8R8G8B8:
Nicolas Capensef77ac12015-03-28 21:48:51 -04002213 case FORMAT_A8B8G8R8:
John Bauman89401822014-05-06 15:04:28 -04002214 case FORMAT_G8R8:
2215 case FORMAT_G16R16:
2216 case FORMAT_A16B16G16R16:
2217 case FORMAT_D32F:
2218 case FORMAT_D32F_COMPLEMENTARY:
2219 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -04002220 case FORMAT_D32FS8_TEXTURE:
2221 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04002222 case FORMAT_A8:
2223 case FORMAT_R8:
2224 case FORMAT_L8:
2225 case FORMAT_L16:
2226 case FORMAT_A8L8:
2227 return true;
2228 case FORMAT_V8U8:
2229 case FORMAT_X8L8V8U8:
2230 case FORMAT_V16U16:
2231 if(component < 2)
2232 {
2233 return false;
2234 }
2235 else
2236 {
2237 return true;
2238 }
2239 case FORMAT_A16W16V16U16:
2240 if(component < 3)
2241 {
2242 return false;
2243 }
2244 else
2245 {
2246 return true;
2247 }
2248 case FORMAT_Q8W8V8U8:
2249 case FORMAT_Q16W16V16U16:
2250 return false;
2251 case FORMAT_R32F:
2252 if(component < 1)
2253 {
2254 return false;
2255 }
2256 else
2257 {
2258 return true;
2259 }
2260 case FORMAT_G32R32F:
2261 if(component < 2)
2262 {
2263 return false;
2264 }
2265 else
2266 {
2267 return true;
2268 }
2269 case FORMAT_A32B32G32R32F:
2270 return false;
2271 default:
2272 ASSERT(false);
2273 }
2274
2275 return false;
2276 }
2277
2278 bool Surface::isSRGBreadable(Format format)
2279 {
2280 // Keep in sync with Capabilities::isSRGBreadable
2281 switch(format)
2282 {
2283 case FORMAT_L8:
2284 case FORMAT_A8L8:
2285 case FORMAT_R8G8B8:
2286 case FORMAT_A8R8G8B8:
2287 case FORMAT_X8R8G8B8:
2288 case FORMAT_A8B8G8R8:
2289 case FORMAT_X8B8G8R8:
2290 case FORMAT_R5G6B5:
2291 case FORMAT_X1R5G5B5:
2292 case FORMAT_A1R5G5B5:
2293 case FORMAT_A4R4G4B4:
2294 #if S3TC_SUPPORT
2295 case FORMAT_DXT1:
2296 case FORMAT_DXT3:
2297 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04002298 #endif
John Bauman89401822014-05-06 15:04:28 -04002299 case FORMAT_ATI1:
2300 case FORMAT_ATI2:
John Bauman89401822014-05-06 15:04:28 -04002301 return true;
2302 default:
2303 return false;
2304 }
2305
2306 return false;
2307 }
2308
2309 bool Surface::isSRGBwritable(Format format)
2310 {
2311 // Keep in sync with Capabilities::isSRGBwritable
2312 switch(format)
2313 {
2314 case FORMAT_NULL:
2315 case FORMAT_A8R8G8B8:
2316 case FORMAT_X8R8G8B8:
2317 case FORMAT_A8B8G8R8:
2318 case FORMAT_X8B8G8R8:
2319 case FORMAT_R5G6B5:
2320 return true;
2321 default:
2322 return false;
2323 }
2324 }
2325
2326 bool Surface::isCompressed(Format format)
2327 {
2328 switch(format)
2329 {
2330 #if S3TC_SUPPORT
2331 case FORMAT_DXT1:
2332 case FORMAT_DXT3:
2333 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04002334 #endif
John Bauman89401822014-05-06 15:04:28 -04002335 case FORMAT_ATI1:
2336 case FORMAT_ATI2:
Nicolas Capens22658242014-11-29 00:31:41 -05002337 case FORMAT_ETC1:
John Bauman89401822014-05-06 15:04:28 -04002338 return true;
John Bauman89401822014-05-06 15:04:28 -04002339 default:
2340 return false;
2341 }
2342 }
2343
2344 int Surface::componentCount(Format format)
2345 {
2346 switch(format)
2347 {
2348 case FORMAT_X8R8G8B8: return 3;
Nicolas Capensef77ac12015-03-28 21:48:51 -04002349 case FORMAT_X8B8G8R8: return 3;
John Bauman89401822014-05-06 15:04:28 -04002350 case FORMAT_A8R8G8B8: return 4;
Nicolas Capensef77ac12015-03-28 21:48:51 -04002351 case FORMAT_A8B8G8R8: return 4;
John Bauman89401822014-05-06 15:04:28 -04002352 case FORMAT_G8R8: return 2;
2353 case FORMAT_G16R16: return 2;
2354 case FORMAT_A16B16G16R16: return 4;
2355 case FORMAT_V8U8: return 2;
2356 case FORMAT_Q8W8V8U8: return 4;
2357 case FORMAT_X8L8V8U8: return 3;
2358 case FORMAT_V16U16: return 2;
2359 case FORMAT_A16W16V16U16: return 4;
2360 case FORMAT_Q16W16V16U16: return 4;
2361 case FORMAT_R32F: return 1;
2362 case FORMAT_G32R32F: return 2;
2363 case FORMAT_A32B32G32R32F: return 4;
2364 case FORMAT_D32F_LOCKABLE: return 1;
John Bauman66b8ab22014-05-06 15:57:45 -04002365 case FORMAT_D32FS8_TEXTURE: return 1;
2366 case FORMAT_D32FS8_SHADOW: return 1;
John Bauman89401822014-05-06 15:04:28 -04002367 case FORMAT_A8: return 1;
2368 case FORMAT_R8: return 1;
2369 case FORMAT_L8: return 1;
2370 case FORMAT_L16: return 1;
2371 case FORMAT_A8L8: return 2;
2372 default:
2373 ASSERT(false);
2374 }
2375
2376 return 1;
2377 }
2378
2379 void *Surface::allocateBuffer(int width, int height, int depth, Format format)
2380 {
2381 int width4 = (width + 3) & ~3;
2382 int height4 = (height + 3) & ~3;
2383
John Baumand4ae8632014-05-06 16:18:33 -04002384 return allocateZero(size(width4, height4, depth, format));
John Bauman89401822014-05-06 15:04:28 -04002385 }
2386
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002387 void Surface::memfill4(void *buffer, int pattern, int bytes)
John Bauman89401822014-05-06 15:04:28 -04002388 {
2389 while((size_t)buffer & 0x1 && bytes >= 1)
2390 {
2391 *(char*)buffer = (char)pattern;
2392 (char*&)buffer += 1;
2393 bytes -= 1;
2394 }
2395
2396 while((size_t)buffer & 0x3 && bytes >= 2)
2397 {
2398 *(short*)buffer = (short)pattern;
2399 (short*&)buffer += 1;
2400 bytes -= 2;
2401 }
2402
2403 if(CPUID::supportsSSE())
2404 {
2405 while((size_t)buffer & 0xF && bytes >= 4)
2406 {
2407 *(int*)buffer = pattern;
2408 (int*&)buffer += 1;
2409 bytes -= 4;
2410 }
2411
2412 __m128 quad = _mm_set_ps1((float&)pattern);
2413
2414 float *pointer = (float*)buffer;
2415 int qxwords = bytes / 64;
2416 bytes -= qxwords * 64;
2417
2418 while(qxwords--)
2419 {
2420 _mm_stream_ps(pointer + 0, quad);
2421 _mm_stream_ps(pointer + 4, quad);
2422 _mm_stream_ps(pointer + 8, quad);
2423 _mm_stream_ps(pointer + 12, quad);
2424
2425 pointer += 16;
2426 }
2427
2428 buffer = pointer;
2429 }
2430
2431 while(bytes >= 4)
2432 {
2433 *(int*)buffer = (int)pattern;
2434 (int*&)buffer += 1;
2435 bytes -= 4;
2436 }
2437
2438 while(bytes >= 2)
2439 {
2440 *(short*)buffer = (short)pattern;
2441 (short*&)buffer += 1;
2442 bytes -= 2;
2443 }
2444
2445 while(bytes >= 1)
2446 {
2447 *(char*)buffer = (char)pattern;
2448 (char*&)buffer += 1;
2449 bytes -= 1;
2450 }
2451 }
2452
Nicolas Capensef77ac12015-03-28 21:48:51 -04002453 void Surface::clearColorBuffer(unsigned int colorARGB, unsigned int rgbaMask, int x0, int y0, int width, int height)
John Bauman89401822014-05-06 15:04:28 -04002454 {
2455 // FIXME: Also clear buffers in other formats?
2456
2457 // Not overlapping
2458 if(x0 > internal.width) return;
2459 if(y0 > internal.height) return;
2460 if(x0 + width < 0) return;
2461 if(y0 + height < 0) return;
2462
2463 // Clip against dimensions
2464 if(x0 < 0) {width += x0; x0 = 0;}
2465 if(x0 + width > internal.width) width = internal.width - x0;
2466 if(y0 < 0) {height += y0; y0 = 0;}
2467 if(y0 + height > internal.height) height = internal.height - y0;
2468
2469 const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2470 const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2471
2472 int width2 = (internal.width + 1) & ~1;
2473
2474 int x1 = x0 + width;
2475 int y1 = y0 + height;
2476
2477 int bytes = 4 * (x1 - x0);
2478
2479 // if(lockable || !quadLayoutEnabled)
2480 {
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002481 unsigned char *buffer = (unsigned char*)lockInternal(x0, y0, 0, lock, PUBLIC);
John Bauman89401822014-05-06 15:04:28 -04002482
2483 for(int z = 0; z < internal.depth; z++)
2484 {
2485 unsigned char *target = buffer;
2486
2487 for(int y = y0; y < y1; y++)
2488 {
2489 switch(internal.format)
2490 {
2491 case FORMAT_NULL:
2492 break;
2493 case FORMAT_X8R8G8B8:
2494 case FORMAT_A8R8G8B8:
2495 // case FORMAT_X8G8R8B8Q: // FIXME
2496 // case FORMAT_A8G8R8B8Q: // FIXME
John Bauman19bac1e2014-05-06 15:23:49 -04002497 if(rgbaMask == 0xF || (internal.format == FORMAT_X8R8G8B8 && rgbaMask == 0x7))
John Bauman89401822014-05-06 15:04:28 -04002498 {
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002499 memfill4(target, colorARGB, 4 * (x1 - x0));
John Bauman89401822014-05-06 15:04:28 -04002500 }
2501 else
2502 {
2503 unsigned int bgraMask = (rgbaMask & 0x1 ? 0x00FF0000 : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x000000FF : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
2504 unsigned int invMask = ~bgraMask;
Nicolas Capensef77ac12015-03-28 21:48:51 -04002505 unsigned int maskedColor = colorARGB & bgraMask;
John Bauman89401822014-05-06 15:04:28 -04002506 unsigned int *target32 = (unsigned int*)target;
2507
2508 for(int x = 0; x < width; x++)
2509 {
2510 target32[x] = maskedColor | (target32[x] & invMask);
2511 }
2512 }
2513 break;
Nicolas Capensef77ac12015-03-28 21:48:51 -04002514 case FORMAT_X8B8G8R8:
2515 case FORMAT_A8B8G8R8:
Nicolas Capensef77ac12015-03-28 21:48:51 -04002516 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002517 unsigned char r8 = (colorARGB & 0x00FF0000) >> 16;
2518 unsigned char g8 = (colorARGB & 0x0000FF00) >> 8;
2519 unsigned char b8 = (colorARGB & 0x000000FF) >> 0;
2520 unsigned char a8 = (colorARGB & 0xFF000000) >> 24;
2521 unsigned char a8b8g8r8[4] = {r8, g8, b8, a8};
2522 unsigned int colorABGR = (unsigned int&)a8b8g8r8;
Nicolas Capensef77ac12015-03-28 21:48:51 -04002523
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002524 if(rgbaMask == 0xF || (internal.format == FORMAT_X8B8G8R8 && rgbaMask == 0x7))
Nicolas Capensef77ac12015-03-28 21:48:51 -04002525 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002526 memfill4(target, colorABGR, 4 * (x1 - x0));
2527 }
2528 else
2529 {
2530 unsigned int rgbaMask32 = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x00FF0000 : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
2531 unsigned int invMask = ~rgbaMask32;
2532 unsigned int maskedColor = colorABGR & rgbaMask32;
2533 unsigned int *target32 = (unsigned int*)target;
2534
2535 for(int x = 0; x < width; x++)
2536 {
2537 target32[x] = maskedColor | (target32[x] & invMask);
2538 }
Nicolas Capensef77ac12015-03-28 21:48:51 -04002539 }
2540 }
2541 break;
John Bauman89401822014-05-06 15:04:28 -04002542 case FORMAT_G8R8:
John Bauman89401822014-05-06 15:04:28 -04002543 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002544 unsigned char r8 = (colorARGB & 0x00FF0000) >> 16;
2545 unsigned char g8 = (colorARGB & 0x0000FF00) >> 8;
2546 unsigned char g8r8[4] = {r8, g8, r8, g8};
John Bauman89401822014-05-06 15:04:28 -04002547
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002548 if((rgbaMask & 0x3) == 0x3)
John Bauman89401822014-05-06 15:04:28 -04002549 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002550 memfill4(target, (int&)g8r8, 2 * (x1 - x0));
2551 }
2552 else
2553 {
2554 unsigned short rgMask = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0);
2555 unsigned short invMask = ~rgMask;
2556 unsigned short maskedColor = (unsigned short&)g8r8 & rgMask;
2557 unsigned short *target16 = (unsigned short*)target;
2558
2559 for(int x = 0; x < width; x++)
2560 {
2561 target16[x] = maskedColor | (target16[x] & invMask);
2562 }
John Bauman89401822014-05-06 15:04:28 -04002563 }
2564 }
2565 break;
2566 case FORMAT_G16R16:
John Bauman89401822014-05-06 15:04:28 -04002567 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002568 unsigned char r8 = (colorARGB & 0x00FF0000) >> 16;
2569 unsigned char g8 = (colorARGB & 0x0000FF00) >> 8;
2570 unsigned short r16 = (r8 << 8) | r8;
2571 unsigned short g16 = (g8 << 8) | g8;
2572 unsigned short g16r16[2] = {r16, g16};
John Bauman89401822014-05-06 15:04:28 -04002573
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002574 if((rgbaMask & 0x3) == 0x3)
John Bauman89401822014-05-06 15:04:28 -04002575 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002576 memfill4(target, (int&)g16r16, 4 * (x1 - x0));
2577 }
2578 else
2579 {
2580 unsigned int rgMask = (rgbaMask & 0x1 ? 0x0000FFFF : 0) | (rgbaMask & 0x2 ? 0xFFFF0000 : 0);
2581 unsigned int invMask = ~rgMask;
2582 unsigned int maskedColor = (unsigned int&)g16r16 & rgMask;
2583 unsigned int *target32 = (unsigned int*)target;
2584
2585 for(int x = 0; x < width; x++)
2586 {
2587 target32[x] = maskedColor | (target32[x] & invMask);
2588 }
John Bauman89401822014-05-06 15:04:28 -04002589 }
2590 }
2591 break;
2592 case FORMAT_A16B16G16R16:
John Bauman89401822014-05-06 15:04:28 -04002593 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002594 unsigned char r8 = (colorARGB & 0x00FF0000) >> 16;
2595 unsigned char g8 = (colorARGB & 0x0000FF00) >> 8;
2596 unsigned char b8 = (colorARGB & 0x000000FF) >> 0;
2597 unsigned char a8 = (colorARGB & 0xFF000000) >> 24;
2598 unsigned short r16 = (r8 << 8) | r8;
2599 unsigned short g16 = (g8 << 8) | g8;
2600 unsigned short b16 = (b8 << 8) | b8;
2601 unsigned short a16 = (a8 << 8) | a8;
2602
2603 if(rgbaMask == 0xF)
John Bauman89401822014-05-06 15:04:28 -04002604 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002605 for(int x = 0; x < width; x++)
2606 {
2607 ((unsigned short*)target)[4 * x + 0] = r16;
2608 ((unsigned short*)target)[4 * x + 1] = g16;
2609 ((unsigned short*)target)[4 * x + 2] = b16;
2610 ((unsigned short*)target)[4 * x + 3] = a16;
2611 }
John Bauman89401822014-05-06 15:04:28 -04002612 }
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002613 else
2614 {
2615 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 0] = r16;
2616 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 1] = g16;
2617 if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 2] = b16;
2618 if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 3] = a16;
2619 }
John Bauman89401822014-05-06 15:04:28 -04002620 }
2621 break;
2622 case FORMAT_R32F:
2623 if(rgbaMask & 0x1)
2624 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002625 float r32f = (float)(colorARGB & 0x00FF0000) / 0x00FF0000;
2626
John Bauman89401822014-05-06 15:04:28 -04002627 for(int x = 0; x < width; x++)
2628 {
2629 ((float*)target)[x] = r32f;
2630 }
2631 }
2632 break;
2633 case FORMAT_G32R32F:
John Bauman89401822014-05-06 15:04:28 -04002634 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002635 float r32f = (float)(colorARGB & 0x00FF0000) / 0x00FF0000;
2636 float g32f = (float)(colorARGB & 0x0000FF00) / 0x0000FF00;
2637
2638 if((rgbaMask & 0x3) == 0x3)
John Bauman89401822014-05-06 15:04:28 -04002639 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002640 for(int x = 0; x < width; x++)
2641 {
2642 ((float*)target)[2 * x + 0] = r32f;
2643 ((float*)target)[2 * x + 1] = g32f;
2644 }
John Bauman89401822014-05-06 15:04:28 -04002645 }
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002646 else
2647 {
2648 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 0] = r32f;
2649 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 1] = g32f;
2650 }
John Bauman89401822014-05-06 15:04:28 -04002651 }
2652 break;
2653 case FORMAT_A32B32G32R32F:
John Bauman89401822014-05-06 15:04:28 -04002654 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002655 float r32f = (float)(colorARGB & 0x00FF0000) / 0x00FF0000;
2656 float g32f = (float)(colorARGB & 0x0000FF00) / 0x0000FF00;
2657 float b32f = (float)(colorARGB & 0x000000FF) / 0x000000FF;
2658 float a32f = (float)(colorARGB & 0xFF000000) / 0xFF000000;
2659
2660 if(rgbaMask == 0xF)
John Bauman89401822014-05-06 15:04:28 -04002661 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002662 for(int x = 0; x < width; x++)
2663 {
2664 ((float*)target)[4 * x + 0] = r32f;
2665 ((float*)target)[4 * x + 1] = g32f;
2666 ((float*)target)[4 * x + 2] = b32f;
2667 ((float*)target)[4 * x + 3] = a32f;
2668 }
John Bauman89401822014-05-06 15:04:28 -04002669 }
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002670 else
2671 {
2672 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 0] = r32f;
2673 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 1] = g32f;
2674 if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 2] = b32f;
2675 if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 3] = a32f;
2676 }
John Bauman89401822014-05-06 15:04:28 -04002677 }
2678 break;
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002679 case FORMAT_R5G6B5:
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002680 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002681 unsigned int r5g6b5 = ((colorARGB >> 8) & 0xF800) | ((colorARGB >> 5) & 0x07E0) | ((colorARGB >> 3) & 0x001F);
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002682
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002683 if((rgbaMask & 0x7) == 0x7)
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002684 {
Nicolas Capensd61d3a72015-05-26 10:56:05 -04002685 unsigned int r5g6b5r5g6b5 = r5g6b5 | (r5g6b5 << 16);
2686 memfill4(target, r5g6b5r5g6b5, 2 * (x1 - x0));
2687 }
2688 else
2689 {
2690 unsigned short rgbMask = (rgbaMask & 0x1 ? 0xF800 : 0) | (rgbaMask & 0x2 ? 0x07E0 : 0) | (rgbaMask & 0x3 ? 0x001F : 0);
2691 unsigned short invMask = ~rgbMask;
2692 unsigned short maskedColor = r5g6b5 & rgbMask;
2693 unsigned short *target16 = (unsigned short*)target;
2694
2695 for(int x = 0; x < width; x++)
2696 {
2697 target16[x] = maskedColor | (target16[x] & invMask);
2698 }
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002699 }
2700 }
2701 break;
John Bauman89401822014-05-06 15:04:28 -04002702 default:
2703 ASSERT(false);
2704 }
2705
2706 target += internal.pitchB;
2707 }
2708
2709 buffer += internal.sliceB;
2710 }
2711
2712 unlockInternal();
2713 }
2714 /* else
2715 {
2716 // unsigned char *target = (unsigned char*&)buffer;
2717 //
2718 // for(int y = y0; y < y1; y++)
2719 // {
2720 // for(int x = x0; x < x1; x++)
2721 // {
2722 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 0] = (color & 0x000000FF) >> 0;
2723 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 4] = (color & 0x00FF0000) >> 16;
2724 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 8] = (color & 0x0000FF00) >> 8;
2725 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2726 // }
2727 // }
2728
2729 unsigned char colorQ[16];
2730
2731 colorQ[0] = (color & 0x000000FF) >> 0;
2732 colorQ[1] = (color & 0x000000FF) >> 0;
2733 colorQ[2] = (color & 0x000000FF) >> 0;
2734 colorQ[3] = (color & 0x000000FF) >> 0;
2735 colorQ[4] = (color & 0x00FF0000) >> 16;
2736 colorQ[5] = (color & 0x00FF0000) >> 16;
2737 colorQ[6] = (color & 0x00FF0000) >> 16;
2738 colorQ[7] = (color & 0x00FF0000) >> 16;
2739 colorQ[8] = (color & 0x0000FF00) >> 8;
2740 colorQ[9] = (color & 0x0000FF00) >> 8;
2741 colorQ[10] = (color & 0x0000FF00) >> 8;
2742 colorQ[11] = (color & 0x0000FF00) >> 8;
2743 colorQ[12] = (color & 0xFF000000) >> 24;
2744 colorQ[13] = (color & 0xFF000000) >> 24;
2745 colorQ[14] = (color & 0xFF000000) >> 24;
2746 colorQ[15] = (color & 0xFF000000) >> 24;
2747
2748 for(int y = y0; y < y1; y++)
2749 {
2750 unsigned char *target = (unsigned char*)lockInternal(0, 0, 0, lock) + width2 * 4 * (y & ~1) + 2 * (y & 1); // FIXME: Unlock
2751
2752 if((y & 1) == 0 && y + 1 < y1) // Fill quad line at once
2753 {
2754 if((x0 & 1) != 0)
2755 {
2756 target[8 * (x0 & ~1) + 1 + 0] = (color & 0x000000FF) >> 0;
2757 target[8 * (x0 & ~1) + 1 + 4] = (color & 0x00FF0000) >> 16;
2758 target[8 * (x0 & ~1) + 1 + 8] = (color & 0x0000FF00) >> 8;
2759 target[8 * (x0 & ~1) + 1 + 12] = (color & 0xFF000000) >> 24;
2760
2761 target[8 * (x0 & ~1) + 3 + 0] = (color & 0x000000FF) >> 0;
2762 target[8 * (x0 & ~1) + 3 + 4] = (color & 0x00FF0000) >> 16;
2763 target[8 * (x0 & ~1) + 3 + 8] = (color & 0x0000FF00) >> 8;
2764 target[8 * (x0 & ~1) + 3 + 12] = (color & 0xFF000000) >> 24;
2765 }
2766
2767 __asm
2768 {
2769 movq mm0, colorQ+0
2770 movq mm1, colorQ+8
2771
2772 mov eax, x0
2773 add eax, 1
2774 and eax, 0xFFFFFFFE
2775 cmp eax, x1
2776 jge qEnd
2777
2778 mov edi, target
2779
2780 qLoop:
2781 movntq [edi+8*eax+0], mm0
2782 movntq [edi+8*eax+8], mm1
2783
2784 add eax, 2
2785 cmp eax, x1
2786 jl qLoop
2787 qEnd:
2788 emms
2789 }
2790
2791 if((x1 & 1) != 0)
2792 {
2793 target[8 * (x1 & ~1) + 0 + 0] = (color & 0x000000FF) >> 0;
2794 target[8 * (x1 & ~1) + 0 + 4] = (color & 0x00FF0000) >> 16;
2795 target[8 * (x1 & ~1) + 0 + 8] = (color & 0x0000FF00) >> 8;
2796 target[8 * (x1 & ~1) + 0 + 12] = (color & 0xFF000000) >> 24;
2797
2798 target[8 * (x1 & ~1) + 2 + 0] = (color & 0x000000FF) >> 0;
2799 target[8 * (x1 & ~1) + 2 + 4] = (color & 0x00FF0000) >> 16;
2800 target[8 * (x1 & ~1) + 2 + 8] = (color & 0x0000FF00) >> 8;
2801 target[8 * (x1 & ~1) + 2 + 12] = (color & 0xFF000000) >> 24;
2802 }
2803
2804 y++;
2805 }
2806 else
2807 {
2808 for(int x = x0; x < x1; x++)
2809 {
2810 target[8 * (x & ~1) + (x & 1) + 0] = (color & 0x000000FF) >> 0;
2811 target[8 * (x & ~1) + (x & 1) + 4] = (color & 0x00FF0000) >> 16;
2812 target[8 * (x & ~1) + (x & 1) + 8] = (color & 0x0000FF00) >> 8;
2813 target[8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2814 }
2815 }
2816 }
2817 }*/
2818 }
2819
2820 void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height)
2821 {
2822 // Not overlapping
2823 if(x0 > internal.width) return;
2824 if(y0 > internal.height) return;
2825 if(x0 + width < 0) return;
2826 if(y0 + height < 0) return;
2827
2828 // Clip against dimensions
2829 if(x0 < 0) {width += x0; x0 = 0;}
2830 if(x0 + width > internal.width) width = internal.width - x0;
2831 if(y0 < 0) {height += y0; y0 = 0;}
2832 if(y0 + height > internal.height) height = internal.height - y0;
2833
2834 const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2835 const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2836
2837 int width2 = (internal.width + 1) & ~1;
2838
2839 int x1 = x0 + width;
2840 int y1 = y0 + height;
2841
2842 if(internal.format == FORMAT_D32F_LOCKABLE ||
John Bauman66b8ab22014-05-06 15:57:45 -04002843 internal.format == FORMAT_D32FS8_TEXTURE ||
2844 internal.format == FORMAT_D32FS8_SHADOW)
John Bauman89401822014-05-06 15:04:28 -04002845 {
2846 float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
2847
2848 for(int z = 0; z < internal.depth; z++)
2849 {
2850 for(int y = y0; y < y1; y++)
2851 {
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002852 memfill4(target, (int&)depth, 4 * width);
John Bauman89401822014-05-06 15:04:28 -04002853 target += width2;
2854 }
2855 }
2856
2857 unlockInternal();
2858 }
2859 else // Quad layout
2860 {
2861 if(complementaryDepthBuffer)
2862 {
2863 depth = 1 - depth;
2864 }
2865
2866 float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
2867
2868 for(int z = 0; z < internal.depth; z++)
2869 {
2870 for(int y = y0; y < y1; y++)
2871 {
2872 float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
2873
2874 if((y & 1) == 0 && y + 1 < y1) // Fill quad line at once
2875 {
2876 if((x0 & 1) != 0)
2877 {
2878 target[(x0 & ~1) * 2 + 1] = depth;
2879 target[(x0 & ~1) * 2 + 3] = depth;
2880 }
2881
2882 // for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4)
2883 // {
2884 // target[x2 + 0] = depth;
2885 // target[x2 + 1] = depth;
2886 // target[x2 + 2] = depth;
2887 // target[x2 + 3] = depth;
2888 // }
2889
2890 // __asm
2891 // {
2892 // movss xmm0, depth
2893 // shufps xmm0, xmm0, 0x00
2894 //
2895 // mov eax, x0
2896 // add eax, 1
2897 // and eax, 0xFFFFFFFE
2898 // cmp eax, x1
2899 // jge qEnd
2900 //
2901 // mov edi, target
2902 //
2903 // qLoop:
2904 // movntps [edi+8*eax], xmm0
2905 //
2906 // add eax, 2
2907 // cmp eax, x1
2908 // jl qLoop
2909 // qEnd:
2910 // }
2911
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002912 memfill4(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1)));
John Bauman89401822014-05-06 15:04:28 -04002913
2914 if((x1 & 1) != 0)
2915 {
2916 target[(x1 & ~1) * 2 + 0] = depth;
2917 target[(x1 & ~1) * 2 + 2] = depth;
2918 }
2919
2920 y++;
2921 }
2922 else
2923 {
2924 for(int x = x0; x < x1; x++)
2925 {
2926 target[(x & ~1) * 2 + (x & 1)] = depth;
2927 }
2928 }
2929 }
2930
2931 buffer += internal.sliceP;
2932 }
2933
2934 unlockInternal();
2935 }
2936 }
2937
2938 void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
2939 {
2940 // Not overlapping
2941 if(x0 > internal.width) return;
2942 if(y0 > internal.height) return;
2943 if(x0 + width < 0) return;
2944 if(y0 + height < 0) return;
2945
2946 // Clip against dimensions
2947 if(x0 < 0) {width += x0; x0 = 0;}
2948 if(x0 + width > internal.width) width = internal.width - x0;
2949 if(y0 < 0) {height += y0; y0 = 0;}
2950 if(y0 + height > internal.height) height = internal.height - y0;
2951
2952 int width2 = (internal.width + 1) & ~1;
2953
2954 int x1 = x0 + width;
2955 int y1 = y0 + height;
2956
2957 unsigned char maskedS = s & mask;
2958 unsigned char invMask = ~mask;
2959 unsigned int fill = maskedS;
2960 fill = fill | (fill << 8) | (fill << 16) + (fill << 24);
2961
2962 if(false)
2963 {
2964 char *target = (char*)lockStencil(0, PUBLIC) + x0 + width2 * y0;
2965
2966 for(int z = 0; z < stencil.depth; z++)
2967 {
2968 for(int y = y0; y < y0 + height; y++)
2969 {
2970 if(mask == 0xFF)
2971 {
Nicolas Capens5ba566b2015-05-25 17:11:04 -04002972 memfill4(target, fill, width);
John Bauman89401822014-05-06 15:04:28 -04002973 }
2974 else
2975 {
2976 for(int x = 0; x < width; x++)
2977 {
2978 target[x] = maskedS | (target[x] & invMask);
2979 }
2980 }
2981
2982 target += width2;
2983 }
2984 }
2985
2986 unlockStencil();
2987 }
2988 else // Quad layout
2989 {
2990 char *buffer = (char*)lockStencil(0, PUBLIC);
2991
2992 if(mask == 0xFF)
2993 {
2994 for(int z = 0; z < stencil.depth; z++)
2995 {
2996 for(int y = y0; y < y1; y++)
2997 {
2998 char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
2999
3000 if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF) // Fill quad line at once
3001 {
3002 if((x0 & 1) != 0)
3003 {
3004 target[(x0 & ~1) * 2 + 1] = fill;
3005 target[(x0 & ~1) * 2 + 3] = fill;
3006 }
3007
Nicolas Capens5ba566b2015-05-25 17:11:04 -04003008 memfill4(&target[((x0 + 1) & ~1) * 2], fill, ((x1 + 1) & ~1) * 2 - ((x0 + 1) & ~1) * 2);
John Bauman89401822014-05-06 15:04:28 -04003009
3010 if((x1 & 1) != 0)
3011 {
3012 target[(x1 & ~1) * 2 + 0] = fill;
3013 target[(x1 & ~1) * 2 + 2] = fill;
3014 }
3015
3016 y++;
3017 }
3018 else
3019 {
3020 for(int x = x0; x < x1; x++)
3021 {
3022 target[(x & ~1) * 2 + (x & 1)] = maskedS | (target[x] & invMask);
3023 }
3024 }
3025 }
3026
3027 buffer += stencil.sliceP;
3028 }
3029 }
3030
3031 unlockStencil();
3032 }
3033 }
3034
3035 void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3036 {
3037 unsigned char *row;
3038 Buffer *buffer;
3039
3040 if(internal.dirty)
3041 {
3042 row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3043 buffer = &internal;
3044 }
3045 else
3046 {
3047 row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3048 buffer = &external;
3049 }
3050
3051 if(buffer->bytes <= 4)
3052 {
3053 int c;
3054 buffer->write(&c, color);
3055
3056 if(buffer->bytes <= 1) c = (c << 8) | c;
3057 if(buffer->bytes <= 2) c = (c << 16) | c;
3058
3059 for(int y = 0; y < height; y++)
3060 {
Nicolas Capens5ba566b2015-05-25 17:11:04 -04003061 memfill4(row, c, width * buffer->bytes);
John Bauman89401822014-05-06 15:04:28 -04003062
3063 row += buffer->pitchB;
3064 }
3065 }
3066 else // Generic
3067 {
3068 for(int y = 0; y < height; y++)
3069 {
3070 unsigned char *element = row;
3071
3072 for(int x = 0; x < width; x++)
3073 {
3074 buffer->write(element, color);
3075
3076 element += buffer->bytes;
3077 }
3078
3079 row += buffer->pitchB;
3080 }
3081 }
3082
3083 if(buffer == &internal)
3084 {
3085 unlockInternal();
3086 }
3087 else
3088 {
3089 unlockExternal();
3090 }
3091 }
3092
3093 Color<float> Surface::readExternal(int x, int y, int z) const
3094 {
3095 ASSERT(external.lock != LOCK_UNLOCKED);
3096
3097 return external.read(x, y, z);
3098 }
3099
3100 Color<float> Surface::readExternal(int x, int y) const
3101 {
3102 ASSERT(external.lock != LOCK_UNLOCKED);
3103
3104 return external.read(x, y);
3105 }
3106
3107 Color<float> Surface::sampleExternal(float x, float y, float z) const
3108 {
3109 ASSERT(external.lock != LOCK_UNLOCKED);
3110
3111 return external.sample(x, y, z);
3112 }
3113
3114 Color<float> Surface::sampleExternal(float x, float y) const
3115 {
3116 ASSERT(external.lock != LOCK_UNLOCKED);
3117
3118 return external.sample(x, y);
3119 }
3120
3121 void Surface::writeExternal(int x, int y, int z, const Color<float> &color)
3122 {
3123 ASSERT(external.lock != LOCK_UNLOCKED);
3124
3125 external.write(x, y, z, color);
3126 }
3127
3128 void Surface::writeExternal(int x, int y, const Color<float> &color)
3129 {
3130 ASSERT(external.lock != LOCK_UNLOCKED);
3131
3132 external.write(x, y, color);
3133 }
3134
3135 Color<float> Surface::readInternal(int x, int y, int z) const
3136 {
3137 ASSERT(internal.lock != LOCK_UNLOCKED);
3138
3139 return internal.read(x, y, z);
3140 }
3141
3142 Color<float> Surface::readInternal(int x, int y) const
3143 {
3144 ASSERT(internal.lock != LOCK_UNLOCKED);
3145
3146 return internal.read(x, y);
3147 }
3148
3149 Color<float> Surface::sampleInternal(float x, float y, float z) const
3150 {
3151 ASSERT(internal.lock != LOCK_UNLOCKED);
3152
3153 return internal.sample(x, y, z);
3154 }
3155
3156 Color<float> Surface::sampleInternal(float x, float y) const
3157 {
3158 ASSERT(internal.lock != LOCK_UNLOCKED);
3159
3160 return internal.sample(x, y);
3161 }
3162
3163 void Surface::writeInternal(int x, int y, int z, const Color<float> &color)
3164 {
3165 ASSERT(internal.lock != LOCK_UNLOCKED);
3166
3167 internal.write(x, y, z, color);
3168 }
3169
3170 void Surface::writeInternal(int x, int y, const Color<float> &color)
3171 {
3172 ASSERT(internal.lock != LOCK_UNLOCKED);
3173
3174 internal.write(x, y, color);
3175 }
3176
3177 bool Surface::hasStencil() const
3178 {
3179 return isStencil(external.format);
3180 }
3181
3182 bool Surface::hasDepth() const
3183 {
3184 return isDepth(external.format);
3185 }
3186
3187 bool Surface::hasPalette() const
3188 {
3189 return isPalette(external.format);
3190 }
3191
3192 bool Surface::isRenderTarget() const
3193 {
3194 return renderTarget;
3195 }
3196
3197 bool Surface::hasDirtyMipmaps() const
3198 {
3199 return dirtyMipmaps;
3200 }
3201
3202 void Surface::cleanMipmaps()
3203 {
3204 dirtyMipmaps = false;
3205 }
3206
3207 Resource *Surface::getResource()
3208 {
3209 return resource;
3210 }
3211
3212 bool Surface::identicalFormats() const
3213 {
John Bauman66b8ab22014-05-06 15:57:45 -04003214 return external.format == internal.format &&
3215 external.width == internal.width &&
Nicolas Capens22658242014-11-29 00:31:41 -05003216 external.height == internal.height &&
3217 external.depth == internal.depth &&
3218 external.pitchB == internal.pitchB &&
3219 external.sliceB == internal.sliceB;
John Bauman89401822014-05-06 15:04:28 -04003220 }
3221
3222 Format Surface::selectInternalFormat(Format format) const
3223 {
3224 switch(format)
3225 {
3226 case FORMAT_NULL:
3227 return FORMAT_NULL;
3228 case FORMAT_P8:
3229 case FORMAT_A8P8:
3230 case FORMAT_A4R4G4B4:
3231 case FORMAT_A1R5G5B5:
3232 case FORMAT_A8R3G3B2:
3233 return FORMAT_A8R8G8B8;
3234 case FORMAT_A8:
3235 return FORMAT_A8;
3236 case FORMAT_R8:
3237 return FORMAT_R8;
3238 case FORMAT_A2R10G10B10:
3239 case FORMAT_A2B10G10R10:
3240 case FORMAT_A16B16G16R16:
3241 return FORMAT_A16B16G16R16;
3242 case FORMAT_G8R8:
3243 return FORMAT_G8R8;
3244 case FORMAT_G16R16:
3245 return FORMAT_G16R16;
3246 case FORMAT_A8R8G8B8:
John Bauman89401822014-05-06 15:04:28 -04003247 if(lockable || !quadLayoutEnabled)
3248 {
3249 return FORMAT_A8R8G8B8;
3250 }
3251 else
3252 {
3253 return FORMAT_A8G8R8B8Q;
3254 }
Nicolas Capens80594422015-06-09 16:42:56 -04003255 case FORMAT_R5G5B5A1:
3256 case FORMAT_R4G4B4A4:
Nicolas Capensef77ac12015-03-28 21:48:51 -04003257 case FORMAT_A8B8G8R8:
3258 return FORMAT_A8B8G8R8;
John Bauman89401822014-05-06 15:04:28 -04003259 case FORMAT_R3G3B2:
3260 case FORMAT_R5G6B5:
3261 case FORMAT_R8G8B8:
3262 case FORMAT_X4R4G4B4:
3263 case FORMAT_X1R5G5B5:
3264 case FORMAT_X8R8G8B8:
John Bauman89401822014-05-06 15:04:28 -04003265 if(lockable || !quadLayoutEnabled)
3266 {
3267 return FORMAT_X8R8G8B8;
3268 }
3269 else
3270 {
3271 return FORMAT_X8G8R8B8Q;
3272 }
Nicolas Capens80594422015-06-09 16:42:56 -04003273 case FORMAT_B8G8R8:
Nicolas Capensef77ac12015-03-28 21:48:51 -04003274 case FORMAT_X8B8G8R8:
3275 return FORMAT_X8B8G8R8;
John Bauman89401822014-05-06 15:04:28 -04003276 // Compressed formats
3277 #if S3TC_SUPPORT
3278 case FORMAT_DXT1:
3279 case FORMAT_DXT3:
3280 case FORMAT_DXT5:
3281 return FORMAT_A8R8G8B8;
John Bauman66b8ab22014-05-06 15:57:45 -04003282 #endif
John Bauman89401822014-05-06 15:04:28 -04003283 case FORMAT_ATI1:
3284 return FORMAT_R8;
3285 case FORMAT_ATI2:
3286 return FORMAT_G8R8;
Nicolas Capens22658242014-11-29 00:31:41 -05003287 case FORMAT_ETC1:
3288 return FORMAT_X8R8G8B8;
John Bauman89401822014-05-06 15:04:28 -04003289 // Bumpmap formats
3290 case FORMAT_V8U8: return FORMAT_V8U8;
3291 case FORMAT_L6V5U5: return FORMAT_X8L8V8U8;
3292 case FORMAT_Q8W8V8U8: return FORMAT_Q8W8V8U8;
3293 case FORMAT_X8L8V8U8: return FORMAT_X8L8V8U8;
3294 case FORMAT_V16U16: return FORMAT_V16U16;
3295 case FORMAT_A2W10V10U10: return FORMAT_A16W16V16U16;
3296 case FORMAT_Q16W16V16U16: return FORMAT_Q16W16V16U16;
3297 // Floating-point formats
Nicolas Capens80594422015-06-09 16:42:56 -04003298 case FORMAT_A16F: return FORMAT_A32B32G32R32F;
John Bauman89401822014-05-06 15:04:28 -04003299 case FORMAT_R16F: return FORMAT_R32F;
3300 case FORMAT_G16R16F: return FORMAT_G32R32F;
Nicolas Capens80594422015-06-09 16:42:56 -04003301 case FORMAT_B16G16R16F: return FORMAT_A32B32G32R32F;
John Bauman89401822014-05-06 15:04:28 -04003302 case FORMAT_A16B16G16R16F: return FORMAT_A32B32G32R32F;
Nicolas Capens80594422015-06-09 16:42:56 -04003303 case FORMAT_A32F: return FORMAT_A32B32G32R32F;
John Bauman89401822014-05-06 15:04:28 -04003304 case FORMAT_R32F: return FORMAT_R32F;
3305 case FORMAT_G32R32F: return FORMAT_G32R32F;
Nicolas Capens80594422015-06-09 16:42:56 -04003306 case FORMAT_B32G32R32F: return FORMAT_A32B32G32R32F;
John Bauman89401822014-05-06 15:04:28 -04003307 case FORMAT_A32B32G32R32F: return FORMAT_A32B32G32R32F;
3308 // Luminance formats
3309 case FORMAT_L8: return FORMAT_L8;
3310 case FORMAT_A4L4: return FORMAT_A8L8;
3311 case FORMAT_L16: return FORMAT_L16;
3312 case FORMAT_A8L8: return FORMAT_A8L8;
Nicolas Capens80594422015-06-09 16:42:56 -04003313 case FORMAT_L16F: return FORMAT_A32B32G32R32F;
3314 case FORMAT_A16L16F: return FORMAT_A32B32G32R32F;
3315 case FORMAT_L32F: return FORMAT_A32B32G32R32F;
3316 case FORMAT_A32L32F: return FORMAT_A32B32G32R32F;
John Bauman89401822014-05-06 15:04:28 -04003317 // Depth/stencil formats
3318 case FORMAT_D16:
3319 case FORMAT_D32:
3320 case FORMAT_D24X8:
3321 case FORMAT_D24S8:
3322 case FORMAT_D24FS8:
3323 if(hasParent) // Texture
3324 {
John Bauman66b8ab22014-05-06 15:57:45 -04003325 return FORMAT_D32FS8_SHADOW;
John Bauman89401822014-05-06 15:04:28 -04003326 }
3327 else if(complementaryDepthBuffer)
3328 {
3329 return FORMAT_D32F_COMPLEMENTARY;
3330 }
3331 else
3332 {
3333 return FORMAT_D32F;
3334 }
John Bauman66b8ab22014-05-06 15:57:45 -04003335 case FORMAT_D32F_LOCKABLE: return FORMAT_D32F_LOCKABLE;
3336 case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3337 case FORMAT_INTZ: return FORMAT_D32FS8_TEXTURE;
3338 case FORMAT_DF24S8: return FORMAT_D32FS8_SHADOW;
3339 case FORMAT_DF16S8: return FORMAT_D32FS8_SHADOW;
John Bauman89401822014-05-06 15:04:28 -04003340 default:
3341 ASSERT(false);
3342 }
3343
3344 return FORMAT_NULL;
3345 }
3346
3347 void Surface::setTexturePalette(unsigned int *palette)
3348 {
3349 Surface::palette = palette;
3350 Surface::paletteID++;
3351 }
3352
3353 void Surface::resolve()
3354 {
3355 if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3356 {
3357 return;
3358 }
3359
3360 void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3361
3362 int quality = internal.depth;
3363 int width = internal.width;
3364 int height = internal.height;
3365 int pitch = internal.pitchB;
3366 int slice = internal.sliceB;
3367
3368 unsigned char *source0 = (unsigned char*)source;
3369 unsigned char *source1 = source0 + slice;
3370 unsigned char *source2 = source1 + slice;
3371 unsigned char *source3 = source2 + slice;
3372 unsigned char *source4 = source3 + slice;
3373 unsigned char *source5 = source4 + slice;
3374 unsigned char *source6 = source5 + slice;
3375 unsigned char *source7 = source6 + slice;
3376 unsigned char *source8 = source7 + slice;
3377 unsigned char *source9 = source8 + slice;
3378 unsigned char *sourceA = source9 + slice;
3379 unsigned char *sourceB = sourceA + slice;
3380 unsigned char *sourceC = sourceB + slice;
3381 unsigned char *sourceD = sourceC + slice;
3382 unsigned char *sourceE = sourceD + slice;
3383 unsigned char *sourceF = sourceE + slice;
3384
Nicolas Capensef77ac12015-03-28 21:48:51 -04003385 if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 || internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8)
John Bauman89401822014-05-06 15:04:28 -04003386 {
3387 if(CPUID::supportsSSE2() && (width % 4) == 0)
3388 {
3389 if(internal.depth == 2)
3390 {
3391 for(int y = 0; y < height; y++)
3392 {
3393 for(int x = 0; x < width; x += 4)
3394 {
3395 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3396 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3397
3398 c0 = _mm_avg_epu8(c0, c1);
3399
3400 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3401 }
3402
3403 source0 += pitch;
3404 source1 += pitch;
3405 }
3406 }
3407 else if(internal.depth == 4)
3408 {
3409 for(int y = 0; y < height; y++)
3410 {
3411 for(int x = 0; x < width; x += 4)
3412 {
3413 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3414 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3415 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3416 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3417
3418 c0 = _mm_avg_epu8(c0, c1);
3419 c2 = _mm_avg_epu8(c2, c3);
3420 c0 = _mm_avg_epu8(c0, c2);
3421
3422 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3423 }
3424
3425 source0 += pitch;
3426 source1 += pitch;
3427 source2 += pitch;
3428 source3 += pitch;
3429 }
3430 }
3431 else if(internal.depth == 8)
3432 {
3433 for(int y = 0; y < height; y++)
3434 {
3435 for(int x = 0; x < width; x += 4)
3436 {
3437 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3438 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3439 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3440 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3441 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3442 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3443 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3444 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3445
3446 c0 = _mm_avg_epu8(c0, c1);
3447 c2 = _mm_avg_epu8(c2, c3);
3448 c4 = _mm_avg_epu8(c4, c5);
3449 c6 = _mm_avg_epu8(c6, c7);
3450 c0 = _mm_avg_epu8(c0, c2);
3451 c4 = _mm_avg_epu8(c4, c6);
3452 c0 = _mm_avg_epu8(c0, c4);
3453
3454 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3455 }
3456
3457 source0 += pitch;
3458 source1 += pitch;
3459 source2 += pitch;
3460 source3 += pitch;
3461 source4 += pitch;
3462 source5 += pitch;
3463 source6 += pitch;
3464 source7 += pitch;
3465 }
3466 }
3467 else if(internal.depth == 16)
3468 {
3469 for(int y = 0; y < height; y++)
3470 {
3471 for(int x = 0; x < width; x += 4)
3472 {
3473 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3474 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3475 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3476 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3477 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3478 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3479 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3480 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3481 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3482 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3483 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3484 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3485 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3486 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3487 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3488 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3489
3490 c0 = _mm_avg_epu8(c0, c1);
3491 c2 = _mm_avg_epu8(c2, c3);
3492 c4 = _mm_avg_epu8(c4, c5);
3493 c6 = _mm_avg_epu8(c6, c7);
3494 c8 = _mm_avg_epu8(c8, c9);
3495 cA = _mm_avg_epu8(cA, cB);
3496 cC = _mm_avg_epu8(cC, cD);
3497 cE = _mm_avg_epu8(cE, cF);
3498 c0 = _mm_avg_epu8(c0, c2);
3499 c4 = _mm_avg_epu8(c4, c6);
3500 c8 = _mm_avg_epu8(c8, cA);
3501 cC = _mm_avg_epu8(cC, cE);
3502 c0 = _mm_avg_epu8(c0, c4);
3503 c8 = _mm_avg_epu8(c8, cC);
3504 c0 = _mm_avg_epu8(c0, c8);
3505
3506 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3507 }
3508
3509 source0 += pitch;
3510 source1 += pitch;
3511 source2 += pitch;
3512 source3 += pitch;
3513 source4 += pitch;
3514 source5 += pitch;
3515 source6 += pitch;
3516 source7 += pitch;
3517 source8 += pitch;
3518 source9 += pitch;
3519 sourceA += pitch;
3520 sourceB += pitch;
3521 sourceC += pitch;
3522 sourceD += pitch;
3523 sourceE += pitch;
3524 sourceF += pitch;
3525 }
3526 }
3527 else ASSERT(false);
3528 }
3529 else
3530 {
3531 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3532
3533 if(internal.depth == 2)
3534 {
3535 for(int y = 0; y < height; y++)
3536 {
3537 for(int x = 0; x < width; x++)
3538 {
3539 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3540 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3541
3542 c0 = AVERAGE(c0, c1);
3543
3544 *(unsigned int*)(source0 + 4 * x) = c0;
3545 }
3546
3547 source0 += pitch;
3548 source1 += pitch;
3549 }
3550 }
3551 else if(internal.depth == 4)
3552 {
3553 for(int y = 0; y < height; y++)
3554 {
3555 for(int x = 0; x < width; x++)
3556 {
3557 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3558 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3559 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3560 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3561
3562 c0 = AVERAGE(c0, c1);
3563 c2 = AVERAGE(c2, c3);
3564 c0 = AVERAGE(c0, c2);
3565
3566 *(unsigned int*)(source0 + 4 * x) = c0;
3567 }
3568
3569 source0 += pitch;
3570 source1 += pitch;
3571 source2 += pitch;
3572 source3 += pitch;
3573 }
3574 }
3575 else if(internal.depth == 8)
3576 {
3577 for(int y = 0; y < height; y++)
3578 {
3579 for(int x = 0; x < width; x++)
3580 {
3581 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3582 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3583 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3584 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3585 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3586 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3587 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3588 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3589
3590 c0 = AVERAGE(c0, c1);
3591 c2 = AVERAGE(c2, c3);
3592 c4 = AVERAGE(c4, c5);
3593 c6 = AVERAGE(c6, c7);
3594 c0 = AVERAGE(c0, c2);
3595 c4 = AVERAGE(c4, c6);
3596 c0 = AVERAGE(c0, c4);
3597
3598 *(unsigned int*)(source0 + 4 * x) = c0;
3599 }
3600
3601 source0 += pitch;
3602 source1 += pitch;
3603 source2 += pitch;
3604 source3 += pitch;
3605 source4 += pitch;
3606 source5 += pitch;
3607 source6 += pitch;
3608 source7 += pitch;
3609 }
3610 }
3611 else if(internal.depth == 16)
3612 {
3613 for(int y = 0; y < height; y++)
3614 {
3615 for(int x = 0; x < width; x++)
3616 {
3617 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3618 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3619 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3620 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3621 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3622 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3623 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3624 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3625 unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3626 unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3627 unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3628 unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3629 unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3630 unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3631 unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3632 unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3633
3634 c0 = AVERAGE(c0, c1);
3635 c2 = AVERAGE(c2, c3);
3636 c4 = AVERAGE(c4, c5);
3637 c6 = AVERAGE(c6, c7);
3638 c8 = AVERAGE(c8, c9);
3639 cA = AVERAGE(cA, cB);
3640 cC = AVERAGE(cC, cD);
3641 cE = AVERAGE(cE, cF);
3642 c0 = AVERAGE(c0, c2);
3643 c4 = AVERAGE(c4, c6);
3644 c8 = AVERAGE(c8, cA);
3645 cC = AVERAGE(cC, cE);
3646 c0 = AVERAGE(c0, c4);
3647 c8 = AVERAGE(c8, cC);
3648 c0 = AVERAGE(c0, c8);
3649
3650 *(unsigned int*)(source0 + 4 * x) = c0;
3651 }
3652
3653 source0 += pitch;
3654 source1 += pitch;
3655 source2 += pitch;
3656 source3 += pitch;
3657 source4 += pitch;
3658 source5 += pitch;
3659 source6 += pitch;
3660 source7 += pitch;
3661 source8 += pitch;
3662 source9 += pitch;
3663 sourceA += pitch;
3664 sourceB += pitch;
3665 sourceC += pitch;
3666 sourceD += pitch;
3667 sourceE += pitch;
3668 sourceF += pitch;
3669 }
3670 }
3671 else ASSERT(false);
3672
3673 #undef AVERAGE
3674 }
3675 }
3676 else if(internal.format == FORMAT_G16R16)
3677 {
3678 if(CPUID::supportsSSE2() && (width % 4) == 0)
3679 {
3680 if(internal.depth == 2)
3681 {
3682 for(int y = 0; y < height; y++)
3683 {
3684 for(int x = 0; x < width; x += 4)
3685 {
3686 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3687 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3688
3689 c0 = _mm_avg_epu16(c0, c1);
3690
3691 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3692 }
3693
3694 source0 += pitch;
3695 source1 += pitch;
3696 }
3697 }
3698 else if(internal.depth == 4)
3699 {
3700 for(int y = 0; y < height; y++)
3701 {
3702 for(int x = 0; x < width; x += 4)
3703 {
3704 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3705 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3706 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3707 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3708
3709 c0 = _mm_avg_epu16(c0, c1);
3710 c2 = _mm_avg_epu16(c2, c3);
3711 c0 = _mm_avg_epu16(c0, c2);
3712
3713 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3714 }
3715
3716 source0 += pitch;
3717 source1 += pitch;
3718 source2 += pitch;
3719 source3 += pitch;
3720 }
3721 }
3722 else if(internal.depth == 8)
3723 {
3724 for(int y = 0; y < height; y++)
3725 {
3726 for(int x = 0; x < width; x += 4)
3727 {
3728 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3729 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3730 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3731 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3732 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3733 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3734 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3735 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3736
3737 c0 = _mm_avg_epu16(c0, c1);
3738 c2 = _mm_avg_epu16(c2, c3);
3739 c4 = _mm_avg_epu16(c4, c5);
3740 c6 = _mm_avg_epu16(c6, c7);
3741 c0 = _mm_avg_epu16(c0, c2);
3742 c4 = _mm_avg_epu16(c4, c6);
3743 c0 = _mm_avg_epu16(c0, c4);
3744
3745 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3746 }
3747
3748 source0 += pitch;
3749 source1 += pitch;
3750 source2 += pitch;
3751 source3 += pitch;
3752 source4 += pitch;
3753 source5 += pitch;
3754 source6 += pitch;
3755 source7 += pitch;
3756 }
3757 }
3758 else if(internal.depth == 16)
3759 {
3760 for(int y = 0; y < height; y++)
3761 {
3762 for(int x = 0; x < width; x += 4)
3763 {
3764 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3765 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3766 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3767 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3768 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3769 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3770 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3771 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3772 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3773 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3774 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3775 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3776 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3777 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3778 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3779 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3780
3781 c0 = _mm_avg_epu16(c0, c1);
3782 c2 = _mm_avg_epu16(c2, c3);
3783 c4 = _mm_avg_epu16(c4, c5);
3784 c6 = _mm_avg_epu16(c6, c7);
3785 c8 = _mm_avg_epu16(c8, c9);
3786 cA = _mm_avg_epu16(cA, cB);
3787 cC = _mm_avg_epu16(cC, cD);
3788 cE = _mm_avg_epu16(cE, cF);
3789 c0 = _mm_avg_epu16(c0, c2);
3790 c4 = _mm_avg_epu16(c4, c6);
3791 c8 = _mm_avg_epu16(c8, cA);
3792 cC = _mm_avg_epu16(cC, cE);
3793 c0 = _mm_avg_epu16(c0, c4);
3794 c8 = _mm_avg_epu16(c8, cC);
3795 c0 = _mm_avg_epu16(c0, c8);
3796
3797 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3798 }
3799
3800 source0 += pitch;
3801 source1 += pitch;
3802 source2 += pitch;
3803 source3 += pitch;
3804 source4 += pitch;
3805 source5 += pitch;
3806 source6 += pitch;
3807 source7 += pitch;
3808 source8 += pitch;
3809 source9 += pitch;
3810 sourceA += pitch;
3811 sourceB += pitch;
3812 sourceC += pitch;
3813 sourceD += pitch;
3814 sourceE += pitch;
3815 sourceF += pitch;
3816 }
3817 }
3818 else ASSERT(false);
3819 }
3820 else
3821 {
3822 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
3823
3824 if(internal.depth == 2)
3825 {
3826 for(int y = 0; y < height; y++)
3827 {
3828 for(int x = 0; x < width; x++)
3829 {
3830 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3831 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3832
3833 c0 = AVERAGE(c0, c1);
3834
3835 *(unsigned int*)(source0 + 4 * x) = c0;
3836 }
3837
3838 source0 += pitch;
3839 source1 += pitch;
3840 }
3841 }
3842 else if(internal.depth == 4)
3843 {
3844 for(int y = 0; y < height; y++)
3845 {
3846 for(int x = 0; x < width; x++)
3847 {
3848 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3849 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3850 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3851 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3852
3853 c0 = AVERAGE(c0, c1);
3854 c2 = AVERAGE(c2, c3);
3855 c0 = AVERAGE(c0, c2);
3856
3857 *(unsigned int*)(source0 + 4 * x) = c0;
3858 }
3859
3860 source0 += pitch;
3861 source1 += pitch;
3862 source2 += pitch;
3863 source3 += pitch;
3864 }
3865 }
3866 else if(internal.depth == 8)
3867 {
3868 for(int y = 0; y < height; y++)
3869 {
3870 for(int x = 0; x < width; x++)
3871 {
3872 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3873 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3874 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3875 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3876 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3877 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3878 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3879 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3880
3881 c0 = AVERAGE(c0, c1);
3882 c2 = AVERAGE(c2, c3);
3883 c4 = AVERAGE(c4, c5);
3884 c6 = AVERAGE(c6, c7);
3885 c0 = AVERAGE(c0, c2);
3886 c4 = AVERAGE(c4, c6);
3887 c0 = AVERAGE(c0, c4);
3888
3889 *(unsigned int*)(source0 + 4 * x) = c0;
3890 }
3891
3892 source0 += pitch;
3893 source1 += pitch;
3894 source2 += pitch;
3895 source3 += pitch;
3896 source4 += pitch;
3897 source5 += pitch;
3898 source6 += pitch;
3899 source7 += pitch;
3900 }
3901 }
3902 else if(internal.depth == 16)
3903 {
3904 for(int y = 0; y < height; y++)
3905 {
3906 for(int x = 0; x < width; x++)
3907 {
3908 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3909 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3910 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3911 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3912 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3913 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3914 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3915 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3916 unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3917 unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3918 unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3919 unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3920 unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3921 unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3922 unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3923 unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3924
3925 c0 = AVERAGE(c0, c1);
3926 c2 = AVERAGE(c2, c3);
3927 c4 = AVERAGE(c4, c5);
3928 c6 = AVERAGE(c6, c7);
3929 c8 = AVERAGE(c8, c9);
3930 cA = AVERAGE(cA, cB);
3931 cC = AVERAGE(cC, cD);
3932 cE = AVERAGE(cE, cF);
3933 c0 = AVERAGE(c0, c2);
3934 c4 = AVERAGE(c4, c6);
3935 c8 = AVERAGE(c8, cA);
3936 cC = AVERAGE(cC, cE);
3937 c0 = AVERAGE(c0, c4);
3938 c8 = AVERAGE(c8, cC);
3939 c0 = AVERAGE(c0, c8);
3940
3941 *(unsigned int*)(source0 + 4 * x) = c0;
3942 }
3943
3944 source0 += pitch;
3945 source1 += pitch;
3946 source2 += pitch;
3947 source3 += pitch;
3948 source4 += pitch;
3949 source5 += pitch;
3950 source6 += pitch;
3951 source7 += pitch;
3952 source8 += pitch;
3953 source9 += pitch;
3954 sourceA += pitch;
3955 sourceB += pitch;
3956 sourceC += pitch;
3957 sourceD += pitch;
3958 sourceE += pitch;
3959 sourceF += pitch;
3960 }
3961 }
3962 else ASSERT(false);
3963
3964 #undef AVERAGE
3965 }
3966 }
3967 else if(internal.format == FORMAT_A16B16G16R16)
3968 {
3969 if(CPUID::supportsSSE2() && (width % 2) == 0)
3970 {
3971 if(internal.depth == 2)
3972 {
3973 for(int y = 0; y < height; y++)
3974 {
3975 for(int x = 0; x < width; x += 2)
3976 {
3977 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3978 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3979
3980 c0 = _mm_avg_epu16(c0, c1);
3981
3982 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3983 }
3984
3985 source0 += pitch;
3986 source1 += pitch;
3987 }
3988 }
3989 else if(internal.depth == 4)
3990 {
3991 for(int y = 0; y < height; y++)
3992 {
3993 for(int x = 0; x < width; x += 2)
3994 {
3995 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3996 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3997 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3998 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3999
4000 c0 = _mm_avg_epu16(c0, c1);
4001 c2 = _mm_avg_epu16(c2, c3);
4002 c0 = _mm_avg_epu16(c0, c2);
4003
4004 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4005 }
4006
4007 source0 += pitch;
4008 source1 += pitch;
4009 source2 += pitch;
4010 source3 += pitch;
4011 }
4012 }
4013 else if(internal.depth == 8)
4014 {
4015 for(int y = 0; y < height; y++)
4016 {
4017 for(int x = 0; x < width; x += 2)
4018 {
4019 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4020 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4021 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4022 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4023 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4024 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4025 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4026 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4027
4028 c0 = _mm_avg_epu16(c0, c1);
4029 c2 = _mm_avg_epu16(c2, c3);
4030 c4 = _mm_avg_epu16(c4, c5);
4031 c6 = _mm_avg_epu16(c6, c7);
4032 c0 = _mm_avg_epu16(c0, c2);
4033 c4 = _mm_avg_epu16(c4, c6);
4034 c0 = _mm_avg_epu16(c0, c4);
4035
4036 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4037 }
4038
4039 source0 += pitch;
4040 source1 += pitch;
4041 source2 += pitch;
4042 source3 += pitch;
4043 source4 += pitch;
4044 source5 += pitch;
4045 source6 += pitch;
4046 source7 += pitch;
4047 }
4048 }
4049 else if(internal.depth == 16)
4050 {
4051 for(int y = 0; y < height; y++)
4052 {
4053 for(int x = 0; x < width; x += 2)
4054 {
4055 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4056 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4057 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4058 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4059 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4060 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4061 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4062 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4063 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4064 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4065 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4066 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4067 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4068 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4069 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4070 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4071
4072 c0 = _mm_avg_epu16(c0, c1);
4073 c2 = _mm_avg_epu16(c2, c3);
4074 c4 = _mm_avg_epu16(c4, c5);
4075 c6 = _mm_avg_epu16(c6, c7);
4076 c8 = _mm_avg_epu16(c8, c9);
4077 cA = _mm_avg_epu16(cA, cB);
4078 cC = _mm_avg_epu16(cC, cD);
4079 cE = _mm_avg_epu16(cE, cF);
4080 c0 = _mm_avg_epu16(c0, c2);
4081 c4 = _mm_avg_epu16(c4, c6);
4082 c8 = _mm_avg_epu16(c8, cA);
4083 cC = _mm_avg_epu16(cC, cE);
4084 c0 = _mm_avg_epu16(c0, c4);
4085 c8 = _mm_avg_epu16(c8, cC);
4086 c0 = _mm_avg_epu16(c0, c8);
4087
4088 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4089 }
4090
4091 source0 += pitch;
4092 source1 += pitch;
4093 source2 += pitch;
4094 source3 += pitch;
4095 source4 += pitch;
4096 source5 += pitch;
4097 source6 += pitch;
4098 source7 += pitch;
4099 source8 += pitch;
4100 source9 += pitch;
4101 sourceA += pitch;
4102 sourceB += pitch;
4103 sourceC += pitch;
4104 sourceD += pitch;
4105 sourceE += pitch;
4106 sourceF += pitch;
4107 }
4108 }
4109 else ASSERT(false);
4110 }
4111 else
4112 {
4113 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4114
4115 if(internal.depth == 2)
4116 {
4117 for(int y = 0; y < height; y++)
4118 {
4119 for(int x = 0; x < 2 * width; x++)
4120 {
4121 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4122 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4123
4124 c0 = AVERAGE(c0, c1);
4125
4126 *(unsigned int*)(source0 + 4 * x) = c0;
4127 }
4128
4129 source0 += pitch;
4130 source1 += pitch;
4131 }
4132 }
4133 else if(internal.depth == 4)
4134 {
4135 for(int y = 0; y < height; y++)
4136 {
4137 for(int x = 0; x < 2 * width; x++)
4138 {
4139 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4140 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4141 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4142 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4143
4144 c0 = AVERAGE(c0, c1);
4145 c2 = AVERAGE(c2, c3);
4146 c0 = AVERAGE(c0, c2);
4147
4148 *(unsigned int*)(source0 + 4 * x) = c0;
4149 }
4150
4151 source0 += pitch;
4152 source1 += pitch;
4153 source2 += pitch;
4154 source3 += pitch;
4155 }
4156 }
4157 else if(internal.depth == 8)
4158 {
4159 for(int y = 0; y < height; y++)
4160 {
4161 for(int x = 0; x < 2 * width; x++)
4162 {
4163 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4164 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4165 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4166 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4167 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4168 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4169 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4170 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4171
4172 c0 = AVERAGE(c0, c1);
4173 c2 = AVERAGE(c2, c3);
4174 c4 = AVERAGE(c4, c5);
4175 c6 = AVERAGE(c6, c7);
4176 c0 = AVERAGE(c0, c2);
4177 c4 = AVERAGE(c4, c6);
4178 c0 = AVERAGE(c0, c4);
4179
4180 *(unsigned int*)(source0 + 4 * x) = c0;
4181 }
4182
4183 source0 += pitch;
4184 source1 += pitch;
4185 source2 += pitch;
4186 source3 += pitch;
4187 source4 += pitch;
4188 source5 += pitch;
4189 source6 += pitch;
4190 source7 += pitch;
4191 }
4192 }
4193 else if(internal.depth == 16)
4194 {
4195 for(int y = 0; y < height; y++)
4196 {
4197 for(int x = 0; x < 2 * width; x++)
4198 {
4199 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4200 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4201 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4202 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4203 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4204 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4205 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4206 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4207 unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4208 unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4209 unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4210 unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4211 unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4212 unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4213 unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4214 unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4215
4216 c0 = AVERAGE(c0, c1);
4217 c2 = AVERAGE(c2, c3);
4218 c4 = AVERAGE(c4, c5);
4219 c6 = AVERAGE(c6, c7);
4220 c8 = AVERAGE(c8, c9);
4221 cA = AVERAGE(cA, cB);
4222 cC = AVERAGE(cC, cD);
4223 cE = AVERAGE(cE, cF);
4224 c0 = AVERAGE(c0, c2);
4225 c4 = AVERAGE(c4, c6);
4226 c8 = AVERAGE(c8, cA);
4227 cC = AVERAGE(cC, cE);
4228 c0 = AVERAGE(c0, c4);
4229 c8 = AVERAGE(c8, cC);
4230 c0 = AVERAGE(c0, c8);
4231
4232 *(unsigned int*)(source0 + 4 * x) = c0;
4233 }
4234
4235 source0 += pitch;
4236 source1 += pitch;
4237 source2 += pitch;
4238 source3 += pitch;
4239 source4 += pitch;
4240 source5 += pitch;
4241 source6 += pitch;
4242 source7 += pitch;
4243 source8 += pitch;
4244 source9 += pitch;
4245 sourceA += pitch;
4246 sourceB += pitch;
4247 sourceC += pitch;
4248 sourceD += pitch;
4249 sourceE += pitch;
4250 sourceF += pitch;
4251 }
4252 }
4253 else ASSERT(false);
4254
4255 #undef AVERAGE
4256 }
4257 }
4258 else if(internal.format == FORMAT_R32F)
4259 {
4260 if(CPUID::supportsSSE() && (width % 4) == 0)
4261 {
4262 if(internal.depth == 2)
4263 {
4264 for(int y = 0; y < height; y++)
4265 {
4266 for(int x = 0; x < width; x += 4)
4267 {
4268 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4269 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4270
4271 c0 = _mm_add_ps(c0, c1);
4272 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4273
4274 _mm_store_ps((float*)(source0 + 4 * x), c0);
4275 }
4276
4277 source0 += pitch;
4278 source1 += pitch;
4279 }
4280 }
4281 else if(internal.depth == 4)
4282 {
4283 for(int y = 0; y < height; y++)
4284 {
4285 for(int x = 0; x < width; x += 4)
4286 {
4287 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4288 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4289 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4290 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4291
4292 c0 = _mm_add_ps(c0, c1);
4293 c2 = _mm_add_ps(c2, c3);
4294 c0 = _mm_add_ps(c0, c2);
4295 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4296
4297 _mm_store_ps((float*)(source0 + 4 * x), c0);
4298 }
4299
4300 source0 += pitch;
4301 source1 += pitch;
4302 source2 += pitch;
4303 source3 += pitch;
4304 }
4305 }
4306 else if(internal.depth == 8)
4307 {
4308 for(int y = 0; y < height; y++)
4309 {
4310 for(int x = 0; x < width; x += 4)
4311 {
4312 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4313 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4314 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4315 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4316 __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4317 __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4318 __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4319 __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4320
4321 c0 = _mm_add_ps(c0, c1);
4322 c2 = _mm_add_ps(c2, c3);
4323 c4 = _mm_add_ps(c4, c5);
4324 c6 = _mm_add_ps(c6, c7);
4325 c0 = _mm_add_ps(c0, c2);
4326 c4 = _mm_add_ps(c4, c6);
4327 c0 = _mm_add_ps(c0, c4);
4328 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4329
4330 _mm_store_ps((float*)(source0 + 4 * x), c0);
4331 }
4332
4333 source0 += pitch;
4334 source1 += pitch;
4335 source2 += pitch;
4336 source3 += pitch;
4337 source4 += pitch;
4338 source5 += pitch;
4339 source6 += pitch;
4340 source7 += pitch;
4341 }
4342 }
4343 else if(internal.depth == 16)
4344 {
4345 for(int y = 0; y < height; y++)
4346 {
4347 for(int x = 0; x < width; x += 4)
4348 {
4349 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4350 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4351 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4352 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4353 __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4354 __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4355 __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4356 __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4357 __m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4358 __m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4359 __m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4360 __m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4361 __m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4362 __m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4363 __m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4364 __m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4365
4366 c0 = _mm_add_ps(c0, c1);
4367 c2 = _mm_add_ps(c2, c3);
4368 c4 = _mm_add_ps(c4, c5);
4369 c6 = _mm_add_ps(c6, c7);
4370 c8 = _mm_add_ps(c8, c9);
4371 cA = _mm_add_ps(cA, cB);
4372 cC = _mm_add_ps(cC, cD);
4373 cE = _mm_add_ps(cE, cF);
4374 c0 = _mm_add_ps(c0, c2);
4375 c4 = _mm_add_ps(c4, c6);
4376 c8 = _mm_add_ps(c8, cA);
4377 cC = _mm_add_ps(cC, cE);
4378 c0 = _mm_add_ps(c0, c4);
4379 c8 = _mm_add_ps(c8, cC);
4380 c0 = _mm_add_ps(c0, c8);
4381 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4382
4383 _mm_store_ps((float*)(source0 + 4 * x), c0);
4384 }
4385
4386 source0 += pitch;
4387 source1 += pitch;
4388 source2 += pitch;
4389 source3 += pitch;
4390 source4 += pitch;
4391 source5 += pitch;
4392 source6 += pitch;
4393 source7 += pitch;
4394 source8 += pitch;
4395 source9 += pitch;
4396 sourceA += pitch;
4397 sourceB += pitch;
4398 sourceC += pitch;
4399 sourceD += pitch;
4400 sourceE += pitch;
4401 sourceF += pitch;
4402 }
4403 }
4404 else ASSERT(false);
4405 }
4406 else
4407 {
4408 if(internal.depth == 2)
4409 {
4410 for(int y = 0; y < height; y++)
4411 {
4412 for(int x = 0; x < width; x++)
4413 {
4414 float c0 = *(float*)(source0 + 4 * x);
4415 float c1 = *(float*)(source1 + 4 * x);
4416
4417 c0 = c0 + c1;
4418 c0 *= 1.0f / 2.0f;
4419
4420 *(float*)(source0 + 4 * x) = c0;
4421 }
4422
4423 source0 += pitch;
4424 source1 += pitch;
4425 }
4426 }
4427 else if(internal.depth == 4)
4428 {
4429 for(int y = 0; y < height; y++)
4430 {
4431 for(int x = 0; x < width; x++)
4432 {
4433 float c0 = *(float*)(source0 + 4 * x);
4434 float c1 = *(float*)(source1 + 4 * x);
4435 float c2 = *(float*)(source2 + 4 * x);
4436 float c3 = *(float*)(source3 + 4 * x);
4437
4438 c0 = c0 + c1;
4439 c2 = c2 + c3;
4440 c0 = c0 + c2;
4441 c0 *= 1.0f / 4.0f;
4442
4443 *(float*)(source0 + 4 * x) = c0;
4444 }
4445
4446 source0 += pitch;
4447 source1 += pitch;
4448 source2 += pitch;
4449 source3 += pitch;
4450 }
4451 }
4452 else if(internal.depth == 8)
4453 {
4454 for(int y = 0; y < height; y++)
4455 {
4456 for(int x = 0; x < width; x++)
4457 {
4458 float c0 = *(float*)(source0 + 4 * x);
4459 float c1 = *(float*)(source1 + 4 * x);
4460 float c2 = *(float*)(source2 + 4 * x);
4461 float c3 = *(float*)(source3 + 4 * x);
4462 float c4 = *(float*)(source4 + 4 * x);
4463 float c5 = *(float*)(source5 + 4 * x);
4464 float c6 = *(float*)(source6 + 4 * x);
4465 float c7 = *(float*)(source7 + 4 * x);
4466
4467 c0 = c0 + c1;
4468 c2 = c2 + c3;
4469 c4 = c4 + c5;
4470 c6 = c6 + c7;
4471 c0 = c0 + c2;
4472 c4 = c4 + c6;
4473 c0 = c0 + c4;
4474 c0 *= 1.0f / 8.0f;
4475
4476 *(float*)(source0 + 4 * x) = c0;
4477 }
4478
4479 source0 += pitch;
4480 source1 += pitch;
4481 source2 += pitch;
4482 source3 += pitch;
4483 source4 += pitch;
4484 source5 += pitch;
4485 source6 += pitch;
4486 source7 += pitch;
4487 }
4488 }
4489 else if(internal.depth == 16)
4490 {
4491 for(int y = 0; y < height; y++)
4492 {
4493 for(int x = 0; x < width; x++)
4494 {
4495 float c0 = *(float*)(source0 + 4 * x);
4496 float c1 = *(float*)(source1 + 4 * x);
4497 float c2 = *(float*)(source2 + 4 * x);
4498 float c3 = *(float*)(source3 + 4 * x);
4499 float c4 = *(float*)(source4 + 4 * x);
4500 float c5 = *(float*)(source5 + 4 * x);
4501 float c6 = *(float*)(source6 + 4 * x);
4502 float c7 = *(float*)(source7 + 4 * x);
4503 float c8 = *(float*)(source8 + 4 * x);
4504 float c9 = *(float*)(source9 + 4 * x);
4505 float cA = *(float*)(sourceA + 4 * x);
4506 float cB = *(float*)(sourceB + 4 * x);
4507 float cC = *(float*)(sourceC + 4 * x);
4508 float cD = *(float*)(sourceD + 4 * x);
4509 float cE = *(float*)(sourceE + 4 * x);
4510 float cF = *(float*)(sourceF + 4 * x);
4511
4512 c0 = c0 + c1;
4513 c2 = c2 + c3;
4514 c4 = c4 + c5;
4515 c6 = c6 + c7;
4516 c8 = c8 + c9;
4517 cA = cA + cB;
4518 cC = cC + cD;
4519 cE = cE + cF;
4520 c0 = c0 + c2;
4521 c4 = c4 + c6;
4522 c8 = c8 + cA;
4523 cC = cC + cE;
4524 c0 = c0 + c4;
4525 c8 = c8 + cC;
4526 c0 = c0 + c8;
4527 c0 *= 1.0f / 16.0f;
4528
4529 *(float*)(source0 + 4 * x) = c0;
4530 }
4531
4532 source0 += pitch;
4533 source1 += pitch;
4534 source2 += pitch;
4535 source3 += pitch;
4536 source4 += pitch;
4537 source5 += pitch;
4538 source6 += pitch;
4539 source7 += pitch;
4540 source8 += pitch;
4541 source9 += pitch;
4542 sourceA += pitch;
4543 sourceB += pitch;
4544 sourceC += pitch;
4545 sourceD += pitch;
4546 sourceE += pitch;
4547 sourceF += pitch;
4548 }
4549 }
4550 else ASSERT(false);
4551 }
4552 }
4553 else if(internal.format == FORMAT_G32R32F)
4554 {
4555 if(CPUID::supportsSSE() && (width % 2) == 0)
4556 {
4557 if(internal.depth == 2)
4558 {
4559 for(int y = 0; y < height; y++)
4560 {
4561 for(int x = 0; x < width; x += 2)
4562 {
4563 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4564 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4565
4566 c0 = _mm_add_ps(c0, c1);
4567 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4568
4569 _mm_store_ps((float*)(source0 + 8 * x), c0);
4570 }
4571
4572 source0 += pitch;
4573 source1 += pitch;
4574 }
4575 }
4576 else if(internal.depth == 4)
4577 {
4578 for(int y = 0; y < height; y++)
4579 {
4580 for(int x = 0; x < width; x += 2)
4581 {
4582 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4583 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4584 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4585 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4586
4587 c0 = _mm_add_ps(c0, c1);
4588 c2 = _mm_add_ps(c2, c3);
4589 c0 = _mm_add_ps(c0, c2);
4590 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4591
4592 _mm_store_ps((float*)(source0 + 8 * x), c0);
4593 }
4594
4595 source0 += pitch;
4596 source1 += pitch;
4597 source2 += pitch;
4598 source3 += pitch;
4599 }
4600 }
4601 else if(internal.depth == 8)
4602 {
4603 for(int y = 0; y < height; y++)
4604 {
4605 for(int x = 0; x < width; x += 2)
4606 {
4607 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4608 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4609 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4610 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4611 __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4612 __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4613 __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4614 __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4615
4616 c0 = _mm_add_ps(c0, c1);
4617 c2 = _mm_add_ps(c2, c3);
4618 c4 = _mm_add_ps(c4, c5);
4619 c6 = _mm_add_ps(c6, c7);
4620 c0 = _mm_add_ps(c0, c2);
4621 c4 = _mm_add_ps(c4, c6);
4622 c0 = _mm_add_ps(c0, c4);
4623 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4624
4625 _mm_store_ps((float*)(source0 + 8 * x), c0);
4626 }
4627
4628 source0 += pitch;
4629 source1 += pitch;
4630 source2 += pitch;
4631 source3 += pitch;
4632 source4 += pitch;
4633 source5 += pitch;
4634 source6 += pitch;
4635 source7 += pitch;
4636 }
4637 }
4638 else if(internal.depth == 16)
4639 {
4640 for(int y = 0; y < height; y++)
4641 {
4642 for(int x = 0; x < width; x += 2)
4643 {
4644 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4645 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4646 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4647 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4648 __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4649 __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4650 __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4651 __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4652 __m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4653 __m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4654 __m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4655 __m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4656 __m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4657 __m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4658 __m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4659 __m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4660
4661 c0 = _mm_add_ps(c0, c1);
4662 c2 = _mm_add_ps(c2, c3);
4663 c4 = _mm_add_ps(c4, c5);
4664 c6 = _mm_add_ps(c6, c7);
4665 c8 = _mm_add_ps(c8, c9);
4666 cA = _mm_add_ps(cA, cB);
4667 cC = _mm_add_ps(cC, cD);
4668 cE = _mm_add_ps(cE, cF);
4669 c0 = _mm_add_ps(c0, c2);
4670 c4 = _mm_add_ps(c4, c6);
4671 c8 = _mm_add_ps(c8, cA);
4672 cC = _mm_add_ps(cC, cE);
4673 c0 = _mm_add_ps(c0, c4);
4674 c8 = _mm_add_ps(c8, cC);
4675 c0 = _mm_add_ps(c0, c8);
4676 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4677
4678 _mm_store_ps((float*)(source0 + 8 * x), c0);
4679 }
4680
4681 source0 += pitch;
4682 source1 += pitch;
4683 source2 += pitch;
4684 source3 += pitch;
4685 source4 += pitch;
4686 source5 += pitch;
4687 source6 += pitch;
4688 source7 += pitch;
4689 source8 += pitch;
4690 source9 += pitch;
4691 sourceA += pitch;
4692 sourceB += pitch;
4693 sourceC += pitch;
4694 sourceD += pitch;
4695 sourceE += pitch;
4696 sourceF += pitch;
4697 }
4698 }
4699 else ASSERT(false);
4700 }
4701 else
4702 {
4703 if(internal.depth == 2)
4704 {
4705 for(int y = 0; y < height; y++)
4706 {
4707 for(int x = 0; x < 2 * width; x++)
4708 {
4709 float c0 = *(float*)(source0 + 4 * x);
4710 float c1 = *(float*)(source1 + 4 * x);
4711
4712 c0 = c0 + c1;
4713 c0 *= 1.0f / 2.0f;
4714
4715 *(float*)(source0 + 4 * x) = c0;
4716 }
4717
4718 source0 += pitch;
4719 source1 += pitch;
4720 }
4721 }
4722 else if(internal.depth == 4)
4723 {
4724 for(int y = 0; y < height; y++)
4725 {
4726 for(int x = 0; x < 2 * width; x++)
4727 {
4728 float c0 = *(float*)(source0 + 4 * x);
4729 float c1 = *(float*)(source1 + 4 * x);
4730 float c2 = *(float*)(source2 + 4 * x);
4731 float c3 = *(float*)(source3 + 4 * x);
4732
4733 c0 = c0 + c1;
4734 c2 = c2 + c3;
4735 c0 = c0 + c2;
4736 c0 *= 1.0f / 4.0f;
4737
4738 *(float*)(source0 + 4 * x) = c0;
4739 }
4740
4741 source0 += pitch;
4742 source1 += pitch;
4743 source2 += pitch;
4744 source3 += pitch;
4745 }
4746 }
4747 else if(internal.depth == 8)
4748 {
4749 for(int y = 0; y < height; y++)
4750 {
4751 for(int x = 0; x < 2 * width; x++)
4752 {
4753 float c0 = *(float*)(source0 + 4 * x);
4754 float c1 = *(float*)(source1 + 4 * x);
4755 float c2 = *(float*)(source2 + 4 * x);
4756 float c3 = *(float*)(source3 + 4 * x);
4757 float c4 = *(float*)(source4 + 4 * x);
4758 float c5 = *(float*)(source5 + 4 * x);
4759 float c6 = *(float*)(source6 + 4 * x);
4760 float c7 = *(float*)(source7 + 4 * x);
4761
4762 c0 = c0 + c1;
4763 c2 = c2 + c3;
4764 c4 = c4 + c5;
4765 c6 = c6 + c7;
4766 c0 = c0 + c2;
4767 c4 = c4 + c6;
4768 c0 = c0 + c4;
4769 c0 *= 1.0f / 8.0f;
4770
4771 *(float*)(source0 + 4 * x) = c0;
4772 }
4773
4774 source0 += pitch;
4775 source1 += pitch;
4776 source2 += pitch;
4777 source3 += pitch;
4778 source4 += pitch;
4779 source5 += pitch;
4780 source6 += pitch;
4781 source7 += pitch;
4782 }
4783 }
4784 else if(internal.depth == 16)
4785 {
4786 for(int y = 0; y < height; y++)
4787 {
4788 for(int x = 0; x < 2 * width; x++)
4789 {
4790 float c0 = *(float*)(source0 + 4 * x);
4791 float c1 = *(float*)(source1 + 4 * x);
4792 float c2 = *(float*)(source2 + 4 * x);
4793 float c3 = *(float*)(source3 + 4 * x);
4794 float c4 = *(float*)(source4 + 4 * x);
4795 float c5 = *(float*)(source5 + 4 * x);
4796 float c6 = *(float*)(source6 + 4 * x);
4797 float c7 = *(float*)(source7 + 4 * x);
4798 float c8 = *(float*)(source8 + 4 * x);
4799 float c9 = *(float*)(source9 + 4 * x);
4800 float cA = *(float*)(sourceA + 4 * x);
4801 float cB = *(float*)(sourceB + 4 * x);
4802 float cC = *(float*)(sourceC + 4 * x);
4803 float cD = *(float*)(sourceD + 4 * x);
4804 float cE = *(float*)(sourceE + 4 * x);
4805 float cF = *(float*)(sourceF + 4 * x);
4806
4807 c0 = c0 + c1;
4808 c2 = c2 + c3;
4809 c4 = c4 + c5;
4810 c6 = c6 + c7;
4811 c8 = c8 + c9;
4812 cA = cA + cB;
4813 cC = cC + cD;
4814 cE = cE + cF;
4815 c0 = c0 + c2;
4816 c4 = c4 + c6;
4817 c8 = c8 + cA;
4818 cC = cC + cE;
4819 c0 = c0 + c4;
4820 c8 = c8 + cC;
4821 c0 = c0 + c8;
4822 c0 *= 1.0f / 16.0f;
4823
4824 *(float*)(source0 + 4 * x) = c0;
4825 }
4826
4827 source0 += pitch;
4828 source1 += pitch;
4829 source2 += pitch;
4830 source3 += pitch;
4831 source4 += pitch;
4832 source5 += pitch;
4833 source6 += pitch;
4834 source7 += pitch;
4835 source8 += pitch;
4836 source9 += pitch;
4837 sourceA += pitch;
4838 sourceB += pitch;
4839 sourceC += pitch;
4840 sourceD += pitch;
4841 sourceE += pitch;
4842 sourceF += pitch;
4843 }
4844 }
4845 else ASSERT(false);
4846 }
4847 }
4848 else if(internal.format == FORMAT_A32B32G32R32F)
4849 {
4850 if(CPUID::supportsSSE())
4851 {
4852 if(internal.depth == 2)
4853 {
4854 for(int y = 0; y < height; y++)
4855 {
4856 for(int x = 0; x < width; x++)
4857 {
4858 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4859 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4860
4861 c0 = _mm_add_ps(c0, c1);
4862 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4863
4864 _mm_store_ps((float*)(source0 + 16 * x), c0);
4865 }
4866
4867 source0 += pitch;
4868 source1 += pitch;
4869 }
4870 }
4871 else if(internal.depth == 4)
4872 {
4873 for(int y = 0; y < height; y++)
4874 {
4875 for(int x = 0; x < width; x++)
4876 {
4877 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4878 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4879 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4880 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4881
4882 c0 = _mm_add_ps(c0, c1);
4883 c2 = _mm_add_ps(c2, c3);
4884 c0 = _mm_add_ps(c0, c2);
4885 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4886
4887 _mm_store_ps((float*)(source0 + 16 * x), c0);
4888 }
4889
4890 source0 += pitch;
4891 source1 += pitch;
4892 source2 += pitch;
4893 source3 += pitch;
4894 }
4895 }
4896 else if(internal.depth == 8)
4897 {
4898 for(int y = 0; y < height; y++)
4899 {
4900 for(int x = 0; x < width; x++)
4901 {
4902 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4903 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4904 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4905 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4906 __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
4907 __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
4908 __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
4909 __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
4910
4911 c0 = _mm_add_ps(c0, c1);
4912 c2 = _mm_add_ps(c2, c3);
4913 c4 = _mm_add_ps(c4, c5);
4914 c6 = _mm_add_ps(c6, c7);
4915 c0 = _mm_add_ps(c0, c2);
4916 c4 = _mm_add_ps(c4, c6);
4917 c0 = _mm_add_ps(c0, c4);
4918 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4919
4920 _mm_store_ps((float*)(source0 + 16 * x), c0);
4921 }
4922
4923 source0 += pitch;
4924 source1 += pitch;
4925 source2 += pitch;
4926 source3 += pitch;
4927 source4 += pitch;
4928 source5 += pitch;
4929 source6 += pitch;
4930 source7 += pitch;
4931 }
4932 }
4933 else if(internal.depth == 16)
4934 {
4935 for(int y = 0; y < height; y++)
4936 {
4937 for(int x = 0; x < width; x++)
4938 {
4939 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4940 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4941 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4942 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4943 __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
4944 __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
4945 __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
4946 __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
4947 __m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
4948 __m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
4949 __m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
4950 __m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
4951 __m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
4952 __m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
4953 __m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
4954 __m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
4955
4956 c0 = _mm_add_ps(c0, c1);
4957 c2 = _mm_add_ps(c2, c3);
4958 c4 = _mm_add_ps(c4, c5);
4959 c6 = _mm_add_ps(c6, c7);
4960 c8 = _mm_add_ps(c8, c9);
4961 cA = _mm_add_ps(cA, cB);
4962 cC = _mm_add_ps(cC, cD);
4963 cE = _mm_add_ps(cE, cF);
4964 c0 = _mm_add_ps(c0, c2);
4965 c4 = _mm_add_ps(c4, c6);
4966 c8 = _mm_add_ps(c8, cA);
4967 cC = _mm_add_ps(cC, cE);
4968 c0 = _mm_add_ps(c0, c4);
4969 c8 = _mm_add_ps(c8, cC);
4970 c0 = _mm_add_ps(c0, c8);
4971 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4972
4973 _mm_store_ps((float*)(source0 + 16 * x), c0);
4974 }
4975
4976 source0 += pitch;
4977 source1 += pitch;
4978 source2 += pitch;
4979 source3 += pitch;
4980 source4 += pitch;
4981 source5 += pitch;
4982 source6 += pitch;
4983 source7 += pitch;
4984 source8 += pitch;
4985 source9 += pitch;
4986 sourceA += pitch;
4987 sourceB += pitch;
4988 sourceC += pitch;
4989 sourceD += pitch;
4990 sourceE += pitch;
4991 sourceF += pitch;
4992 }
4993 }
4994 else ASSERT(false);
4995 }
4996 else
4997 {
4998 if(internal.depth == 2)
4999 {
5000 for(int y = 0; y < height; y++)
5001 {
5002 for(int x = 0; x < 4 * width; x++)
5003 {
5004 float c0 = *(float*)(source0 + 4 * x);
5005 float c1 = *(float*)(source1 + 4 * x);
5006
5007 c0 = c0 + c1;
5008 c0 *= 1.0f / 2.0f;
5009
5010 *(float*)(source0 + 4 * x) = c0;
5011 }
5012
5013 source0 += pitch;
5014 source1 += pitch;
5015 }
5016 }
5017 else if(internal.depth == 4)
5018 {
5019 for(int y = 0; y < height; y++)
5020 {
5021 for(int x = 0; x < 4 * width; x++)
5022 {
5023 float c0 = *(float*)(source0 + 4 * x);
5024 float c1 = *(float*)(source1 + 4 * x);
5025 float c2 = *(float*)(source2 + 4 * x);
5026 float c3 = *(float*)(source3 + 4 * x);
5027
5028 c0 = c0 + c1;
5029 c2 = c2 + c3;
5030 c0 = c0 + c2;
5031 c0 *= 1.0f / 4.0f;
5032
5033 *(float*)(source0 + 4 * x) = c0;
5034 }
5035
5036 source0 += pitch;
5037 source1 += pitch;
5038 source2 += pitch;
5039 source3 += pitch;
5040 }
5041 }
5042 else if(internal.depth == 8)
5043 {
5044 for(int y = 0; y < height; y++)
5045 {
5046 for(int x = 0; x < 4 * width; x++)
5047 {
5048 float c0 = *(float*)(source0 + 4 * x);
5049 float c1 = *(float*)(source1 + 4 * x);
5050 float c2 = *(float*)(source2 + 4 * x);
5051 float c3 = *(float*)(source3 + 4 * x);
5052 float c4 = *(float*)(source4 + 4 * x);
5053 float c5 = *(float*)(source5 + 4 * x);
5054 float c6 = *(float*)(source6 + 4 * x);
5055 float c7 = *(float*)(source7 + 4 * x);
5056
5057 c0 = c0 + c1;
5058 c2 = c2 + c3;
5059 c4 = c4 + c5;
5060 c6 = c6 + c7;
5061 c0 = c0 + c2;
5062 c4 = c4 + c6;
5063 c0 = c0 + c4;
5064 c0 *= 1.0f / 8.0f;
5065
5066 *(float*)(source0 + 4 * x) = c0;
5067 }
5068
5069 source0 += pitch;
5070 source1 += pitch;
5071 source2 += pitch;
5072 source3 += pitch;
5073 source4 += pitch;
5074 source5 += pitch;
5075 source6 += pitch;
5076 source7 += pitch;
5077 }
5078 }
5079 else if(internal.depth == 16)
5080 {
5081 for(int y = 0; y < height; y++)
5082 {
5083 for(int x = 0; x < 4 * width; x++)
5084 {
5085 float c0 = *(float*)(source0 + 4 * x);
5086 float c1 = *(float*)(source1 + 4 * x);
5087 float c2 = *(float*)(source2 + 4 * x);
5088 float c3 = *(float*)(source3 + 4 * x);
5089 float c4 = *(float*)(source4 + 4 * x);
5090 float c5 = *(float*)(source5 + 4 * x);
5091 float c6 = *(float*)(source6 + 4 * x);
5092 float c7 = *(float*)(source7 + 4 * x);
5093 float c8 = *(float*)(source8 + 4 * x);
5094 float c9 = *(float*)(source9 + 4 * x);
5095 float cA = *(float*)(sourceA + 4 * x);
5096 float cB = *(float*)(sourceB + 4 * x);
5097 float cC = *(float*)(sourceC + 4 * x);
5098 float cD = *(float*)(sourceD + 4 * x);
5099 float cE = *(float*)(sourceE + 4 * x);
5100 float cF = *(float*)(sourceF + 4 * x);
5101
5102 c0 = c0 + c1;
5103 c2 = c2 + c3;
5104 c4 = c4 + c5;
5105 c6 = c6 + c7;
5106 c8 = c8 + c9;
5107 cA = cA + cB;
5108 cC = cC + cD;
5109 cE = cE + cF;
5110 c0 = c0 + c2;
5111 c4 = c4 + c6;
5112 c8 = c8 + cA;
5113 cC = cC + cE;
5114 c0 = c0 + c4;
5115 c8 = c8 + cC;
5116 c0 = c0 + c8;
5117 c0 *= 1.0f / 16.0f;
5118
5119 *(float*)(source0 + 4 * x) = c0;
5120 }
5121
5122 source0 += pitch;
5123 source1 += pitch;
5124 source2 += pitch;
5125 source3 += pitch;
5126 source4 += pitch;
5127 source5 += pitch;
5128 source6 += pitch;
5129 source7 += pitch;
5130 source8 += pitch;
5131 source9 += pitch;
5132 sourceA += pitch;
5133 sourceB += pitch;
5134 sourceC += pitch;
5135 sourceD += pitch;
5136 sourceE += pitch;
5137 sourceF += pitch;
5138 }
5139 }
5140 else ASSERT(false);
5141 }
5142 }
5143 else
5144 {
5145 // UNIMPLEMENTED();
5146 }
5147 }
5148}