blob: aadaf97c99da2fe26f09c7b1983607244c286288 [file] [log] [blame]
John Bauman89401822014-05-06 15:04:28 -04001// SwiftShader Software Renderer
2//
John Bauman66b8ab22014-05-06 15:57:45 -04003// Copyright(c) 2005-2013 TransGaming Inc.
John Bauman89401822014-05-06 15:04:28 -04004//
5// All rights reserved. No part of this software may be copied, distributed, transmitted,
6// transcribed, stored in a retrieval system, translated into any human or computer
7// language by any means, or disclosed to third parties without the explicit written
8// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9// or implied, including but not limited to any patent rights, are granted to you.
10//
11
12#include "Surface.hpp"
13
14#include "Color.hpp"
15#include "Context.hpp"
John Bauman19bac1e2014-05-06 15:23:49 -040016#include "Renderer.hpp"
John Bauman89401822014-05-06 15:04:28 -040017#include "Common/Half.hpp"
18#include "Common/Memory.hpp"
19#include "Common/CPUID.hpp"
20#include "Common/Resource.hpp"
21#include "Common/Debug.hpp"
John Bauman19bac1e2014-05-06 15:23:49 -040022#include "Reactor/Reactor.hpp"
John Bauman89401822014-05-06 15:04:28 -040023
24#include <xmmintrin.h>
25#include <emmintrin.h>
26
27#undef min
28#undef max
29
30namespace sw
31{
32 extern bool quadLayoutEnabled;
33 extern bool complementaryDepthBuffer;
34 extern TranscendentalPrecision logPrecision;
35
36 unsigned int *Surface::palette = 0;
37 unsigned int Surface::paletteID = 0;
38
John Bauman19bac1e2014-05-06 15:23:49 -040039 void Rect::clip(int minX, int minY, int maxX, int maxY)
40 {
41 x0 = sw::clamp(x0, minX, maxX);
42 y0 = sw::clamp(y0, minY, maxY);
43 x1 = sw::clamp(x1, minX, maxX);
44 y1 = sw::clamp(y1, minY, maxY);
45 }
46
John Bauman89401822014-05-06 15:04:28 -040047 void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
48 {
49 void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
50
51 write(element, color);
52 }
53
54 void Surface::Buffer::write(int x, int y, const Color<float> &color)
55 {
56 void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
57
58 write(element, color);
59 }
60
61 inline void Surface::Buffer::write(void *element, const Color<float> &color)
62 {
63 switch(format)
64 {
65 case FORMAT_A8:
66 *(unsigned char*)element = unorm<8>(color.a);
67 break;
68 case FORMAT_R8:
69 *(unsigned char*)element = unorm<8>(color.r);
70 break;
71 case FORMAT_R3G3B2:
72 *(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
73 break;
74 case FORMAT_A8R3G3B2:
75 *(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
76 break;
77 case FORMAT_X4R4G4B4:
78 *(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
79 break;
80 case FORMAT_A4R4G4B4:
81 *(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
82 break;
83 case FORMAT_R5G6B5:
84 *(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
85 break;
86 case FORMAT_A1R5G5B5:
87 *(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
88 break;
89 case FORMAT_X1R5G5B5:
90 *(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
91 break;
92 case FORMAT_A8R8G8B8:
93 *(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
94 break;
95 case FORMAT_X8R8G8B8:
96 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
97 break;
98 case FORMAT_A8B8G8R8:
99 *(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
100 break;
101 case FORMAT_X8B8G8R8:
102 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
103 break;
104 case FORMAT_A2R10G10B10:
105 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
106 break;
107 case FORMAT_A2B10G10R10:
108 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
109 break;
110 case FORMAT_G8R8:
111 *(unsigned int*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
112 break;
113 case FORMAT_G16R16:
114 *(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
115 break;
116 case FORMAT_A16B16G16R16:
117 ((unsigned short*)element)[0] = unorm<16>(color.r);
118 ((unsigned short*)element)[1] = unorm<16>(color.g);
119 ((unsigned short*)element)[2] = unorm<16>(color.b);
120 ((unsigned short*)element)[3] = unorm<16>(color.a);
121 break;
122 case FORMAT_V8U8:
123 *(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
124 break;
125 case FORMAT_L6V5U5:
126 *(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
127 break;
128 case FORMAT_Q8W8V8U8:
129 *(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
130 break;
131 case FORMAT_X8L8V8U8:
132 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
133 break;
134 case FORMAT_V16U16:
135 *(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
136 break;
137 case FORMAT_A2W10V10U10:
138 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
139 break;
140 case FORMAT_A16W16V16U16:
141 ((unsigned short*)element)[0] = snorm<16>(color.r);
142 ((unsigned short*)element)[1] = snorm<16>(color.g);
143 ((unsigned short*)element)[2] = snorm<16>(color.b);
144 ((unsigned short*)element)[3] = unorm<16>(color.a);
145 break;
146 case FORMAT_Q16W16V16U16:
147 ((unsigned short*)element)[0] = snorm<16>(color.r);
148 ((unsigned short*)element)[1] = snorm<16>(color.g);
149 ((unsigned short*)element)[2] = snorm<16>(color.b);
150 ((unsigned short*)element)[3] = snorm<16>(color.a);
151 break;
152 case FORMAT_R8G8B8:
153 ((unsigned char*)element)[0] = unorm<8>(color.b);
154 ((unsigned char*)element)[1] = unorm<8>(color.g);
155 ((unsigned char*)element)[2] = unorm<8>(color.r);
156 break;
157 case FORMAT_R16F:
158 *(half*)element = (half)color.r;
159 break;
160 case FORMAT_G16R16F:
161 ((half*)element)[0] = (half)color.r;
162 ((half*)element)[1] = (half)color.g;
163 break;
164 case FORMAT_A16B16G16R16F:
165 ((half*)element)[0] = (half)color.r;
166 ((half*)element)[1] = (half)color.g;
167 ((half*)element)[2] = (half)color.b;
168 ((half*)element)[3] = (half)color.a;
169 break;
170 case FORMAT_R32F:
171 *(float*)element = color.r;
172 break;
173 case FORMAT_G32R32F:
174 ((float*)element)[0] = color.r;
175 ((float*)element)[1] = color.g;
176 break;
177 case FORMAT_A32B32G32R32F:
178 ((float*)element)[0] = color.r;
179 ((float*)element)[1] = color.g;
180 ((float*)element)[2] = color.b;
181 ((float*)element)[3] = color.a;
182 break;
183 case FORMAT_D32F:
184 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -0400185 case FORMAT_D32FS8_TEXTURE:
186 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -0400187 *((float*)element) = color.r;
188 break;
189 case FORMAT_D32F_COMPLEMENTARY:
190 *((float*)element) = 1 - color.r;
191 break;
192 case FORMAT_S8:
193 *((unsigned char*)element) = unorm<8>(color.r);
194 break;
195 case FORMAT_L8:
196 *(unsigned char*)element = unorm<8>(color.r);
197 break;
198 case FORMAT_A4L4:
199 *(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
200 break;
201 case FORMAT_L16:
202 *(unsigned short*)element = unorm<16>(color.r);
203 break;
204 case FORMAT_A8L8:
205 *(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
206 break;
207 default:
208 ASSERT(false);
209 }
210 }
211
212 Color<float> Surface::Buffer::read(int x, int y, int z) const
213 {
214 void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
215
216 return read(element);
217 }
218
219 Color<float> Surface::Buffer::read(int x, int y) const
220 {
221 void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
222
223 return read(element);
224 }
225
226 inline Color<float> Surface::Buffer::read(void *element) const
227 {
228 float r = 1;
229 float g = 1;
230 float b = 1;
231 float a = 1;
232
233 switch(format)
234 {
235 case FORMAT_P8:
236 {
237 ASSERT(palette);
238
239 unsigned int abgr = palette[*(unsigned char*)element];
240
241 r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
242 g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
243 b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
244 a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
245 }
246 break;
247 case FORMAT_A8P8:
248 {
249 ASSERT(palette);
250
251 unsigned int bgr = palette[((unsigned char*)element)[0]];
252
253 r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
254 g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
255 b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
256 a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
257 }
258 break;
259 case FORMAT_A8:
260 r = 0;
261 g = 0;
262 b = 0;
263 a = *(unsigned char*)element * (1.0f / 0xFF);
264 break;
265 case FORMAT_R8:
266 r = *(unsigned char*)element * (1.0f / 0xFF);
267 break;
268 case FORMAT_R3G3B2:
269 {
270 unsigned char rgb = *(unsigned char*)element;
271
272 r = (rgb & 0xE0) * (1.0f / 0xE0);
273 g = (rgb & 0x1C) * (1.0f / 0x1C);
274 b = (rgb & 0x03) * (1.0f / 0x03);
275 }
276 break;
277 case FORMAT_A8R3G3B2:
278 {
279 unsigned short argb = *(unsigned short*)element;
280
281 a = (argb & 0xFF00) * (1.0f / 0xFF00);
282 r = (argb & 0x00E0) * (1.0f / 0x00E0);
283 g = (argb & 0x001C) * (1.0f / 0x001C);
284 b = (argb & 0x0003) * (1.0f / 0x0003);
285 }
286 break;
287 case FORMAT_X4R4G4B4:
288 {
289 unsigned short rgb = *(unsigned short*)element;
290
291 r = (rgb & 0x0F00) * (1.0f / 0x0F00);
292 g = (rgb & 0x00F0) * (1.0f / 0x00F0);
293 b = (rgb & 0x000F) * (1.0f / 0x000F);
294 }
295 break;
296 case FORMAT_A4R4G4B4:
297 {
298 unsigned short argb = *(unsigned short*)element;
299
300 a = (argb & 0xF000) * (1.0f / 0xF000);
301 r = (argb & 0x0F00) * (1.0f / 0x0F00);
302 g = (argb & 0x00F0) * (1.0f / 0x00F0);
303 b = (argb & 0x000F) * (1.0f / 0x000F);
304 }
305 break;
306 case FORMAT_R5G6B5:
307 {
308 unsigned short rgb = *(unsigned short*)element;
309
310 r = (rgb & 0xF800) * (1.0f / 0xF800);
311 g = (rgb & 0x07E0) * (1.0f / 0x07E0);
312 b = (rgb & 0x001F) * (1.0f / 0x001F);
313 }
314 break;
315 case FORMAT_A1R5G5B5:
316 {
317 unsigned short argb = *(unsigned short*)element;
318
319 a = (argb & 0x8000) * (1.0f / 0x8000);
320 r = (argb & 0x7C00) * (1.0f / 0x7C00);
321 g = (argb & 0x03E0) * (1.0f / 0x03E0);
322 b = (argb & 0x001F) * (1.0f / 0x001F);
323 }
324 break;
325 case FORMAT_X1R5G5B5:
326 {
327 unsigned short xrgb = *(unsigned short*)element;
328
329 r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
330 g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
331 b = (xrgb & 0x001F) * (1.0f / 0x001F);
332 }
333 break;
334 case FORMAT_A8R8G8B8:
335 {
336 unsigned int argb = *(unsigned int*)element;
337
338 a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
339 r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
340 g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
341 b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
342 }
343 break;
344 case FORMAT_X8R8G8B8:
345 {
346 unsigned int xrgb = *(unsigned int*)element;
347
348 r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
349 g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
350 b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
351 }
352 break;
353 case FORMAT_A8B8G8R8:
354 {
355 unsigned int abgr = *(unsigned int*)element;
356
357 a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
358 b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
359 g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
360 r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
361 }
362 break;
363 case FORMAT_X8B8G8R8:
364 {
365 unsigned int xbgr = *(unsigned int*)element;
366
367 b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
368 g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
369 r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
370 }
371 break;
372 case FORMAT_G8R8:
373 {
374 unsigned short gr = *(unsigned short*)element;
375
376 g = (gr & 0xFF00) * (1.0f / 0xFF00);
377 r = (gr & 0x00FF) * (1.0f / 0x00FF);
378 }
379 break;
380 case FORMAT_G16R16:
381 {
382 unsigned int gr = *(unsigned int*)element;
383
384 g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
385 r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
386 }
387 break;
388 case FORMAT_A2R10G10B10:
389 {
390 unsigned int argb = *(unsigned int*)element;
391
392 a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
393 r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
394 g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
395 b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
396 }
397 break;
398 case FORMAT_A2B10G10R10:
399 {
400 unsigned int abgr = *(unsigned int*)element;
401
402 a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
403 b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
404 g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
405 r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
406 }
407 break;
408 case FORMAT_A16B16G16R16:
409 r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
410 g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
411 b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
412 a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
413 break;
414 case FORMAT_V8U8:
415 {
416 unsigned short vu = *(unsigned short*)element;
417
418 r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
419 g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
420 }
421 break;
422 case FORMAT_L6V5U5:
423 {
424 unsigned short lvu = *(unsigned short*)element;
425
426 r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
427 g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
428 b = (lvu & 0xFC00) * (1.0f / 0xFC00);
429 }
430 break;
431 case FORMAT_Q8W8V8U8:
432 {
433 unsigned int qwvu = *(unsigned int*)element;
434
435 r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
436 g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
437 b = ((int)(qwvu & 0x00FF0000) << 8) * (1.0f / 0x7F000000);
438 a = ((int)(qwvu & 0xFF000000) << 0) * (1.0f / 0x7F000000);
439 }
440 break;
441 case FORMAT_X8L8V8U8:
442 {
443 unsigned int xlvu = *(unsigned int*)element;
444
445 r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
446 g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
447 b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
448 }
449 break;
450 case FORMAT_R8G8B8:
451 r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
452 g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
453 b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
454 break;
455 case FORMAT_V16U16:
456 {
457 unsigned int vu = *(unsigned int*)element;
458
459 r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
460 g = ((int)(vu & 0xFFFF0000) << 0) * (1.0f / 0x7FFF0000);
461 }
462 break;
463 case FORMAT_A2W10V10U10:
464 {
465 unsigned int awvu = *(unsigned int*)element;
466
467 r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
468 g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
469 b = ((int)(awvu & 0x3FF00000) << 2) * (1.0f / 0x7FC00000);
470 a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
471 }
472 break;
473 case FORMAT_A16W16V16U16:
474 r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
475 g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
476 b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
477 a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
478 break;
479 case FORMAT_Q16W16V16U16:
480 r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
481 g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
482 b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
483 a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
484 break;
485 case FORMAT_L8:
486 r =
487 g =
488 b = *(unsigned char*)element * (1.0f / 0xFF);
489 break;
490 case FORMAT_A4L4:
491 {
492 unsigned char al = *(unsigned char*)element;
493
494 r =
495 g =
496 b = (al & 0x0F) * (1.0f / 0x0F);
497 a = (al & 0xF0) * (1.0f / 0xF0);
498 }
499 break;
500 case FORMAT_L16:
501 r =
502 g =
503 b = *(unsigned short*)element * (1.0f / 0xFFFF);
504 break;
505 case FORMAT_A8L8:
506 r =
507 g =
508 b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
509 a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
510 break;
511 case FORMAT_R16F:
512 r = *(half*)element;
513 break;
514 case FORMAT_G16R16F:
515 r = ((half*)element)[0];
516 g = ((half*)element)[1];
517 break;
518 case FORMAT_A16B16G16R16F:
519 r = ((half*)element)[0];
520 g = ((half*)element)[1];
521 b = ((half*)element)[2];
522 a = ((half*)element)[3];
523 break;
524 case FORMAT_R32F:
525 r = *(float*)element;
526 break;
527 case FORMAT_G32R32F:
528 r = ((float*)element)[0];
529 g = ((float*)element)[1];
530 break;
531 case FORMAT_A32B32G32R32F:
532 r = ((float*)element)[0];
533 g = ((float*)element)[1];
534 b = ((float*)element)[2];
535 a = ((float*)element)[3];
536 break;
537 case FORMAT_D32F:
538 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -0400539 case FORMAT_D32FS8_TEXTURE:
540 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -0400541 r = *(float*)element;
542 g = r;
543 b = r;
544 a = r;
545 break;
546 case FORMAT_D32F_COMPLEMENTARY:
John Bauman66b8ab22014-05-06 15:57:45 -0400547 r = 1.0f - *(float*)element;
John Bauman89401822014-05-06 15:04:28 -0400548 g = r;
549 b = r;
550 a = r;
551 break;
552 case FORMAT_S8:
553 r = *(unsigned char*)element * (1.0f / 0xFF);
554 break;
555 default:
556 ASSERT(false);
557 }
558
559 // if(sRGB)
560 // {
561 // r = sRGBtoLinear(r);
562 // g = sRGBtoLinear(g);
563 // b = sRGBtoLinear(b);
564 // }
565
566 return Color<float>(r, g, b, a);
567 }
568
569 Color<float> Surface::Buffer::sample(float x, float y, float z) const
570 {
571 x -= 0.5f;
572 y -= 0.5f;
573 z -= 0.5f;
574
575 int x0 = clamp((int)x, 0, width - 1);
576 int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
577
578 int y0 = clamp((int)y, 0, height - 1);
579 int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
580
581 int z0 = clamp((int)z, 0, depth - 1);
582 int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
583
584 Color<float> c000 = read(x0, y0, z0);
585 Color<float> c100 = read(x1, y0, z0);
586 Color<float> c010 = read(x0, y1, z0);
587 Color<float> c110 = read(x1, y1, z0);
588 Color<float> c001 = read(x0, y0, z1);
589 Color<float> c101 = read(x1, y0, z1);
590 Color<float> c011 = read(x0, y1, z1);
591 Color<float> c111 = read(x1, y1, z1);
592
593 float fx = x - x0;
594 float fy = y - y0;
595 float fz = z - z0;
596
597 c000 *= (1 - fx) * (1 - fy) * (1 - fz);
598 c100 *= fx * (1 - fy) * (1 - fz);
599 c010 *= (1 - fx) * fy * (1 - fz);
600 c110 *= fx * fy * (1 - fz);
601 c001 *= (1 - fx) * (1 - fy) * fz;
602 c101 *= fx * (1 - fy) * fz;
603 c011 *= (1 - fx) * fy * fz;
604 c111 *= fx * fy * fz;
605
606 return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
607 }
608
609 Color<float> Surface::Buffer::sample(float x, float y) const
610 {
611 x -= 0.5f;
612 y -= 0.5f;
613
614 int x0 = clamp((int)x, 0, width - 1);
615 int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
616
617 int y0 = clamp((int)y, 0, height - 1);
618 int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
619
620 Color<float> c00 = read(x0, y0);
621 Color<float> c10 = read(x1, y0);
622 Color<float> c01 = read(x0, y1);
623 Color<float> c11 = read(x1, y1);
624
625 float fx = x - x0;
626 float fy = y - y0;
627
628 c00 *= (1 - fx) * (1 - fy);
629 c10 *= fx * (1 - fy);
630 c01 *= (1 - fx) * fy;
631 c11 *= fx * fy;
632
633 return c00 + c10 + c01 + c11;
634 }
635
John Bauman19bac1e2014-05-06 15:23:49 -0400636 void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
John Bauman89401822014-05-06 15:04:28 -0400637 {
638 this->lock = lock;
639
640 switch(lock)
641 {
642 case LOCK_UNLOCKED:
643 case LOCK_READONLY:
644 break;
645 case LOCK_WRITEONLY:
646 case LOCK_READWRITE:
647 case LOCK_DISCARD:
648 dirty = true;
649 break;
650 default:
651 ASSERT(false);
652 }
653
654 switch(format)
655 {
656 #if S3TC_SUPPORT
657 case FORMAT_DXT1:
John Bauman66b8ab22014-05-06 15:57:45 -0400658 #endif
John Bauman89401822014-05-06 15:04:28 -0400659 case FORMAT_ATI1:
John Bauman19bac1e2014-05-06 15:23:49 -0400660 return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
John Bauman66b8ab22014-05-06 15:57:45 -0400661 #if S3TC_SUPPORT
John Bauman89401822014-05-06 15:04:28 -0400662 case FORMAT_DXT3:
663 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -0400664 #endif
John Bauman89401822014-05-06 15:04:28 -0400665 case FORMAT_ATI2:
John Bauman19bac1e2014-05-06 15:23:49 -0400666 return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
John Bauman89401822014-05-06 15:04:28 -0400667 default:
John Bauman19bac1e2014-05-06 15:23:49 -0400668 return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
John Bauman89401822014-05-06 15:04:28 -0400669 }
670
671 return 0;
672 }
673
674 void Surface::Buffer::unlockRect()
675 {
676 lock = LOCK_UNLOCKED;
677 }
678
679 Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget)
680 {
681 resource = texture ? texture : new Resource(0);
John Bauman19bac1e2014-05-06 15:23:49 -0400682 hasParent = texture != 0;
John Bauman89401822014-05-06 15:04:28 -0400683 depth = max(1, depth);
684
685 external.buffer = 0;
686 external.width = width;
687 external.height = height;
688 external.depth = depth;
689 external.format = format;
690 external.bytes = bytes(external.format);
691 external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
692 external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
693 external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
694 external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
695 external.lock = LOCK_UNLOCKED;
696 external.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400697
698 internal.buffer = 0;
699 internal.width = width;
700 internal.height = height;
701 internal.depth = depth;
702 internal.format = selectInternalFormat(format);
703 internal.bytes = bytes(internal.format);
704 internal.pitchB = pitchB(internal.width, internal.format, renderTarget);
705 internal.pitchP = pitchP(internal.width, internal.format, renderTarget);
706 internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
707 internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
708 internal.lock = LOCK_UNLOCKED;
709 internal.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400710
711 stencil.buffer = 0;
712 stencil.width = width;
713 stencil.height = height;
714 stencil.depth = depth;
715 stencil.format = FORMAT_S8;
716 stencil.bytes = bytes(stencil.format);
717 stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
718 stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
719 stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
720 stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
721 stencil.lock = LOCK_UNLOCKED;
722 stencil.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400723
724 dirtyMipmaps = true;
John Bauman66b8ab22014-05-06 15:57:45 -0400725 paletteUsed = 0;
John Bauman89401822014-05-06 15:04:28 -0400726 }
727
728 Surface::~Surface()
729 {
John Bauman8a4f6fc2014-05-06 15:26:18 -0400730 // Synchronize so we can deallocate the buffers below
731 resource->lock(DESTRUCT);
732 resource->unlock();
733
John Bauman89401822014-05-06 15:04:28 -0400734 if(!hasParent)
735 {
736 resource->destruct();
737 }
738
739 deallocate(external.buffer);
740
741 if(internal.buffer != external.buffer)
742 {
743 deallocate(internal.buffer);
744 }
745
746 deallocate(stencil.buffer);
747
748 external.buffer = 0;
749 internal.buffer = 0;
750 stencil.buffer = 0;
751 }
752
John Bauman19bac1e2014-05-06 15:23:49 -0400753 void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
John Bauman89401822014-05-06 15:04:28 -0400754 {
755 resource->lock(client);
756
757 if(!external.buffer)
758 {
759 if(internal.buffer && identicalFormats())
760 {
761 external.buffer = internal.buffer;
762 }
763 else
764 {
765 external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
766 }
767 }
768
769 if(internal.dirty)
770 {
771 if(lock != LOCK_DISCARD)
772 {
773 update(external, internal);
774 }
John Bauman66b8ab22014-05-06 15:57:45 -0400775
776 internal.dirty = false;
John Bauman89401822014-05-06 15:04:28 -0400777 }
778
779 switch(lock)
780 {
781 case LOCK_READONLY:
782 break;
783 case LOCK_WRITEONLY:
784 case LOCK_READWRITE:
785 case LOCK_DISCARD:
786 dirtyMipmaps = true;
787 break;
788 default:
789 ASSERT(false);
790 }
791
John Bauman19bac1e2014-05-06 15:23:49 -0400792 return external.lockRect(x, y, z, lock);
John Bauman89401822014-05-06 15:04:28 -0400793 }
794
795 void Surface::unlockExternal()
796 {
797 resource->unlock();
798
799 external.unlockRect();
800 }
801
John Bauman19bac1e2014-05-06 15:23:49 -0400802 void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
John Bauman89401822014-05-06 15:04:28 -0400803 {
804 if(lock != LOCK_UNLOCKED)
805 {
806 resource->lock(client);
807 }
808
809 if(!internal.buffer)
810 {
811 if(external.buffer && identicalFormats())
812 {
813 internal.buffer = external.buffer;
814 }
815 else
816 {
817 internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
818 }
819 }
820
821 // FIXME: WHQL requires conversion to lower external precision and back
822 if(logPrecision >= WHQL)
823 {
824 if(internal.dirty && renderTarget && internal.format != external.format)
825 {
826 if(lock != LOCK_DISCARD)
827 {
828 switch(external.format)
829 {
830 case FORMAT_R3G3B2:
831 case FORMAT_A8R3G3B2:
832 case FORMAT_A1R5G5B5:
833 case FORMAT_A2R10G10B10:
834 case FORMAT_A2B10G10R10:
835 lockExternal(0, 0, 0, LOCK_READWRITE, client);
836 unlockExternal();
837 break;
838 default:
839 // Difference passes WHQL
840 break;
841 }
842 }
843 }
844 }
845
John Bauman66b8ab22014-05-06 15:57:45 -0400846 if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
John Bauman89401822014-05-06 15:04:28 -0400847 {
848 if(lock != LOCK_DISCARD)
849 {
850 update(internal, external);
851 }
John Bauman89401822014-05-06 15:04:28 -0400852
John Bauman66b8ab22014-05-06 15:57:45 -0400853 external.dirty = false;
854 paletteUsed = Surface::paletteID;
John Bauman89401822014-05-06 15:04:28 -0400855 }
856
857 switch(lock)
858 {
859 case LOCK_UNLOCKED:
860 case LOCK_READONLY:
861 break;
862 case LOCK_WRITEONLY:
863 case LOCK_READWRITE:
864 case LOCK_DISCARD:
865 dirtyMipmaps = true;
866 break;
867 default:
868 ASSERT(false);
869 }
870
871 if(lock == LOCK_READONLY && client == PUBLIC)
872 {
873 resolve();
874 }
875
John Bauman19bac1e2014-05-06 15:23:49 -0400876 return internal.lockRect(x, y, z, lock);
John Bauman89401822014-05-06 15:04:28 -0400877 }
878
879 void Surface::unlockInternal()
880 {
881 resource->unlock();
882
883 internal.unlockRect();
884 }
885
886 void *Surface::lockStencil(int front, Accessor client)
887 {
888 resource->lock(client);
889
890 if(!stencil.buffer)
891 {
892 stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
893 }
894
John Bauman89401822014-05-06 15:04:28 -0400895 return stencil.lockRect(0, 0, front, LOCK_READWRITE); // FIXME
896 }
897
898 void Surface::unlockStencil()
899 {
900 resource->unlock();
901
902 stencil.unlockRect();
903 }
904
905 int Surface::bytes(Format format)
906 {
907 switch(format)
908 {
909 case FORMAT_NULL: return 0;
910 case FORMAT_P8: return 1;
911 case FORMAT_A8P8: return 2;
912 case FORMAT_A8: return 1;
913 case FORMAT_R8: return 1;
914 case FORMAT_R3G3B2: return 1;
915 case FORMAT_A8R3G3B2: return 2;
916 case FORMAT_R5G6B5: return 2;
917 case FORMAT_A1R5G5B5: return 2;
918 case FORMAT_X1R5G5B5: return 2;
919 case FORMAT_X4R4G4B4: return 2;
920 case FORMAT_A4R4G4B4: return 2;
921 case FORMAT_R8G8B8: return 3;
922 case FORMAT_X8R8G8B8: return 4;
923 // case FORMAT_X8G8R8B8Q: return 4;
924 case FORMAT_A8R8G8B8: return 4;
925 // case FORMAT_A8G8R8B8Q: return 4;
926 case FORMAT_X8B8G8R8: return 4;
927 case FORMAT_A8B8G8R8: return 4;
928 case FORMAT_A2R10G10B10: return 4;
929 case FORMAT_A2B10G10R10: return 4;
930 case FORMAT_G8R8: return 2;
931 case FORMAT_G16R16: return 4;
932 case FORMAT_A16B16G16R16: return 8;
933 // Compressed formats
934 #if S3TC_SUPPORT
935 case FORMAT_DXT1: return 2; // Column of four pixels
936 case FORMAT_DXT3: return 4; // Column of four pixels
937 case FORMAT_DXT5: return 4; // Column of four pixels
John Bauman66b8ab22014-05-06 15:57:45 -0400938 #endif
John Bauman89401822014-05-06 15:04:28 -0400939 case FORMAT_ATI1: return 2; // Column of four pixels
940 case FORMAT_ATI2: return 4; // Column of four pixels
John Bauman89401822014-05-06 15:04:28 -0400941 // Bumpmap formats
942 case FORMAT_V8U8: return 2;
943 case FORMAT_L6V5U5: return 2;
944 case FORMAT_Q8W8V8U8: return 4;
945 case FORMAT_X8L8V8U8: return 4;
946 case FORMAT_A2W10V10U10: return 4;
947 case FORMAT_V16U16: return 4;
948 case FORMAT_A16W16V16U16: return 8;
949 case FORMAT_Q16W16V16U16: return 8;
950 // Luminance formats
951 case FORMAT_L8: return 1;
952 case FORMAT_A4L4: return 1;
953 case FORMAT_L16: return 2;
954 case FORMAT_A8L8: return 2;
955 // Floating-point formats
956 case FORMAT_R16F: return 2;
957 case FORMAT_G16R16F: return 4;
958 case FORMAT_A16B16G16R16F: return 8;
959 case FORMAT_R32F: return 4;
960 case FORMAT_G32R32F: return 8;
961 case FORMAT_A32B32G32R32F: return 16;
962 // Depth/stencil formats
963 case FORMAT_D16: return 2;
964 case FORMAT_D32: return 4;
965 case FORMAT_D24X8: return 4;
966 case FORMAT_D24S8: return 4;
967 case FORMAT_D24FS8: return 4;
968 case FORMAT_D32F: return 4;
969 case FORMAT_D32F_COMPLEMENTARY: return 4;
970 case FORMAT_D32F_LOCKABLE: return 4;
John Bauman66b8ab22014-05-06 15:57:45 -0400971 case FORMAT_D32FS8_TEXTURE: return 4;
972 case FORMAT_D32FS8_SHADOW: return 4;
973 case FORMAT_DF24S8: return 4;
974 case FORMAT_DF16S8: return 2;
John Bauman89401822014-05-06 15:04:28 -0400975 case FORMAT_INTZ: return 4;
976 case FORMAT_S8: return 1;
977 default:
978 ASSERT(false);
979 }
980
981 return 0;
982 }
983
984 int Surface::pitchB(int width, Format format, bool target)
985 {
986 if(target || isDepth(format) || isStencil(format))
987 {
988 width = ((width + 1) & ~1);
989 }
990
991 switch(format)
992 {
993 #if S3TC_SUPPORT
994 case FORMAT_DXT1:
995 return 8 * ((width + 3) / 4); // 64 bit per 4x4 block, computed per 4 rows
996 case FORMAT_DXT3:
997 case FORMAT_DXT5:
998 return 16 * ((width + 3) / 4); // 128 bit per 4x4 block, computed per 4 rows
John Bauman66b8ab22014-05-06 15:57:45 -0400999 #endif
John Bauman89401822014-05-06 15:04:28 -04001000 case FORMAT_ATI1:
1001 return 2 * ((width + 3) / 4); // 64 bit per 4x4 block, computed per row
1002 case FORMAT_ATI2:
1003 return 4 * ((width + 3) / 4); // 128 bit per 4x4 block, computed per row
John Bauman89401822014-05-06 15:04:28 -04001004 default:
1005 return bytes(format) * width;
1006 }
1007 }
1008
1009 int Surface::pitchP(int width, Format format, bool target)
1010 {
1011 int B = bytes(format);
1012
1013 return B > 0 ? pitchB(width, format, target) / B : 0;
1014 }
1015
1016 int Surface::sliceB(int width, int height, Format format, bool target)
1017 {
1018 if(target || isDepth(format) || isStencil(format))
1019 {
1020 height = ((height + 1) & ~1);
1021 }
1022
1023 switch(format)
1024 {
1025 #if S3TC_SUPPORT
1026 case FORMAT_DXT1:
1027 case FORMAT_DXT3:
1028 case FORMAT_DXT5:
1029 return pitchB(width, format, target) * ((height + 3) / 4); // Pitch computed per 4 rows
John Bauman66b8ab22014-05-06 15:57:45 -04001030 #endif
John Bauman89401822014-05-06 15:04:28 -04001031 case FORMAT_ATI1: // Pitch computed per row
1032 case FORMAT_ATI2: // Pitch computed per row
John Bauman89401822014-05-06 15:04:28 -04001033 default:
1034 return pitchB(width, format, target) * height;
1035 }
1036 }
1037
1038 int Surface::sliceP(int width, int height, Format format, bool target)
1039 {
1040 int B = bytes(format);
1041
1042 return B > 0 ? sliceB(width, height, format, target) / B : 0;
1043 }
1044
1045 void Surface::update(Buffer &destination, Buffer &source)
1046 {
1047 // ASSERT(source.lock != LOCK_UNLOCKED);
1048 // ASSERT(destination.lock != LOCK_UNLOCKED);
1049
1050 if(destination.buffer != source.buffer)
1051 {
1052 ASSERT(source.dirty && !destination.dirty);
1053
1054 switch(source.format)
1055 {
1056 case FORMAT_R8G8B8: decodeR8G8B8(destination, source); break; // FIXME: Check destination format
1057 case FORMAT_X8B8G8R8: decodeX8B8G8R8(destination, source); break; // FIXME: Check destination format
1058 case FORMAT_A8B8G8R8: decodeA8B8G8R8(destination, source); break; // FIXME: Check destination format
1059 case FORMAT_R5G6B5: decodeR5G6B5(destination, source); break; // FIXME: Check destination format
1060 case FORMAT_X1R5G5B5: decodeX1R5G5B5(destination, source); break; // FIXME: Check destination format
1061 case FORMAT_A1R5G5B5: decodeA1R5G5B5(destination, source); break; // FIXME: Check destination format
1062 case FORMAT_X4R4G4B4: decodeX4R4G4B4(destination, source); break; // FIXME: Check destination format
1063 case FORMAT_A4R4G4B4: decodeA4R4G4B4(destination, source); break; // FIXME: Check destination format
1064 case FORMAT_P8: decodeP8(destination, source); break; // FIXME: Check destination format
1065 #if S3TC_SUPPORT
1066 case FORMAT_DXT1: decodeDXT1(destination, source); break; // FIXME: Check destination format
1067 case FORMAT_DXT3: decodeDXT3(destination, source); break; // FIXME: Check destination format
1068 case FORMAT_DXT5: decodeDXT5(destination, source); break; // FIXME: Check destination format
1069 case FORMAT_ATI1: decodeATI1(destination, source); break; // FIXME: Check destination format
1070 case FORMAT_ATI2: decodeATI2(destination, source); break; // FIXME: Check destination format
1071 #endif
1072 default: genericUpdate(destination, source); break;
1073 }
1074 }
John Bauman89401822014-05-06 15:04:28 -04001075 }
1076
1077 void Surface::genericUpdate(Buffer &destination, Buffer &source)
1078 {
1079 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1080 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1081
1082 int depth = min(destination.depth, source.depth);
1083 int height = min(destination.height, source.height);
1084 int width = min(destination.width, source.width);
1085 int rowBytes = width * source.bytes;
1086
1087 for(int z = 0; z < depth; z++)
1088 {
1089 unsigned char *sourceRow = sourceSlice;
1090 unsigned char *destinationRow = destinationSlice;
1091
1092 for(int y = 0; y < height; y++)
1093 {
1094 if(source.format == destination.format)
1095 {
1096 memcpy(destinationRow, sourceRow, rowBytes);
1097 }
1098 else
1099 {
1100 unsigned char *sourceElement = sourceRow;
1101 unsigned char *destinationElement = destinationRow;
1102
1103 for(int x = 0; x < width; x++)
1104 {
1105 Color<float> color = source.read(sourceElement);
1106 destination.write(destinationElement, color);
1107
1108 sourceElement += source.bytes;
1109 destinationElement += destination.bytes;
1110 }
1111 }
1112
1113 sourceRow += source.pitchB;
1114 destinationRow += destination.pitchB;
1115 }
1116
1117 sourceSlice += source.sliceB;
1118 destinationSlice += destination.sliceB;
1119 }
1120 }
1121
1122 void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1123 {
1124 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1125 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1126
1127 for(int z = 0; z < destination.depth && z < source.depth; z++)
1128 {
1129 unsigned char *sourceRow = sourceSlice;
1130 unsigned char *destinationRow = destinationSlice;
1131
1132 for(int y = 0; y < destination.height && y < source.height; y++)
1133 {
1134 unsigned char *sourceElement = sourceRow;
1135 unsigned char *destinationElement = destinationRow;
1136
1137 for(int x = 0; x < destination.width && x < source.width; x++)
1138 {
1139 unsigned int b = sourceElement[0];
1140 unsigned int g = sourceElement[1];
1141 unsigned int r = sourceElement[2];
1142
1143 *(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1144
1145 sourceElement += source.bytes;
1146 destinationElement += destination.bytes;
1147 }
1148
1149 sourceRow += source.pitchB;
1150 destinationRow += destination.pitchB;
1151 }
1152
1153 sourceSlice += source.sliceB;
1154 destinationSlice += destination.sliceB;
1155 }
1156 }
1157
1158 void Surface::decodeX8B8G8R8(Buffer &destination, const Buffer &source)
1159 {
1160 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1161 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1162
1163 for(int z = 0; z < destination.depth && z < source.depth; z++)
1164 {
1165 unsigned char *sourceRow = sourceSlice;
1166 unsigned char *destinationRow = destinationSlice;
1167
1168 for(int y = 0; y < destination.height && y < source.height; y++)
1169 {
1170 unsigned char *sourceElement = sourceRow;
1171 unsigned char *destinationElement = destinationRow;
1172
1173 for(int x = 0; x < destination.width && x < source.width; x++)
1174 {
1175 unsigned int r = sourceElement[0];
1176 unsigned int g = sourceElement[1];
1177 unsigned int b = sourceElement[2];
1178
1179 *(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1180
1181 sourceElement += source.bytes;
1182 destinationElement += destination.bytes;
1183 }
1184
1185 sourceRow += source.pitchB;
1186 destinationRow += destination.pitchB;
1187 }
1188
1189 sourceSlice += source.sliceB;
1190 destinationSlice += destination.sliceB;
1191 }
1192 }
1193
1194 void Surface::decodeA8B8G8R8(Buffer &destination, const Buffer &source)
1195 {
1196 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1197 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1198
1199 for(int z = 0; z < destination.depth && z < source.depth; z++)
1200 {
1201 unsigned char *sourceRow = sourceSlice;
1202 unsigned char *destinationRow = destinationSlice;
1203
1204 for(int y = 0; y < destination.height && y < source.height; y++)
1205 {
1206 unsigned char *sourceElement = sourceRow;
1207 unsigned char *destinationElement = destinationRow;
1208
1209 for(int x = 0; x < destination.width && x < source.width; x++)
1210 {
1211 unsigned int r = sourceElement[0];
1212 unsigned int g = sourceElement[1];
1213 unsigned int b = sourceElement[2];
1214 unsigned int a = sourceElement[3];
1215
1216 *(unsigned int*)destinationElement = (a << 24) | (r << 16) | (g << 8) | (b << 0);
1217
1218 sourceElement += source.bytes;
1219 destinationElement += destination.bytes;
1220 }
1221
1222 sourceRow += source.pitchB;
1223 destinationRow += destination.pitchB;
1224 }
1225
1226 sourceSlice += source.sliceB;
1227 destinationSlice += destination.sliceB;
1228 }
1229 }
1230
1231 void Surface::decodeR5G6B5(Buffer &destination, const Buffer &source)
1232 {
1233 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1234 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1235
1236 for(int z = 0; z < destination.depth && z < source.depth; z++)
1237 {
1238 unsigned char *sourceRow = sourceSlice;
1239 unsigned char *destinationRow = destinationSlice;
1240
1241 for(int y = 0; y < destination.height && y < source.height; y++)
1242 {
1243 unsigned char *sourceElement = sourceRow;
1244 unsigned char *destinationElement = destinationRow;
1245
1246 for(int x = 0; x < destination.width && x < source.width; x++)
1247 {
1248 unsigned int rgb = *(unsigned short*)sourceElement;
1249
1250 unsigned int r = (((rgb & 0xF800) * 67385 + 0x800000) >> 8) & 0x00FF0000;
1251 unsigned int g = (((rgb & 0x07E0) * 8289 + 0x8000) >> 8) & 0x0000FF00;
1252 unsigned int b = (((rgb & 0x001F) * 2106 + 0x80) >> 8);
1253
1254 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1255
1256 sourceElement += source.bytes;
1257 destinationElement += destination.bytes;
1258 }
1259
1260 sourceRow += source.pitchB;
1261 destinationRow += destination.pitchB;
1262 }
1263
1264 sourceSlice += source.sliceB;
1265 destinationSlice += destination.sliceB;
1266 }
1267 }
1268
1269 void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1270 {
1271 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1272 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1273
1274 for(int z = 0; z < destination.depth && z < source.depth; z++)
1275 {
1276 unsigned char *sourceRow = sourceSlice;
1277 unsigned char *destinationRow = destinationSlice;
1278
1279 for(int y = 0; y < destination.height && y < source.height; y++)
1280 {
1281 unsigned char *sourceElement = sourceRow;
1282 unsigned char *destinationElement = destinationRow;
1283
1284 for(int x = 0; x < destination.width && x < source.width; x++)
1285 {
1286 unsigned int xrgb = *(unsigned short*)sourceElement;
1287
1288 unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1289 unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1290 unsigned int b = (((xrgb & 0x001F) * 2106 + 0x80) >> 8);
1291
1292 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1293
1294 sourceElement += source.bytes;
1295 destinationElement += destination.bytes;
1296 }
1297
1298 sourceRow += source.pitchB;
1299 destinationRow += destination.pitchB;
1300 }
1301
1302 sourceSlice += source.sliceB;
1303 destinationSlice += destination.sliceB;
1304 }
1305 }
1306
1307 void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1308 {
1309 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1310 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1311
1312 for(int z = 0; z < destination.depth && z < source.depth; z++)
1313 {
1314 unsigned char *sourceRow = sourceSlice;
1315 unsigned char *destinationRow = destinationSlice;
1316
1317 for(int y = 0; y < destination.height && y < source.height; y++)
1318 {
1319 unsigned char *sourceElement = sourceRow;
1320 unsigned char *destinationElement = destinationRow;
1321
1322 for(int x = 0; x < destination.width && x < source.width; x++)
1323 {
1324 unsigned int argb = *(unsigned short*)sourceElement;
1325
1326 unsigned int a = (argb & 0x8000) * 130560;
1327 unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1328 unsigned int g = (((argb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1329 unsigned int b = (((argb & 0x001F) * 2106 + 0x80) >> 8);
1330
1331 *(unsigned int*)destinationElement = a | r | g | b;
1332
1333 sourceElement += source.bytes;
1334 destinationElement += destination.bytes;
1335 }
1336
1337 sourceRow += source.pitchB;
1338 destinationRow += destination.pitchB;
1339 }
1340
1341 sourceSlice += source.sliceB;
1342 destinationSlice += destination.sliceB;
1343 }
1344 }
1345
1346 void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1347 {
1348 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1349 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1350
1351 for(int z = 0; z < destination.depth && z < source.depth; z++)
1352 {
1353 unsigned char *sourceRow = sourceSlice;
1354 unsigned char *destinationRow = destinationSlice;
1355
1356 for(int y = 0; y < destination.height && y < source.height; y++)
1357 {
1358 unsigned char *sourceElement = sourceRow;
1359 unsigned char *destinationElement = destinationRow;
1360
1361 for(int x = 0; x < destination.width && x < source.width; x++)
1362 {
1363 unsigned int xrgb = *(unsigned short*)sourceElement;
1364
1365 unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
1366 unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
1367 unsigned int b = (xrgb & 0x000F) * 0x00000011;
1368
1369 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1370
1371 sourceElement += source.bytes;
1372 destinationElement += destination.bytes;
1373 }
1374
1375 sourceRow += source.pitchB;
1376 destinationRow += destination.pitchB;
1377 }
1378
1379 sourceSlice += source.sliceB;
1380 destinationSlice += destination.sliceB;
1381 }
1382 }
1383
1384 void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
1385 {
1386 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1387 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1388
1389 for(int z = 0; z < destination.depth && z < source.depth; z++)
1390 {
1391 unsigned char *sourceRow = sourceSlice;
1392 unsigned char *destinationRow = destinationSlice;
1393
1394 for(int y = 0; y < destination.height && y < source.height; y++)
1395 {
1396 unsigned char *sourceElement = sourceRow;
1397 unsigned char *destinationElement = destinationRow;
1398
1399 for(int x = 0; x < destination.width && x < source.width; x++)
1400 {
1401 unsigned int argb = *(unsigned short*)sourceElement;
1402
1403 unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
1404 unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
1405 unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
1406 unsigned int b = (argb & 0x000F) * 0x00000011;
1407
1408 *(unsigned int*)destinationElement = a | r | g | b;
1409
1410 sourceElement += source.bytes;
1411 destinationElement += destination.bytes;
1412 }
1413
1414 sourceRow += source.pitchB;
1415 destinationRow += destination.pitchB;
1416 }
1417
1418 sourceSlice += source.sliceB;
1419 destinationSlice += destination.sliceB;
1420 }
1421 }
1422
1423 void Surface::decodeP8(Buffer &destination, const Buffer &source)
1424 {
1425 unsigned char *sourceSlice = (unsigned char*)source.buffer;
1426 unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1427
1428 for(int z = 0; z < destination.depth && z < source.depth; z++)
1429 {
1430 unsigned char *sourceRow = sourceSlice;
1431 unsigned char *destinationRow = destinationSlice;
1432
1433 for(int y = 0; y < destination.height && y < source.height; y++)
1434 {
1435 unsigned char *sourceElement = sourceRow;
1436 unsigned char *destinationElement = destinationRow;
1437
1438 for(int x = 0; x < destination.width && x < source.width; x++)
1439 {
1440 unsigned int abgr = palette[*(unsigned char*)sourceElement];
1441
1442 unsigned int r = (abgr & 0x000000FF) << 16;
1443 unsigned int g = (abgr & 0x0000FF00) << 0;
1444 unsigned int b = (abgr & 0x00FF0000) >> 16;
1445 unsigned int a = (abgr & 0xFF000000) >> 0;
1446
1447 *(unsigned int*)destinationElement = a | r | g | b;
1448
1449 sourceElement += source.bytes;
1450 destinationElement += destination.bytes;
1451 }
1452
1453 sourceRow += source.pitchB;
1454 destinationRow += destination.pitchB;
1455 }
1456
1457 sourceSlice += source.sliceB;
1458 destinationSlice += destination.sliceB;
1459 }
1460 }
1461
1462#if S3TC_SUPPORT
1463 void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
1464 {
1465 unsigned int *destSlice = (unsigned int*)internal.buffer;
1466 DXT1 *source = (DXT1*)external.buffer;
1467
1468 for(int z = 0; z < external.depth; z++)
1469 {
1470 unsigned int *dest = destSlice;
1471
1472 for(int y = 0; y < external.height; y += 4)
1473 {
1474 for(int x = 0; x < external.width; x += 4)
1475 {
1476 Color<byte> c[4];
1477
1478 c[0] = source->c0;
1479 c[1] = source->c1;
1480
1481 if(source->c0 > source->c1) // No transparency
1482 {
1483 // c2 = 2 / 3 * c0 + 1 / 3 * c1
1484 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1485 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1486 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1487 c[2].a = 0xFF;
1488
1489 // c3 = 1 / 3 * c0 + 2 / 3 * c1
1490 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1491 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1492 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1493 c[3].a = 0xFF;
1494 }
1495 else // c3 transparent
1496 {
1497 // c2 = 1 / 2 * c0 + 1 / 2 * c1
1498 c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
1499 c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
1500 c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
1501 c[2].a = 0xFF;
1502
1503 c[3].r = 0;
1504 c[3].g = 0;
1505 c[3].b = 0;
1506 c[3].a = 0;
1507 }
1508
1509 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1510 {
1511 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1512 {
1513 dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
1514 }
1515 }
1516
1517 source++;
1518 }
1519 }
1520
1521 (byte*&)destSlice += internal.sliceB;
1522 }
1523 }
1524
1525 void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
1526 {
1527 unsigned int *destSlice = (unsigned int*)internal.buffer;
1528 DXT3 *source = (DXT3*)external.buffer;
1529
1530 for(int z = 0; z < external.depth; z++)
1531 {
1532 unsigned int *dest = destSlice;
1533
1534 for(int y = 0; y < external.height; y += 4)
1535 {
1536 for(int x = 0; x < external.width; x += 4)
1537 {
1538 Color<byte> c[4];
1539
1540 c[0] = source->c0;
1541 c[1] = source->c1;
1542
1543 // c2 = 2 / 3 * c0 + 1 / 3 * c1
1544 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1545 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1546 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1547
1548 // c3 = 1 / 3 * c0 + 2 / 3 * c1
1549 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1550 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1551 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1552
1553 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1554 {
1555 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1556 {
1557 unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
1558 unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
1559
1560 dest[(x + i) + (y + j) * internal.width] = color;
1561 }
1562 }
1563
1564 source++;
1565 }
1566 }
1567
1568 (byte*&)destSlice += internal.sliceB;
1569 }
1570 }
1571
1572 void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
1573 {
1574 unsigned int *destSlice = (unsigned int*)internal.buffer;
1575 DXT5 *source = (DXT5*)external.buffer;
1576
1577 for(int z = 0; z < external.depth; z++)
1578 {
1579 unsigned int *dest = destSlice;
1580
1581 for(int y = 0; y < external.height; y += 4)
1582 {
1583 for(int x = 0; x < external.width; x += 4)
1584 {
1585 Color<byte> c[4];
1586
1587 c[0] = source->c0;
1588 c[1] = source->c1;
1589
1590 // c2 = 2 / 3 * c0 + 1 / 3 * c1
1591 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1592 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1593 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1594
1595 // c3 = 1 / 3 * c0 + 2 / 3 * c1
1596 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1597 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1598 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1599
1600 byte a[8];
1601
1602 a[0] = source->a0;
1603 a[1] = source->a1;
1604
1605 if(a[0] > a[1])
1606 {
1607 a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
1608 a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
1609 a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
1610 a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
1611 a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
1612 a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
1613 }
1614 else
1615 {
1616 a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
1617 a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
1618 a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
1619 a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
1620 a[6] = 0;
1621 a[7] = 0xFF;
1622 }
1623
1624 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1625 {
1626 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1627 {
1628 unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
1629 unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
1630
1631 dest[(x + i) + (y + j) * internal.width] = color;
1632 }
1633 }
1634
1635 source++;
1636 }
1637 }
1638
1639 (byte*&)destSlice += internal.sliceB;
1640 }
1641 }
1642
1643 void Surface::decodeATI1(Buffer &internal, const Buffer &external)
1644 {
1645 byte *destSlice = (byte*)internal.buffer;
1646 ATI1 *source = (ATI1*)external.buffer;
1647
1648 for(int z = 0; z < external.depth; z++)
1649 {
1650 byte *dest = destSlice;
1651
1652 for(int y = 0; y < external.height; y += 4)
1653 {
1654 for(int x = 0; x < external.width; x += 4)
1655 {
1656 byte r[8];
1657
1658 r[0] = source->r0;
1659 r[1] = source->r1;
1660
1661 if(r[0] > r[1])
1662 {
1663 r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
1664 r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
1665 r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
1666 r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
1667 r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
1668 r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
1669 }
1670 else
1671 {
1672 r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
1673 r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
1674 r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
1675 r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
1676 r[6] = 0;
1677 r[7] = 0xFF;
1678 }
1679
1680 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1681 {
1682 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1683 {
1684 dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
1685 }
1686 }
1687
1688 source++;
1689 }
1690 }
1691
1692 destSlice += internal.sliceB;
1693 }
1694 }
1695
1696 void Surface::decodeATI2(Buffer &internal, const Buffer &external)
1697 {
1698 word *destSlice = (word*)internal.buffer;
1699 ATI2 *source = (ATI2*)external.buffer;
1700
1701 for(int z = 0; z < external.depth; z++)
1702 {
1703 word *dest = destSlice;
1704
1705 for(int y = 0; y < external.height; y += 4)
1706 {
1707 for(int x = 0; x < external.width; x += 4)
1708 {
1709 byte X[8];
1710
1711 X[0] = source->x0;
1712 X[1] = source->x1;
1713
1714 if(X[0] > X[1])
1715 {
1716 X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
1717 X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
1718 X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
1719 X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
1720 X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
1721 X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
1722 }
1723 else
1724 {
1725 X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
1726 X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
1727 X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
1728 X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
1729 X[6] = 0;
1730 X[7] = 0xFF;
1731 }
1732
1733 byte Y[8];
1734
1735 Y[0] = source->y0;
1736 Y[1] = source->y1;
1737
1738 if(Y[0] > Y[1])
1739 {
1740 Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
1741 Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
1742 Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
1743 Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
1744 Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
1745 Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
1746 }
1747 else
1748 {
1749 Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
1750 Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
1751 Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
1752 Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
1753 Y[6] = 0;
1754 Y[7] = 0xFF;
1755 }
1756
1757 for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1758 {
1759 for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1760 {
1761 word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
1762 word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
1763
1764 dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
1765 }
1766 }
1767
1768 source++;
1769 }
1770 }
1771
1772 (byte*&)destSlice += internal.sliceB;
1773 }
1774 }
1775#endif
1776
1777 unsigned int Surface::size(int width, int height, int depth, Format format)
1778 {
1779 // Dimensions rounded up to multiples of 4, used for DXTC formats
1780 int width4 = (width + 3) & ~3;
1781 int height4 = (height + 3) & ~3;
1782
1783 switch(format)
1784 {
1785 #if S3TC_SUPPORT
1786 case FORMAT_DXT1:
John Bauman66b8ab22014-05-06 15:57:45 -04001787 #endif
John Bauman89401822014-05-06 15:04:28 -04001788 case FORMAT_ATI1:
1789 return width4 * height4 * depth / 2;
John Bauman66b8ab22014-05-06 15:57:45 -04001790 #if S3TC_SUPPORT
John Bauman89401822014-05-06 15:04:28 -04001791 case FORMAT_DXT3:
1792 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04001793 #endif
John Bauman89401822014-05-06 15:04:28 -04001794 case FORMAT_ATI2:
1795 return width4 * height4 * depth;
John Bauman89401822014-05-06 15:04:28 -04001796 default:
1797 return bytes(format) * width * height * depth;
1798 }
1799
1800 return 0;
1801 }
1802
1803 bool Surface::isStencil(Format format)
1804 {
1805 switch(format)
1806 {
1807 case FORMAT_D32:
1808 case FORMAT_D16:
1809 case FORMAT_D24X8:
1810 case FORMAT_D32F:
1811 case FORMAT_D32F_COMPLEMENTARY:
1812 case FORMAT_D32F_LOCKABLE:
1813 return false;
1814 case FORMAT_D24S8:
1815 case FORMAT_D24FS8:
1816 case FORMAT_S8:
John Bauman66b8ab22014-05-06 15:57:45 -04001817 case FORMAT_DF24S8:
1818 case FORMAT_DF16S8:
1819 case FORMAT_D32FS8_TEXTURE:
1820 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04001821 case FORMAT_INTZ:
1822 return true;
1823 default:
1824 return false;
1825 }
1826 }
1827
1828 bool Surface::isDepth(Format format)
1829 {
1830 switch(format)
1831 {
1832 case FORMAT_D32:
1833 case FORMAT_D16:
1834 case FORMAT_D24X8:
1835 case FORMAT_D24S8:
1836 case FORMAT_D24FS8:
1837 case FORMAT_D32F:
1838 case FORMAT_D32F_COMPLEMENTARY:
1839 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -04001840 case FORMAT_DF24S8:
1841 case FORMAT_DF16S8:
1842 case FORMAT_D32FS8_TEXTURE:
1843 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04001844 case FORMAT_INTZ:
1845 return true;
1846 case FORMAT_S8:
1847 return false;
1848 default:
1849 return false;
1850 }
1851 }
1852
1853 bool Surface::isPalette(Format format)
1854 {
1855 switch(format)
1856 {
1857 case FORMAT_P8:
1858 case FORMAT_A8P8:
1859 return true;
1860 default:
1861 return false;
1862 }
1863 }
1864
1865 bool Surface::isFloatFormat(Format format)
1866 {
1867 switch(format)
1868 {
1869 case FORMAT_X8R8G8B8:
1870 case FORMAT_A8R8G8B8:
1871 case FORMAT_G8R8:
1872 case FORMAT_G16R16:
1873 case FORMAT_A16B16G16R16:
1874 case FORMAT_V8U8:
1875 case FORMAT_Q8W8V8U8:
1876 case FORMAT_X8L8V8U8:
1877 case FORMAT_V16U16:
1878 case FORMAT_A16W16V16U16:
1879 case FORMAT_Q16W16V16U16:
1880 case FORMAT_A8:
1881 case FORMAT_R8:
1882 case FORMAT_L8:
1883 case FORMAT_L16:
1884 case FORMAT_A8L8:
1885 return false;
1886 case FORMAT_R32F:
1887 case FORMAT_G32R32F:
1888 case FORMAT_A32B32G32R32F:
1889 case FORMAT_D32F:
1890 case FORMAT_D32F_COMPLEMENTARY:
1891 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -04001892 case FORMAT_D32FS8_TEXTURE:
1893 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04001894 return true;
1895 default:
1896 ASSERT(false);
1897 }
1898
1899 return false;
1900 }
1901
1902 bool Surface::isUnsignedComponent(Format format, int component)
1903 {
1904 switch(format)
1905 {
1906 case FORMAT_NULL:
1907 case FORMAT_X8R8G8B8:
1908 case FORMAT_A8R8G8B8:
1909 case FORMAT_G8R8:
1910 case FORMAT_G16R16:
1911 case FORMAT_A16B16G16R16:
1912 case FORMAT_D32F:
1913 case FORMAT_D32F_COMPLEMENTARY:
1914 case FORMAT_D32F_LOCKABLE:
John Bauman66b8ab22014-05-06 15:57:45 -04001915 case FORMAT_D32FS8_TEXTURE:
1916 case FORMAT_D32FS8_SHADOW:
John Bauman89401822014-05-06 15:04:28 -04001917 case FORMAT_A8:
1918 case FORMAT_R8:
1919 case FORMAT_L8:
1920 case FORMAT_L16:
1921 case FORMAT_A8L8:
1922 return true;
1923 case FORMAT_V8U8:
1924 case FORMAT_X8L8V8U8:
1925 case FORMAT_V16U16:
1926 if(component < 2)
1927 {
1928 return false;
1929 }
1930 else
1931 {
1932 return true;
1933 }
1934 case FORMAT_A16W16V16U16:
1935 if(component < 3)
1936 {
1937 return false;
1938 }
1939 else
1940 {
1941 return true;
1942 }
1943 case FORMAT_Q8W8V8U8:
1944 case FORMAT_Q16W16V16U16:
1945 return false;
1946 case FORMAT_R32F:
1947 if(component < 1)
1948 {
1949 return false;
1950 }
1951 else
1952 {
1953 return true;
1954 }
1955 case FORMAT_G32R32F:
1956 if(component < 2)
1957 {
1958 return false;
1959 }
1960 else
1961 {
1962 return true;
1963 }
1964 case FORMAT_A32B32G32R32F:
1965 return false;
1966 default:
1967 ASSERT(false);
1968 }
1969
1970 return false;
1971 }
1972
1973 bool Surface::isSRGBreadable(Format format)
1974 {
1975 // Keep in sync with Capabilities::isSRGBreadable
1976 switch(format)
1977 {
1978 case FORMAT_L8:
1979 case FORMAT_A8L8:
1980 case FORMAT_R8G8B8:
1981 case FORMAT_A8R8G8B8:
1982 case FORMAT_X8R8G8B8:
1983 case FORMAT_A8B8G8R8:
1984 case FORMAT_X8B8G8R8:
1985 case FORMAT_R5G6B5:
1986 case FORMAT_X1R5G5B5:
1987 case FORMAT_A1R5G5B5:
1988 case FORMAT_A4R4G4B4:
1989 #if S3TC_SUPPORT
1990 case FORMAT_DXT1:
1991 case FORMAT_DXT3:
1992 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04001993 #endif
John Bauman89401822014-05-06 15:04:28 -04001994 case FORMAT_ATI1:
1995 case FORMAT_ATI2:
John Bauman89401822014-05-06 15:04:28 -04001996 return true;
1997 default:
1998 return false;
1999 }
2000
2001 return false;
2002 }
2003
2004 bool Surface::isSRGBwritable(Format format)
2005 {
2006 // Keep in sync with Capabilities::isSRGBwritable
2007 switch(format)
2008 {
2009 case FORMAT_NULL:
2010 case FORMAT_A8R8G8B8:
2011 case FORMAT_X8R8G8B8:
2012 case FORMAT_A8B8G8R8:
2013 case FORMAT_X8B8G8R8:
2014 case FORMAT_R5G6B5:
2015 return true;
2016 default:
2017 return false;
2018 }
2019 }
2020
2021 bool Surface::isCompressed(Format format)
2022 {
2023 switch(format)
2024 {
2025 #if S3TC_SUPPORT
2026 case FORMAT_DXT1:
2027 case FORMAT_DXT3:
2028 case FORMAT_DXT5:
John Bauman66b8ab22014-05-06 15:57:45 -04002029 #endif
John Bauman89401822014-05-06 15:04:28 -04002030 case FORMAT_ATI1:
2031 case FORMAT_ATI2:
2032 return true;
John Bauman89401822014-05-06 15:04:28 -04002033 default:
2034 return false;
2035 }
2036 }
2037
2038 int Surface::componentCount(Format format)
2039 {
2040 switch(format)
2041 {
2042 case FORMAT_X8R8G8B8: return 3;
2043 case FORMAT_A8R8G8B8: return 4;
2044 case FORMAT_G8R8: return 2;
2045 case FORMAT_G16R16: return 2;
2046 case FORMAT_A16B16G16R16: return 4;
2047 case FORMAT_V8U8: return 2;
2048 case FORMAT_Q8W8V8U8: return 4;
2049 case FORMAT_X8L8V8U8: return 3;
2050 case FORMAT_V16U16: return 2;
2051 case FORMAT_A16W16V16U16: return 4;
2052 case FORMAT_Q16W16V16U16: return 4;
2053 case FORMAT_R32F: return 1;
2054 case FORMAT_G32R32F: return 2;
2055 case FORMAT_A32B32G32R32F: return 4;
2056 case FORMAT_D32F_LOCKABLE: return 1;
John Bauman66b8ab22014-05-06 15:57:45 -04002057 case FORMAT_D32FS8_TEXTURE: return 1;
2058 case FORMAT_D32FS8_SHADOW: return 1;
John Bauman89401822014-05-06 15:04:28 -04002059 case FORMAT_A8: return 1;
2060 case FORMAT_R8: return 1;
2061 case FORMAT_L8: return 1;
2062 case FORMAT_L16: return 1;
2063 case FORMAT_A8L8: return 2;
2064 default:
2065 ASSERT(false);
2066 }
2067
2068 return 1;
2069 }
2070
2071 void *Surface::allocateBuffer(int width, int height, int depth, Format format)
2072 {
2073 int width4 = (width + 3) & ~3;
2074 int height4 = (height + 3) & ~3;
2075
John Bauman19bac1e2014-05-06 15:23:49 -04002076 return allocate(size(width4, height4, depth, format));
John Bauman89401822014-05-06 15:04:28 -04002077 }
2078
2079 void Surface::memfill(void *buffer, int pattern, int bytes)
2080 {
2081 while((size_t)buffer & 0x1 && bytes >= 1)
2082 {
2083 *(char*)buffer = (char)pattern;
2084 (char*&)buffer += 1;
2085 bytes -= 1;
2086 }
2087
2088 while((size_t)buffer & 0x3 && bytes >= 2)
2089 {
2090 *(short*)buffer = (short)pattern;
2091 (short*&)buffer += 1;
2092 bytes -= 2;
2093 }
2094
2095 if(CPUID::supportsSSE())
2096 {
2097 while((size_t)buffer & 0xF && bytes >= 4)
2098 {
2099 *(int*)buffer = pattern;
2100 (int*&)buffer += 1;
2101 bytes -= 4;
2102 }
2103
2104 __m128 quad = _mm_set_ps1((float&)pattern);
2105
2106 float *pointer = (float*)buffer;
2107 int qxwords = bytes / 64;
2108 bytes -= qxwords * 64;
2109
2110 while(qxwords--)
2111 {
2112 _mm_stream_ps(pointer + 0, quad);
2113 _mm_stream_ps(pointer + 4, quad);
2114 _mm_stream_ps(pointer + 8, quad);
2115 _mm_stream_ps(pointer + 12, quad);
2116
2117 pointer += 16;
2118 }
2119
2120 buffer = pointer;
2121 }
2122
2123 while(bytes >= 4)
2124 {
2125 *(int*)buffer = (int)pattern;
2126 (int*&)buffer += 1;
2127 bytes -= 4;
2128 }
2129
2130 while(bytes >= 2)
2131 {
2132 *(short*)buffer = (short)pattern;
2133 (short*&)buffer += 1;
2134 bytes -= 2;
2135 }
2136
2137 while(bytes >= 1)
2138 {
2139 *(char*)buffer = (char)pattern;
2140 (char*&)buffer += 1;
2141 bytes -= 1;
2142 }
2143 }
2144
2145 void Surface::clearColorBuffer(unsigned int color, unsigned int rgbaMask, int x0, int y0, int width, int height)
2146 {
2147 // FIXME: Also clear buffers in other formats?
2148
2149 // Not overlapping
2150 if(x0 > internal.width) return;
2151 if(y0 > internal.height) return;
2152 if(x0 + width < 0) return;
2153 if(y0 + height < 0) return;
2154
2155 // Clip against dimensions
2156 if(x0 < 0) {width += x0; x0 = 0;}
2157 if(x0 + width > internal.width) width = internal.width - x0;
2158 if(y0 < 0) {height += y0; y0 = 0;}
2159 if(y0 + height > internal.height) height = internal.height - y0;
2160
2161 const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2162 const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2163
2164 int width2 = (internal.width + 1) & ~1;
2165
2166 int x1 = x0 + width;
2167 int y1 = y0 + height;
2168
2169 int bytes = 4 * (x1 - x0);
2170
2171 // if(lockable || !quadLayoutEnabled)
2172 {
2173 unsigned char *buffer = (unsigned char*)lockInternal(x0, y0, 0, lock, PUBLIC);
2174
2175 unsigned char r8 = (color & 0x00FF0000) >> 16;
2176 unsigned char g8 = (color & 0x0000FF00) >> 8;
2177 unsigned char b8 = (color & 0x000000FF) >> 0;
2178 unsigned char a8 = (color & 0xFF000000) >> 24;
2179
2180 unsigned short r16 = (r8 << 8) + r8;
2181 unsigned short g16 = (g8 << 8) + g8;
2182 unsigned short b16 = (b8 << 8) + b8;
2183 unsigned short a16 = (a8 << 8) + a8;
2184
2185 float r32f = r8 / 255.0f;
2186 float g32f = g8 / 255.0f;
2187 float b32f = b8 / 255.0f;
2188 float a32f = a8 / 255.0f;
2189
2190 unsigned char g8r8[4] = {r8, g8, r8, g8};
2191 unsigned short g16r16[2] = {r16, g16};
2192
2193 for(int z = 0; z < internal.depth; z++)
2194 {
2195 unsigned char *target = buffer;
2196
2197 for(int y = y0; y < y1; y++)
2198 {
2199 switch(internal.format)
2200 {
2201 case FORMAT_NULL:
2202 break;
2203 case FORMAT_X8R8G8B8:
2204 case FORMAT_A8R8G8B8:
2205 // case FORMAT_X8G8R8B8Q: // FIXME
2206 // case FORMAT_A8G8R8B8Q: // FIXME
John Bauman19bac1e2014-05-06 15:23:49 -04002207 if(rgbaMask == 0xF || (internal.format == FORMAT_X8R8G8B8 && rgbaMask == 0x7))
John Bauman89401822014-05-06 15:04:28 -04002208 {
2209 memfill(target, color, 4 * (x1 - x0));
2210 }
2211 else
2212 {
2213 unsigned int bgraMask = (rgbaMask & 0x1 ? 0x00FF0000 : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x000000FF : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
2214 unsigned int invMask = ~bgraMask;
2215 unsigned int maskedColor = color & bgraMask;
2216 unsigned int *target32 = (unsigned int*)target;
2217
2218 for(int x = 0; x < width; x++)
2219 {
2220 target32[x] = maskedColor | (target32[x] & invMask);
2221 }
2222 }
2223 break;
2224 case FORMAT_G8R8:
2225 if((rgbaMask & 0x3) == 0x3)
2226 {
2227 memfill(target, (int&)g8r8, 2 * (x1 - x0));
2228 }
2229 else
2230 {
2231 unsigned short rgMask = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0);
2232 unsigned short invMask = ~rgMask;
2233 unsigned short maskedColor = (unsigned short&)g8r8 & rgMask;
2234 unsigned short *target16 = (unsigned short*)target;
2235
2236 for(int x = 0; x < width; x++)
2237 {
2238 target16[x] = maskedColor | (target16[x] & invMask);
2239 }
2240 }
2241 break;
2242 case FORMAT_G16R16:
2243 if((rgbaMask & 0x3) == 0x3)
2244 {
2245 memfill(target, (int&)g16r16, 4 * (x1 - x0));
2246 }
2247 else
2248 {
2249 unsigned int rgMask = (rgbaMask & 0x1 ? 0x0000FFFF : 0) | (rgbaMask & 0x2 ? 0xFFFF0000 : 0);
2250 unsigned int invMask = ~rgMask;
2251 unsigned int maskedColor = (unsigned int&)g16r16 & rgMask;
2252 unsigned int *target32 = (unsigned int*)target;
2253
2254 for(int x = 0; x < width; x++)
2255 {
2256 target32[x] = maskedColor | (target32[x] & invMask);
2257 }
2258 }
2259 break;
2260 case FORMAT_A16B16G16R16:
2261 if(rgbaMask == 0xF)
2262 {
2263 for(int x = 0; x < width; x++)
2264 {
2265 ((unsigned short*)target)[4 * x + 0] = r16;
2266 ((unsigned short*)target)[4 * x + 1] = g16;
2267 ((unsigned short*)target)[4 * x + 2] = b16;
2268 ((unsigned short*)target)[4 * x + 3] = a16;
2269 }
2270 }
2271 else
2272 {
2273 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 0] = r16;
2274 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 1] = g16;
2275 if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 2] = b16;
2276 if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 3] = a16;
2277 }
2278 break;
2279 case FORMAT_R32F:
2280 if(rgbaMask & 0x1)
2281 {
2282 for(int x = 0; x < width; x++)
2283 {
2284 ((float*)target)[x] = r32f;
2285 }
2286 }
2287 break;
2288 case FORMAT_G32R32F:
2289 if((rgbaMask & 0x3) == 0x3)
2290 {
2291 for(int x = 0; x < width; x++)
2292 {
2293 ((float*)target)[2 * x + 0] = r32f;
2294 ((float*)target)[2 * x + 1] = g32f;
2295 }
2296 }
2297 else
2298 {
2299 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 0] = r32f;
2300 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 1] = g32f;
2301 }
2302 break;
2303 case FORMAT_A32B32G32R32F:
2304 if(rgbaMask == 0xF)
2305 {
2306 for(int x = 0; x < width; x++)
2307 {
2308 ((float*)target)[4 * x + 0] = r32f;
2309 ((float*)target)[4 * x + 1] = g32f;
2310 ((float*)target)[4 * x + 2] = b32f;
2311 ((float*)target)[4 * x + 3] = a32f;
2312 }
2313 }
2314 else
2315 {
2316 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 0] = r32f;
2317 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 1] = g32f;
2318 if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 2] = b32f;
2319 if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 3] = a32f;
2320 }
2321 break;
2322 default:
2323 ASSERT(false);
2324 }
2325
2326 target += internal.pitchB;
2327 }
2328
2329 buffer += internal.sliceB;
2330 }
2331
2332 unlockInternal();
2333 }
2334 /* else
2335 {
2336 // unsigned char *target = (unsigned char*&)buffer;
2337 //
2338 // for(int y = y0; y < y1; y++)
2339 // {
2340 // for(int x = x0; x < x1; x++)
2341 // {
2342 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 0] = (color & 0x000000FF) >> 0;
2343 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 4] = (color & 0x00FF0000) >> 16;
2344 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 8] = (color & 0x0000FF00) >> 8;
2345 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2346 // }
2347 // }
2348
2349 unsigned char colorQ[16];
2350
2351 colorQ[0] = (color & 0x000000FF) >> 0;
2352 colorQ[1] = (color & 0x000000FF) >> 0;
2353 colorQ[2] = (color & 0x000000FF) >> 0;
2354 colorQ[3] = (color & 0x000000FF) >> 0;
2355 colorQ[4] = (color & 0x00FF0000) >> 16;
2356 colorQ[5] = (color & 0x00FF0000) >> 16;
2357 colorQ[6] = (color & 0x00FF0000) >> 16;
2358 colorQ[7] = (color & 0x00FF0000) >> 16;
2359 colorQ[8] = (color & 0x0000FF00) >> 8;
2360 colorQ[9] = (color & 0x0000FF00) >> 8;
2361 colorQ[10] = (color & 0x0000FF00) >> 8;
2362 colorQ[11] = (color & 0x0000FF00) >> 8;
2363 colorQ[12] = (color & 0xFF000000) >> 24;
2364 colorQ[13] = (color & 0xFF000000) >> 24;
2365 colorQ[14] = (color & 0xFF000000) >> 24;
2366 colorQ[15] = (color & 0xFF000000) >> 24;
2367
2368 for(int y = y0; y < y1; y++)
2369 {
2370 unsigned char *target = (unsigned char*)lockInternal(0, 0, 0, lock) + width2 * 4 * (y & ~1) + 2 * (y & 1); // FIXME: Unlock
2371
2372 if((y & 1) == 0 && y + 1 < y1) // Fill quad line at once
2373 {
2374 if((x0 & 1) != 0)
2375 {
2376 target[8 * (x0 & ~1) + 1 + 0] = (color & 0x000000FF) >> 0;
2377 target[8 * (x0 & ~1) + 1 + 4] = (color & 0x00FF0000) >> 16;
2378 target[8 * (x0 & ~1) + 1 + 8] = (color & 0x0000FF00) >> 8;
2379 target[8 * (x0 & ~1) + 1 + 12] = (color & 0xFF000000) >> 24;
2380
2381 target[8 * (x0 & ~1) + 3 + 0] = (color & 0x000000FF) >> 0;
2382 target[8 * (x0 & ~1) + 3 + 4] = (color & 0x00FF0000) >> 16;
2383 target[8 * (x0 & ~1) + 3 + 8] = (color & 0x0000FF00) >> 8;
2384 target[8 * (x0 & ~1) + 3 + 12] = (color & 0xFF000000) >> 24;
2385 }
2386
2387 __asm
2388 {
2389 movq mm0, colorQ+0
2390 movq mm1, colorQ+8
2391
2392 mov eax, x0
2393 add eax, 1
2394 and eax, 0xFFFFFFFE
2395 cmp eax, x1
2396 jge qEnd
2397
2398 mov edi, target
2399
2400 qLoop:
2401 movntq [edi+8*eax+0], mm0
2402 movntq [edi+8*eax+8], mm1
2403
2404 add eax, 2
2405 cmp eax, x1
2406 jl qLoop
2407 qEnd:
2408 emms
2409 }
2410
2411 if((x1 & 1) != 0)
2412 {
2413 target[8 * (x1 & ~1) + 0 + 0] = (color & 0x000000FF) >> 0;
2414 target[8 * (x1 & ~1) + 0 + 4] = (color & 0x00FF0000) >> 16;
2415 target[8 * (x1 & ~1) + 0 + 8] = (color & 0x0000FF00) >> 8;
2416 target[8 * (x1 & ~1) + 0 + 12] = (color & 0xFF000000) >> 24;
2417
2418 target[8 * (x1 & ~1) + 2 + 0] = (color & 0x000000FF) >> 0;
2419 target[8 * (x1 & ~1) + 2 + 4] = (color & 0x00FF0000) >> 16;
2420 target[8 * (x1 & ~1) + 2 + 8] = (color & 0x0000FF00) >> 8;
2421 target[8 * (x1 & ~1) + 2 + 12] = (color & 0xFF000000) >> 24;
2422 }
2423
2424 y++;
2425 }
2426 else
2427 {
2428 for(int x = x0; x < x1; x++)
2429 {
2430 target[8 * (x & ~1) + (x & 1) + 0] = (color & 0x000000FF) >> 0;
2431 target[8 * (x & ~1) + (x & 1) + 4] = (color & 0x00FF0000) >> 16;
2432 target[8 * (x & ~1) + (x & 1) + 8] = (color & 0x0000FF00) >> 8;
2433 target[8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2434 }
2435 }
2436 }
2437 }*/
2438 }
2439
2440 void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height)
2441 {
2442 // Not overlapping
2443 if(x0 > internal.width) return;
2444 if(y0 > internal.height) return;
2445 if(x0 + width < 0) return;
2446 if(y0 + height < 0) return;
2447
2448 // Clip against dimensions
2449 if(x0 < 0) {width += x0; x0 = 0;}
2450 if(x0 + width > internal.width) width = internal.width - x0;
2451 if(y0 < 0) {height += y0; y0 = 0;}
2452 if(y0 + height > internal.height) height = internal.height - y0;
2453
2454 const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2455 const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2456
2457 int width2 = (internal.width + 1) & ~1;
2458
2459 int x1 = x0 + width;
2460 int y1 = y0 + height;
2461
2462 if(internal.format == FORMAT_D32F_LOCKABLE ||
John Bauman66b8ab22014-05-06 15:57:45 -04002463 internal.format == FORMAT_D32FS8_TEXTURE ||
2464 internal.format == FORMAT_D32FS8_SHADOW)
John Bauman89401822014-05-06 15:04:28 -04002465 {
2466 float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
2467
2468 for(int z = 0; z < internal.depth; z++)
2469 {
2470 for(int y = y0; y < y1; y++)
2471 {
2472 memfill(target, (int&)depth, 4 * width);
2473 target += width2;
2474 }
2475 }
2476
2477 unlockInternal();
2478 }
2479 else // Quad layout
2480 {
2481 if(complementaryDepthBuffer)
2482 {
2483 depth = 1 - depth;
2484 }
2485
2486 float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
2487
2488 for(int z = 0; z < internal.depth; z++)
2489 {
2490 for(int y = y0; y < y1; y++)
2491 {
2492 float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
2493
2494 if((y & 1) == 0 && y + 1 < y1) // Fill quad line at once
2495 {
2496 if((x0 & 1) != 0)
2497 {
2498 target[(x0 & ~1) * 2 + 1] = depth;
2499 target[(x0 & ~1) * 2 + 3] = depth;
2500 }
2501
2502 // for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4)
2503 // {
2504 // target[x2 + 0] = depth;
2505 // target[x2 + 1] = depth;
2506 // target[x2 + 2] = depth;
2507 // target[x2 + 3] = depth;
2508 // }
2509
2510 // __asm
2511 // {
2512 // movss xmm0, depth
2513 // shufps xmm0, xmm0, 0x00
2514 //
2515 // mov eax, x0
2516 // add eax, 1
2517 // and eax, 0xFFFFFFFE
2518 // cmp eax, x1
2519 // jge qEnd
2520 //
2521 // mov edi, target
2522 //
2523 // qLoop:
2524 // movntps [edi+8*eax], xmm0
2525 //
2526 // add eax, 2
2527 // cmp eax, x1
2528 // jl qLoop
2529 // qEnd:
2530 // }
2531
2532 memfill(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1)));
2533
2534 if((x1 & 1) != 0)
2535 {
2536 target[(x1 & ~1) * 2 + 0] = depth;
2537 target[(x1 & ~1) * 2 + 2] = depth;
2538 }
2539
2540 y++;
2541 }
2542 else
2543 {
2544 for(int x = x0; x < x1; x++)
2545 {
2546 target[(x & ~1) * 2 + (x & 1)] = depth;
2547 }
2548 }
2549 }
2550
2551 buffer += internal.sliceP;
2552 }
2553
2554 unlockInternal();
2555 }
2556 }
2557
2558 void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
2559 {
2560 // Not overlapping
2561 if(x0 > internal.width) return;
2562 if(y0 > internal.height) return;
2563 if(x0 + width < 0) return;
2564 if(y0 + height < 0) return;
2565
2566 // Clip against dimensions
2567 if(x0 < 0) {width += x0; x0 = 0;}
2568 if(x0 + width > internal.width) width = internal.width - x0;
2569 if(y0 < 0) {height += y0; y0 = 0;}
2570 if(y0 + height > internal.height) height = internal.height - y0;
2571
2572 int width2 = (internal.width + 1) & ~1;
2573
2574 int x1 = x0 + width;
2575 int y1 = y0 + height;
2576
2577 unsigned char maskedS = s & mask;
2578 unsigned char invMask = ~mask;
2579 unsigned int fill = maskedS;
2580 fill = fill | (fill << 8) | (fill << 16) + (fill << 24);
2581
2582 if(false)
2583 {
2584 char *target = (char*)lockStencil(0, PUBLIC) + x0 + width2 * y0;
2585
2586 for(int z = 0; z < stencil.depth; z++)
2587 {
2588 for(int y = y0; y < y0 + height; y++)
2589 {
2590 if(mask == 0xFF)
2591 {
2592 memfill(target, fill, width);
2593 }
2594 else
2595 {
2596 for(int x = 0; x < width; x++)
2597 {
2598 target[x] = maskedS | (target[x] & invMask);
2599 }
2600 }
2601
2602 target += width2;
2603 }
2604 }
2605
2606 unlockStencil();
2607 }
2608 else // Quad layout
2609 {
2610 char *buffer = (char*)lockStencil(0, PUBLIC);
2611
2612 if(mask == 0xFF)
2613 {
2614 for(int z = 0; z < stencil.depth; z++)
2615 {
2616 for(int y = y0; y < y1; y++)
2617 {
2618 char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
2619
2620 if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF) // Fill quad line at once
2621 {
2622 if((x0 & 1) != 0)
2623 {
2624 target[(x0 & ~1) * 2 + 1] = fill;
2625 target[(x0 & ~1) * 2 + 3] = fill;
2626 }
2627
2628 memfill(&target[((x0 + 1) & ~1) * 2], fill, ((x1 + 1) & ~1) * 2 - ((x0 + 1) & ~1) * 2);
2629
2630 if((x1 & 1) != 0)
2631 {
2632 target[(x1 & ~1) * 2 + 0] = fill;
2633 target[(x1 & ~1) * 2 + 2] = fill;
2634 }
2635
2636 y++;
2637 }
2638 else
2639 {
2640 for(int x = x0; x < x1; x++)
2641 {
2642 target[(x & ~1) * 2 + (x & 1)] = maskedS | (target[x] & invMask);
2643 }
2644 }
2645 }
2646
2647 buffer += stencil.sliceP;
2648 }
2649 }
2650
2651 unlockStencil();
2652 }
2653 }
2654
2655 void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
2656 {
2657 unsigned char *row;
2658 Buffer *buffer;
2659
2660 if(internal.dirty)
2661 {
2662 row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
2663 buffer = &internal;
2664 }
2665 else
2666 {
2667 row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
2668 buffer = &external;
2669 }
2670
2671 if(buffer->bytes <= 4)
2672 {
2673 int c;
2674 buffer->write(&c, color);
2675
2676 if(buffer->bytes <= 1) c = (c << 8) | c;
2677 if(buffer->bytes <= 2) c = (c << 16) | c;
2678
2679 for(int y = 0; y < height; y++)
2680 {
2681 memfill(row, c, width * buffer->bytes);
2682
2683 row += buffer->pitchB;
2684 }
2685 }
2686 else // Generic
2687 {
2688 for(int y = 0; y < height; y++)
2689 {
2690 unsigned char *element = row;
2691
2692 for(int x = 0; x < width; x++)
2693 {
2694 buffer->write(element, color);
2695
2696 element += buffer->bytes;
2697 }
2698
2699 row += buffer->pitchB;
2700 }
2701 }
2702
2703 if(buffer == &internal)
2704 {
2705 unlockInternal();
2706 }
2707 else
2708 {
2709 unlockExternal();
2710 }
2711 }
2712
2713 Color<float> Surface::readExternal(int x, int y, int z) const
2714 {
2715 ASSERT(external.lock != LOCK_UNLOCKED);
2716
2717 return external.read(x, y, z);
2718 }
2719
2720 Color<float> Surface::readExternal(int x, int y) const
2721 {
2722 ASSERT(external.lock != LOCK_UNLOCKED);
2723
2724 return external.read(x, y);
2725 }
2726
2727 Color<float> Surface::sampleExternal(float x, float y, float z) const
2728 {
2729 ASSERT(external.lock != LOCK_UNLOCKED);
2730
2731 return external.sample(x, y, z);
2732 }
2733
2734 Color<float> Surface::sampleExternal(float x, float y) const
2735 {
2736 ASSERT(external.lock != LOCK_UNLOCKED);
2737
2738 return external.sample(x, y);
2739 }
2740
2741 void Surface::writeExternal(int x, int y, int z, const Color<float> &color)
2742 {
2743 ASSERT(external.lock != LOCK_UNLOCKED);
2744
2745 external.write(x, y, z, color);
2746 }
2747
2748 void Surface::writeExternal(int x, int y, const Color<float> &color)
2749 {
2750 ASSERT(external.lock != LOCK_UNLOCKED);
2751
2752 external.write(x, y, color);
2753 }
2754
2755 Color<float> Surface::readInternal(int x, int y, int z) const
2756 {
2757 ASSERT(internal.lock != LOCK_UNLOCKED);
2758
2759 return internal.read(x, y, z);
2760 }
2761
2762 Color<float> Surface::readInternal(int x, int y) const
2763 {
2764 ASSERT(internal.lock != LOCK_UNLOCKED);
2765
2766 return internal.read(x, y);
2767 }
2768
2769 Color<float> Surface::sampleInternal(float x, float y, float z) const
2770 {
2771 ASSERT(internal.lock != LOCK_UNLOCKED);
2772
2773 return internal.sample(x, y, z);
2774 }
2775
2776 Color<float> Surface::sampleInternal(float x, float y) const
2777 {
2778 ASSERT(internal.lock != LOCK_UNLOCKED);
2779
2780 return internal.sample(x, y);
2781 }
2782
2783 void Surface::writeInternal(int x, int y, int z, const Color<float> &color)
2784 {
2785 ASSERT(internal.lock != LOCK_UNLOCKED);
2786
2787 internal.write(x, y, z, color);
2788 }
2789
2790 void Surface::writeInternal(int x, int y, const Color<float> &color)
2791 {
2792 ASSERT(internal.lock != LOCK_UNLOCKED);
2793
2794 internal.write(x, y, color);
2795 }
2796
2797 bool Surface::hasStencil() const
2798 {
2799 return isStencil(external.format);
2800 }
2801
2802 bool Surface::hasDepth() const
2803 {
2804 return isDepth(external.format);
2805 }
2806
2807 bool Surface::hasPalette() const
2808 {
2809 return isPalette(external.format);
2810 }
2811
2812 bool Surface::isRenderTarget() const
2813 {
2814 return renderTarget;
2815 }
2816
2817 bool Surface::hasDirtyMipmaps() const
2818 {
2819 return dirtyMipmaps;
2820 }
2821
2822 void Surface::cleanMipmaps()
2823 {
2824 dirtyMipmaps = false;
2825 }
2826
2827 Resource *Surface::getResource()
2828 {
2829 return resource;
2830 }
2831
2832 bool Surface::identicalFormats() const
2833 {
John Bauman66b8ab22014-05-06 15:57:45 -04002834 return external.format == internal.format &&
2835 external.width == internal.width &&
2836 external.height == internal.height &&
2837 external.depth == internal.depth &&
2838 external.pitchB == internal.pitchB &&
2839 external.sliceB == internal.sliceB;
John Bauman89401822014-05-06 15:04:28 -04002840 }
2841
2842 Format Surface::selectInternalFormat(Format format) const
2843 {
2844 switch(format)
2845 {
2846 case FORMAT_NULL:
2847 return FORMAT_NULL;
2848 case FORMAT_P8:
2849 case FORMAT_A8P8:
2850 case FORMAT_A4R4G4B4:
2851 case FORMAT_A1R5G5B5:
2852 case FORMAT_A8R3G3B2:
2853 return FORMAT_A8R8G8B8;
2854 case FORMAT_A8:
2855 return FORMAT_A8;
2856 case FORMAT_R8:
2857 return FORMAT_R8;
2858 case FORMAT_A2R10G10B10:
2859 case FORMAT_A2B10G10R10:
2860 case FORMAT_A16B16G16R16:
2861 return FORMAT_A16B16G16R16;
2862 case FORMAT_G8R8:
2863 return FORMAT_G8R8;
2864 case FORMAT_G16R16:
2865 return FORMAT_G16R16;
2866 case FORMAT_A8R8G8B8:
2867 case FORMAT_A8B8G8R8:
2868 if(lockable || !quadLayoutEnabled)
2869 {
2870 return FORMAT_A8R8G8B8;
2871 }
2872 else
2873 {
2874 return FORMAT_A8G8R8B8Q;
2875 }
2876 case FORMAT_R3G3B2:
2877 case FORMAT_R5G6B5:
2878 case FORMAT_R8G8B8:
2879 case FORMAT_X4R4G4B4:
2880 case FORMAT_X1R5G5B5:
2881 case FORMAT_X8R8G8B8:
2882 case FORMAT_X8B8G8R8:
2883 if(lockable || !quadLayoutEnabled)
2884 {
2885 return FORMAT_X8R8G8B8;
2886 }
2887 else
2888 {
2889 return FORMAT_X8G8R8B8Q;
2890 }
2891 // Compressed formats
2892 #if S3TC_SUPPORT
2893 case FORMAT_DXT1:
2894 case FORMAT_DXT3:
2895 case FORMAT_DXT5:
2896 return FORMAT_A8R8G8B8;
John Bauman66b8ab22014-05-06 15:57:45 -04002897 #endif
John Bauman89401822014-05-06 15:04:28 -04002898 case FORMAT_ATI1:
2899 return FORMAT_R8;
2900 case FORMAT_ATI2:
2901 return FORMAT_G8R8;
John Bauman89401822014-05-06 15:04:28 -04002902 // Bumpmap formats
2903 case FORMAT_V8U8: return FORMAT_V8U8;
2904 case FORMAT_L6V5U5: return FORMAT_X8L8V8U8;
2905 case FORMAT_Q8W8V8U8: return FORMAT_Q8W8V8U8;
2906 case FORMAT_X8L8V8U8: return FORMAT_X8L8V8U8;
2907 case FORMAT_V16U16: return FORMAT_V16U16;
2908 case FORMAT_A2W10V10U10: return FORMAT_A16W16V16U16;
2909 case FORMAT_Q16W16V16U16: return FORMAT_Q16W16V16U16;
2910 // Floating-point formats
2911 case FORMAT_R16F: return FORMAT_R32F;
2912 case FORMAT_G16R16F: return FORMAT_G32R32F;
2913 case FORMAT_A16B16G16R16F: return FORMAT_A32B32G32R32F;
2914 case FORMAT_R32F: return FORMAT_R32F;
2915 case FORMAT_G32R32F: return FORMAT_G32R32F;
2916 case FORMAT_A32B32G32R32F: return FORMAT_A32B32G32R32F;
2917 // Luminance formats
2918 case FORMAT_L8: return FORMAT_L8;
2919 case FORMAT_A4L4: return FORMAT_A8L8;
2920 case FORMAT_L16: return FORMAT_L16;
2921 case FORMAT_A8L8: return FORMAT_A8L8;
2922 // Depth/stencil formats
2923 case FORMAT_D16:
2924 case FORMAT_D32:
2925 case FORMAT_D24X8:
2926 case FORMAT_D24S8:
2927 case FORMAT_D24FS8:
2928 if(hasParent) // Texture
2929 {
John Bauman66b8ab22014-05-06 15:57:45 -04002930 return FORMAT_D32FS8_SHADOW;
John Bauman89401822014-05-06 15:04:28 -04002931 }
2932 else if(complementaryDepthBuffer)
2933 {
2934 return FORMAT_D32F_COMPLEMENTARY;
2935 }
2936 else
2937 {
2938 return FORMAT_D32F;
2939 }
John Bauman66b8ab22014-05-06 15:57:45 -04002940 case FORMAT_D32F_LOCKABLE: return FORMAT_D32F_LOCKABLE;
2941 case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
2942 case FORMAT_INTZ: return FORMAT_D32FS8_TEXTURE;
2943 case FORMAT_DF24S8: return FORMAT_D32FS8_SHADOW;
2944 case FORMAT_DF16S8: return FORMAT_D32FS8_SHADOW;
John Bauman89401822014-05-06 15:04:28 -04002945 default:
2946 ASSERT(false);
2947 }
2948
2949 return FORMAT_NULL;
2950 }
2951
2952 void Surface::setTexturePalette(unsigned int *palette)
2953 {
2954 Surface::palette = palette;
2955 Surface::paletteID++;
2956 }
2957
2958 void Surface::resolve()
2959 {
2960 if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
2961 {
2962 return;
2963 }
2964
2965 void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
2966
2967 int quality = internal.depth;
2968 int width = internal.width;
2969 int height = internal.height;
2970 int pitch = internal.pitchB;
2971 int slice = internal.sliceB;
2972
2973 unsigned char *source0 = (unsigned char*)source;
2974 unsigned char *source1 = source0 + slice;
2975 unsigned char *source2 = source1 + slice;
2976 unsigned char *source3 = source2 + slice;
2977 unsigned char *source4 = source3 + slice;
2978 unsigned char *source5 = source4 + slice;
2979 unsigned char *source6 = source5 + slice;
2980 unsigned char *source7 = source6 + slice;
2981 unsigned char *source8 = source7 + slice;
2982 unsigned char *source9 = source8 + slice;
2983 unsigned char *sourceA = source9 + slice;
2984 unsigned char *sourceB = sourceA + slice;
2985 unsigned char *sourceC = sourceB + slice;
2986 unsigned char *sourceD = sourceC + slice;
2987 unsigned char *sourceE = sourceD + slice;
2988 unsigned char *sourceF = sourceE + slice;
2989
2990 if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8)
2991 {
2992 if(CPUID::supportsSSE2() && (width % 4) == 0)
2993 {
2994 if(internal.depth == 2)
2995 {
2996 for(int y = 0; y < height; y++)
2997 {
2998 for(int x = 0; x < width; x += 4)
2999 {
3000 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3001 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3002
3003 c0 = _mm_avg_epu8(c0, c1);
3004
3005 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3006 }
3007
3008 source0 += pitch;
3009 source1 += pitch;
3010 }
3011 }
3012 else if(internal.depth == 4)
3013 {
3014 for(int y = 0; y < height; y++)
3015 {
3016 for(int x = 0; x < width; x += 4)
3017 {
3018 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3019 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3020 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3021 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3022
3023 c0 = _mm_avg_epu8(c0, c1);
3024 c2 = _mm_avg_epu8(c2, c3);
3025 c0 = _mm_avg_epu8(c0, c2);
3026
3027 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3028 }
3029
3030 source0 += pitch;
3031 source1 += pitch;
3032 source2 += pitch;
3033 source3 += pitch;
3034 }
3035 }
3036 else if(internal.depth == 8)
3037 {
3038 for(int y = 0; y < height; y++)
3039 {
3040 for(int x = 0; x < width; x += 4)
3041 {
3042 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3043 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3044 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3045 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3046 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3047 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3048 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3049 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3050
3051 c0 = _mm_avg_epu8(c0, c1);
3052 c2 = _mm_avg_epu8(c2, c3);
3053 c4 = _mm_avg_epu8(c4, c5);
3054 c6 = _mm_avg_epu8(c6, c7);
3055 c0 = _mm_avg_epu8(c0, c2);
3056 c4 = _mm_avg_epu8(c4, c6);
3057 c0 = _mm_avg_epu8(c0, c4);
3058
3059 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3060 }
3061
3062 source0 += pitch;
3063 source1 += pitch;
3064 source2 += pitch;
3065 source3 += pitch;
3066 source4 += pitch;
3067 source5 += pitch;
3068 source6 += pitch;
3069 source7 += pitch;
3070 }
3071 }
3072 else if(internal.depth == 16)
3073 {
3074 for(int y = 0; y < height; y++)
3075 {
3076 for(int x = 0; x < width; x += 4)
3077 {
3078 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3079 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3080 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3081 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3082 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3083 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3084 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3085 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3086 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3087 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3088 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3089 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3090 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3091 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3092 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3093 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3094
3095 c0 = _mm_avg_epu8(c0, c1);
3096 c2 = _mm_avg_epu8(c2, c3);
3097 c4 = _mm_avg_epu8(c4, c5);
3098 c6 = _mm_avg_epu8(c6, c7);
3099 c8 = _mm_avg_epu8(c8, c9);
3100 cA = _mm_avg_epu8(cA, cB);
3101 cC = _mm_avg_epu8(cC, cD);
3102 cE = _mm_avg_epu8(cE, cF);
3103 c0 = _mm_avg_epu8(c0, c2);
3104 c4 = _mm_avg_epu8(c4, c6);
3105 c8 = _mm_avg_epu8(c8, cA);
3106 cC = _mm_avg_epu8(cC, cE);
3107 c0 = _mm_avg_epu8(c0, c4);
3108 c8 = _mm_avg_epu8(c8, cC);
3109 c0 = _mm_avg_epu8(c0, c8);
3110
3111 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3112 }
3113
3114 source0 += pitch;
3115 source1 += pitch;
3116 source2 += pitch;
3117 source3 += pitch;
3118 source4 += pitch;
3119 source5 += pitch;
3120 source6 += pitch;
3121 source7 += pitch;
3122 source8 += pitch;
3123 source9 += pitch;
3124 sourceA += pitch;
3125 sourceB += pitch;
3126 sourceC += pitch;
3127 sourceD += pitch;
3128 sourceE += pitch;
3129 sourceF += pitch;
3130 }
3131 }
3132 else ASSERT(false);
3133 }
3134 else
3135 {
3136 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3137
3138 if(internal.depth == 2)
3139 {
3140 for(int y = 0; y < height; y++)
3141 {
3142 for(int x = 0; x < width; x++)
3143 {
3144 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3145 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3146
3147 c0 = AVERAGE(c0, c1);
3148
3149 *(unsigned int*)(source0 + 4 * x) = c0;
3150 }
3151
3152 source0 += pitch;
3153 source1 += pitch;
3154 }
3155 }
3156 else if(internal.depth == 4)
3157 {
3158 for(int y = 0; y < height; y++)
3159 {
3160 for(int x = 0; x < width; x++)
3161 {
3162 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3163 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3164 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3165 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3166
3167 c0 = AVERAGE(c0, c1);
3168 c2 = AVERAGE(c2, c3);
3169 c0 = AVERAGE(c0, c2);
3170
3171 *(unsigned int*)(source0 + 4 * x) = c0;
3172 }
3173
3174 source0 += pitch;
3175 source1 += pitch;
3176 source2 += pitch;
3177 source3 += pitch;
3178 }
3179 }
3180 else if(internal.depth == 8)
3181 {
3182 for(int y = 0; y < height; y++)
3183 {
3184 for(int x = 0; x < width; x++)
3185 {
3186 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3187 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3188 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3189 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3190 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3191 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3192 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3193 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3194
3195 c0 = AVERAGE(c0, c1);
3196 c2 = AVERAGE(c2, c3);
3197 c4 = AVERAGE(c4, c5);
3198 c6 = AVERAGE(c6, c7);
3199 c0 = AVERAGE(c0, c2);
3200 c4 = AVERAGE(c4, c6);
3201 c0 = AVERAGE(c0, c4);
3202
3203 *(unsigned int*)(source0 + 4 * x) = c0;
3204 }
3205
3206 source0 += pitch;
3207 source1 += pitch;
3208 source2 += pitch;
3209 source3 += pitch;
3210 source4 += pitch;
3211 source5 += pitch;
3212 source6 += pitch;
3213 source7 += pitch;
3214 }
3215 }
3216 else if(internal.depth == 16)
3217 {
3218 for(int y = 0; y < height; y++)
3219 {
3220 for(int x = 0; x < width; x++)
3221 {
3222 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3223 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3224 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3225 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3226 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3227 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3228 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3229 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3230 unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3231 unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3232 unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3233 unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3234 unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3235 unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3236 unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3237 unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3238
3239 c0 = AVERAGE(c0, c1);
3240 c2 = AVERAGE(c2, c3);
3241 c4 = AVERAGE(c4, c5);
3242 c6 = AVERAGE(c6, c7);
3243 c8 = AVERAGE(c8, c9);
3244 cA = AVERAGE(cA, cB);
3245 cC = AVERAGE(cC, cD);
3246 cE = AVERAGE(cE, cF);
3247 c0 = AVERAGE(c0, c2);
3248 c4 = AVERAGE(c4, c6);
3249 c8 = AVERAGE(c8, cA);
3250 cC = AVERAGE(cC, cE);
3251 c0 = AVERAGE(c0, c4);
3252 c8 = AVERAGE(c8, cC);
3253 c0 = AVERAGE(c0, c8);
3254
3255 *(unsigned int*)(source0 + 4 * x) = c0;
3256 }
3257
3258 source0 += pitch;
3259 source1 += pitch;
3260 source2 += pitch;
3261 source3 += pitch;
3262 source4 += pitch;
3263 source5 += pitch;
3264 source6 += pitch;
3265 source7 += pitch;
3266 source8 += pitch;
3267 source9 += pitch;
3268 sourceA += pitch;
3269 sourceB += pitch;
3270 sourceC += pitch;
3271 sourceD += pitch;
3272 sourceE += pitch;
3273 sourceF += pitch;
3274 }
3275 }
3276 else ASSERT(false);
3277
3278 #undef AVERAGE
3279 }
3280 }
3281 else if(internal.format == FORMAT_G16R16)
3282 {
3283 if(CPUID::supportsSSE2() && (width % 4) == 0)
3284 {
3285 if(internal.depth == 2)
3286 {
3287 for(int y = 0; y < height; y++)
3288 {
3289 for(int x = 0; x < width; x += 4)
3290 {
3291 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3292 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3293
3294 c0 = _mm_avg_epu16(c0, c1);
3295
3296 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3297 }
3298
3299 source0 += pitch;
3300 source1 += pitch;
3301 }
3302 }
3303 else if(internal.depth == 4)
3304 {
3305 for(int y = 0; y < height; y++)
3306 {
3307 for(int x = 0; x < width; x += 4)
3308 {
3309 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3310 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3311 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3312 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3313
3314 c0 = _mm_avg_epu16(c0, c1);
3315 c2 = _mm_avg_epu16(c2, c3);
3316 c0 = _mm_avg_epu16(c0, c2);
3317
3318 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3319 }
3320
3321 source0 += pitch;
3322 source1 += pitch;
3323 source2 += pitch;
3324 source3 += pitch;
3325 }
3326 }
3327 else if(internal.depth == 8)
3328 {
3329 for(int y = 0; y < height; y++)
3330 {
3331 for(int x = 0; x < width; x += 4)
3332 {
3333 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3334 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3335 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3336 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3337 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3338 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3339 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3340 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3341
3342 c0 = _mm_avg_epu16(c0, c1);
3343 c2 = _mm_avg_epu16(c2, c3);
3344 c4 = _mm_avg_epu16(c4, c5);
3345 c6 = _mm_avg_epu16(c6, c7);
3346 c0 = _mm_avg_epu16(c0, c2);
3347 c4 = _mm_avg_epu16(c4, c6);
3348 c0 = _mm_avg_epu16(c0, c4);
3349
3350 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3351 }
3352
3353 source0 += pitch;
3354 source1 += pitch;
3355 source2 += pitch;
3356 source3 += pitch;
3357 source4 += pitch;
3358 source5 += pitch;
3359 source6 += pitch;
3360 source7 += pitch;
3361 }
3362 }
3363 else if(internal.depth == 16)
3364 {
3365 for(int y = 0; y < height; y++)
3366 {
3367 for(int x = 0; x < width; x += 4)
3368 {
3369 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3370 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3371 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3372 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3373 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3374 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3375 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3376 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3377 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3378 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3379 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3380 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3381 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3382 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3383 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3384 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3385
3386 c0 = _mm_avg_epu16(c0, c1);
3387 c2 = _mm_avg_epu16(c2, c3);
3388 c4 = _mm_avg_epu16(c4, c5);
3389 c6 = _mm_avg_epu16(c6, c7);
3390 c8 = _mm_avg_epu16(c8, c9);
3391 cA = _mm_avg_epu16(cA, cB);
3392 cC = _mm_avg_epu16(cC, cD);
3393 cE = _mm_avg_epu16(cE, cF);
3394 c0 = _mm_avg_epu16(c0, c2);
3395 c4 = _mm_avg_epu16(c4, c6);
3396 c8 = _mm_avg_epu16(c8, cA);
3397 cC = _mm_avg_epu16(cC, cE);
3398 c0 = _mm_avg_epu16(c0, c4);
3399 c8 = _mm_avg_epu16(c8, cC);
3400 c0 = _mm_avg_epu16(c0, c8);
3401
3402 _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3403 }
3404
3405 source0 += pitch;
3406 source1 += pitch;
3407 source2 += pitch;
3408 source3 += pitch;
3409 source4 += pitch;
3410 source5 += pitch;
3411 source6 += pitch;
3412 source7 += pitch;
3413 source8 += pitch;
3414 source9 += pitch;
3415 sourceA += pitch;
3416 sourceB += pitch;
3417 sourceC += pitch;
3418 sourceD += pitch;
3419 sourceE += pitch;
3420 sourceF += pitch;
3421 }
3422 }
3423 else ASSERT(false);
3424 }
3425 else
3426 {
// AVERAGE(x, y): scalar fallback for the SSE2 branch of this format. Averages two
// unsigned 32-bit words as two independent 16-bit lanes, rounded up in each lane.
// Per lane a + b == 2 * (a & b) + (a ^ b), so the rounded-up mean is
// (a & b) + ((a ^ b) >> 1) + ((a ^ b) & 1).
// 0x7FFF7FFF clears the bit that the word-wide shift drags in from the upper
// lane; 0x00010001 picks out each lane's rounding bit.
// NOTE(review): the round-up is presumably meant to match _mm_avg_epu16 in the
// SSE2 branch - confirm against the intrinsic's documentation.
// Each argument is expanded three times: pass side-effect-free expressions only.
3427 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
3428
3429 if(internal.depth == 2)
3430 {
3431 for(int y = 0; y < height; y++)
3432 {
3433 for(int x = 0; x < width; x++)
3434 {
3435 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3436 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3437
3438 c0 = AVERAGE(c0, c1);
3439
3440 *(unsigned int*)(source0 + 4 * x) = c0;
3441 }
3442
3443 source0 += pitch;
3444 source1 += pitch;
3445 }
3446 }
3447 else if(internal.depth == 4)
3448 {
3449 for(int y = 0; y < height; y++)
3450 {
3451 for(int x = 0; x < width; x++)
3452 {
3453 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3454 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3455 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3456 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3457
3458 c0 = AVERAGE(c0, c1);
3459 c2 = AVERAGE(c2, c3);
3460 c0 = AVERAGE(c0, c2);
3461
3462 *(unsigned int*)(source0 + 4 * x) = c0;
3463 }
3464
3465 source0 += pitch;
3466 source1 += pitch;
3467 source2 += pitch;
3468 source3 += pitch;
3469 }
3470 }
3471 else if(internal.depth == 8)
3472 {
3473 for(int y = 0; y < height; y++)
3474 {
3475 for(int x = 0; x < width; x++)
3476 {
3477 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3478 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3479 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3480 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3481 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3482 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3483 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3484 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3485
3486 c0 = AVERAGE(c0, c1);
3487 c2 = AVERAGE(c2, c3);
3488 c4 = AVERAGE(c4, c5);
3489 c6 = AVERAGE(c6, c7);
3490 c0 = AVERAGE(c0, c2);
3491 c4 = AVERAGE(c4, c6);
3492 c0 = AVERAGE(c0, c4);
3493
3494 *(unsigned int*)(source0 + 4 * x) = c0;
3495 }
3496
3497 source0 += pitch;
3498 source1 += pitch;
3499 source2 += pitch;
3500 source3 += pitch;
3501 source4 += pitch;
3502 source5 += pitch;
3503 source6 += pitch;
3504 source7 += pitch;
3505 }
3506 }
3507 else if(internal.depth == 16)
3508 {
3509 for(int y = 0; y < height; y++)
3510 {
3511 for(int x = 0; x < width; x++)
3512 {
3513 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3514 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3515 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3516 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3517 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3518 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3519 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3520 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3521 unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3522 unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3523 unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3524 unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3525 unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3526 unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3527 unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3528 unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3529
3530 c0 = AVERAGE(c0, c1);
3531 c2 = AVERAGE(c2, c3);
3532 c4 = AVERAGE(c4, c5);
3533 c6 = AVERAGE(c6, c7);
3534 c8 = AVERAGE(c8, c9);
3535 cA = AVERAGE(cA, cB);
3536 cC = AVERAGE(cC, cD);
3537 cE = AVERAGE(cE, cF);
3538 c0 = AVERAGE(c0, c2);
3539 c4 = AVERAGE(c4, c6);
3540 c8 = AVERAGE(c8, cA);
3541 cC = AVERAGE(cC, cE);
3542 c0 = AVERAGE(c0, c4);
3543 c8 = AVERAGE(c8, cC);
3544 c0 = AVERAGE(c0, c8);
3545
3546 *(unsigned int*)(source0 + 4 * x) = c0;
3547 }
3548
3549 source0 += pitch;
3550 source1 += pitch;
3551 source2 += pitch;
3552 source3 += pitch;
3553 source4 += pitch;
3554 source5 += pitch;
3555 source6 += pitch;
3556 source7 += pitch;
3557 source8 += pitch;
3558 source9 += pitch;
3559 sourceA += pitch;
3560 sourceB += pitch;
3561 sourceC += pitch;
3562 sourceD += pitch;
3563 sourceE += pitch;
3564 sourceF += pitch;
3565 }
3566 }
3567 else ASSERT(false);
3568
3569 #undef AVERAGE
3570 }
3571 }
3572 else if(internal.format == FORMAT_A16B16G16R16)
3573 {
3574 if(CPUID::supportsSSE2() && (width % 2) == 0)
3575 {
3576 if(internal.depth == 2)
3577 {
3578 for(int y = 0; y < height; y++)
3579 {
3580 for(int x = 0; x < width; x += 2)
3581 {
3582 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3583 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3584
3585 c0 = _mm_avg_epu16(c0, c1);
3586
3587 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3588 }
3589
3590 source0 += pitch;
3591 source1 += pitch;
3592 }
3593 }
3594 else if(internal.depth == 4)
3595 {
3596 for(int y = 0; y < height; y++)
3597 {
3598 for(int x = 0; x < width; x += 2)
3599 {
3600 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3601 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3602 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3603 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3604
3605 c0 = _mm_avg_epu16(c0, c1);
3606 c2 = _mm_avg_epu16(c2, c3);
3607 c0 = _mm_avg_epu16(c0, c2);
3608
3609 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3610 }
3611
3612 source0 += pitch;
3613 source1 += pitch;
3614 source2 += pitch;
3615 source3 += pitch;
3616 }
3617 }
3618 else if(internal.depth == 8)
3619 {
3620 for(int y = 0; y < height; y++)
3621 {
3622 for(int x = 0; x < width; x += 2)
3623 {
3624 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3625 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3626 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3627 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3628 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
3629 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
3630 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
3631 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
3632
3633 c0 = _mm_avg_epu16(c0, c1);
3634 c2 = _mm_avg_epu16(c2, c3);
3635 c4 = _mm_avg_epu16(c4, c5);
3636 c6 = _mm_avg_epu16(c6, c7);
3637 c0 = _mm_avg_epu16(c0, c2);
3638 c4 = _mm_avg_epu16(c4, c6);
3639 c0 = _mm_avg_epu16(c0, c4);
3640
3641 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3642 }
3643
3644 source0 += pitch;
3645 source1 += pitch;
3646 source2 += pitch;
3647 source3 += pitch;
3648 source4 += pitch;
3649 source5 += pitch;
3650 source6 += pitch;
3651 source7 += pitch;
3652 }
3653 }
3654 else if(internal.depth == 16)
3655 {
3656 for(int y = 0; y < height; y++)
3657 {
3658 for(int x = 0; x < width; x += 2)
3659 {
3660 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3661 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3662 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3663 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3664 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
3665 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
3666 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
3667 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
3668 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
3669 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
3670 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
3671 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
3672 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
3673 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
3674 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
3675 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
3676
3677 c0 = _mm_avg_epu16(c0, c1);
3678 c2 = _mm_avg_epu16(c2, c3);
3679 c4 = _mm_avg_epu16(c4, c5);
3680 c6 = _mm_avg_epu16(c6, c7);
3681 c8 = _mm_avg_epu16(c8, c9);
3682 cA = _mm_avg_epu16(cA, cB);
3683 cC = _mm_avg_epu16(cC, cD);
3684 cE = _mm_avg_epu16(cE, cF);
3685 c0 = _mm_avg_epu16(c0, c2);
3686 c4 = _mm_avg_epu16(c4, c6);
3687 c8 = _mm_avg_epu16(c8, cA);
3688 cC = _mm_avg_epu16(cC, cE);
3689 c0 = _mm_avg_epu16(c0, c4);
3690 c8 = _mm_avg_epu16(c8, cC);
3691 c0 = _mm_avg_epu16(c0, c8);
3692
3693 _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3694 }
3695
3696 source0 += pitch;
3697 source1 += pitch;
3698 source2 += pitch;
3699 source3 += pitch;
3700 source4 += pitch;
3701 source5 += pitch;
3702 source6 += pitch;
3703 source7 += pitch;
3704 source8 += pitch;
3705 source9 += pitch;
3706 sourceA += pitch;
3707 sourceB += pitch;
3708 sourceC += pitch;
3709 sourceD += pitch;
3710 sourceE += pitch;
3711 sourceF += pitch;
3712 }
3713 }
3714 else ASSERT(false);
3715 }
3716 else
3717 {
// AVERAGE(x, y): scalar fallback for the SSE2 branch of this format. Averages two
// unsigned 32-bit words as two independent 16-bit lanes, rounded up in each lane.
// The loops below step 4 bytes at a time over 2 * width words per row, so each
// use handles two 16-bit values.
// Per lane a + b == 2 * (a & b) + (a ^ b), so the rounded-up mean is
// (a & b) + ((a ^ b) >> 1) + ((a ^ b) & 1).
// 0x7FFF7FFF clears the bit that the word-wide shift drags in from the upper
// lane; 0x00010001 picks out each lane's rounding bit.
// Each argument is expanded three times: pass side-effect-free expressions only.
3718 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
3719
3720 if(internal.depth == 2)
3721 {
3722 for(int y = 0; y < height; y++)
3723 {
3724 for(int x = 0; x < 2 * width; x++)
3725 {
3726 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3727 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3728
3729 c0 = AVERAGE(c0, c1);
3730
3731 *(unsigned int*)(source0 + 4 * x) = c0;
3732 }
3733
3734 source0 += pitch;
3735 source1 += pitch;
3736 }
3737 }
3738 else if(internal.depth == 4)
3739 {
3740 for(int y = 0; y < height; y++)
3741 {
3742 for(int x = 0; x < 2 * width; x++)
3743 {
3744 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3745 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3746 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3747 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3748
3749 c0 = AVERAGE(c0, c1);
3750 c2 = AVERAGE(c2, c3);
3751 c0 = AVERAGE(c0, c2);
3752
3753 *(unsigned int*)(source0 + 4 * x) = c0;
3754 }
3755
3756 source0 += pitch;
3757 source1 += pitch;
3758 source2 += pitch;
3759 source3 += pitch;
3760 }
3761 }
3762 else if(internal.depth == 8)
3763 {
3764 for(int y = 0; y < height; y++)
3765 {
3766 for(int x = 0; x < 2 * width; x++)
3767 {
3768 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3769 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3770 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3771 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3772 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3773 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3774 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3775 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3776
3777 c0 = AVERAGE(c0, c1);
3778 c2 = AVERAGE(c2, c3);
3779 c4 = AVERAGE(c4, c5);
3780 c6 = AVERAGE(c6, c7);
3781 c0 = AVERAGE(c0, c2);
3782 c4 = AVERAGE(c4, c6);
3783 c0 = AVERAGE(c0, c4);
3784
3785 *(unsigned int*)(source0 + 4 * x) = c0;
3786 }
3787
3788 source0 += pitch;
3789 source1 += pitch;
3790 source2 += pitch;
3791 source3 += pitch;
3792 source4 += pitch;
3793 source5 += pitch;
3794 source6 += pitch;
3795 source7 += pitch;
3796 }
3797 }
3798 else if(internal.depth == 16)
3799 {
3800 for(int y = 0; y < height; y++)
3801 {
3802 for(int x = 0; x < 2 * width; x++)
3803 {
3804 unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3805 unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3806 unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3807 unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3808 unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3809 unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3810 unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3811 unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3812 unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3813 unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3814 unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3815 unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3816 unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3817 unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3818 unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3819 unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3820
3821 c0 = AVERAGE(c0, c1);
3822 c2 = AVERAGE(c2, c3);
3823 c4 = AVERAGE(c4, c5);
3824 c6 = AVERAGE(c6, c7);
3825 c8 = AVERAGE(c8, c9);
3826 cA = AVERAGE(cA, cB);
3827 cC = AVERAGE(cC, cD);
3828 cE = AVERAGE(cE, cF);
3829 c0 = AVERAGE(c0, c2);
3830 c4 = AVERAGE(c4, c6);
3831 c8 = AVERAGE(c8, cA);
3832 cC = AVERAGE(cC, cE);
3833 c0 = AVERAGE(c0, c4);
3834 c8 = AVERAGE(c8, cC);
3835 c0 = AVERAGE(c0, c8);
3836
3837 *(unsigned int*)(source0 + 4 * x) = c0;
3838 }
3839
3840 source0 += pitch;
3841 source1 += pitch;
3842 source2 += pitch;
3843 source3 += pitch;
3844 source4 += pitch;
3845 source5 += pitch;
3846 source6 += pitch;
3847 source7 += pitch;
3848 source8 += pitch;
3849 source9 += pitch;
3850 sourceA += pitch;
3851 sourceB += pitch;
3852 sourceC += pitch;
3853 sourceD += pitch;
3854 sourceE += pitch;
3855 sourceF += pitch;
3856 }
3857 }
3858 else ASSERT(false);
3859
3860 #undef AVERAGE
3861 }
3862 }
3863 else if(internal.format == FORMAT_R32F)
3864 {
3865 if(CPUID::supportsSSE() && (width % 4) == 0)
3866 {
3867 if(internal.depth == 2)
3868 {
3869 for(int y = 0; y < height; y++)
3870 {
3871 for(int x = 0; x < width; x += 4)
3872 {
3873 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3874 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3875
3876 c0 = _mm_add_ps(c0, c1);
3877 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
3878
3879 _mm_store_ps((float*)(source0 + 4 * x), c0);
3880 }
3881
3882 source0 += pitch;
3883 source1 += pitch;
3884 }
3885 }
3886 else if(internal.depth == 4)
3887 {
3888 for(int y = 0; y < height; y++)
3889 {
3890 for(int x = 0; x < width; x += 4)
3891 {
3892 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3893 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3894 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
3895 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
3896
3897 c0 = _mm_add_ps(c0, c1);
3898 c2 = _mm_add_ps(c2, c3);
3899 c0 = _mm_add_ps(c0, c2);
3900 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
3901
3902 _mm_store_ps((float*)(source0 + 4 * x), c0);
3903 }
3904
3905 source0 += pitch;
3906 source1 += pitch;
3907 source2 += pitch;
3908 source3 += pitch;
3909 }
3910 }
3911 else if(internal.depth == 8)
3912 {
3913 for(int y = 0; y < height; y++)
3914 {
3915 for(int x = 0; x < width; x += 4)
3916 {
3917 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3918 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3919 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
3920 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
3921 __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
3922 __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
3923 __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
3924 __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
3925
3926 c0 = _mm_add_ps(c0, c1);
3927 c2 = _mm_add_ps(c2, c3);
3928 c4 = _mm_add_ps(c4, c5);
3929 c6 = _mm_add_ps(c6, c7);
3930 c0 = _mm_add_ps(c0, c2);
3931 c4 = _mm_add_ps(c4, c6);
3932 c0 = _mm_add_ps(c0, c4);
3933 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
3934
3935 _mm_store_ps((float*)(source0 + 4 * x), c0);
3936 }
3937
3938 source0 += pitch;
3939 source1 += pitch;
3940 source2 += pitch;
3941 source3 += pitch;
3942 source4 += pitch;
3943 source5 += pitch;
3944 source6 += pitch;
3945 source7 += pitch;
3946 }
3947 }
3948 else if(internal.depth == 16)
3949 {
3950 for(int y = 0; y < height; y++)
3951 {
3952 for(int x = 0; x < width; x += 4)
3953 {
3954 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3955 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3956 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
3957 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
3958 __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
3959 __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
3960 __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
3961 __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
3962 __m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
3963 __m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
3964 __m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
3965 __m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
3966 __m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
3967 __m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
3968 __m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
3969 __m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
3970
3971 c0 = _mm_add_ps(c0, c1);
3972 c2 = _mm_add_ps(c2, c3);
3973 c4 = _mm_add_ps(c4, c5);
3974 c6 = _mm_add_ps(c6, c7);
3975 c8 = _mm_add_ps(c8, c9);
3976 cA = _mm_add_ps(cA, cB);
3977 cC = _mm_add_ps(cC, cD);
3978 cE = _mm_add_ps(cE, cF);
3979 c0 = _mm_add_ps(c0, c2);
3980 c4 = _mm_add_ps(c4, c6);
3981 c8 = _mm_add_ps(c8, cA);
3982 cC = _mm_add_ps(cC, cE);
3983 c0 = _mm_add_ps(c0, c4);
3984 c8 = _mm_add_ps(c8, cC);
3985 c0 = _mm_add_ps(c0, c8);
3986 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
3987
3988 _mm_store_ps((float*)(source0 + 4 * x), c0);
3989 }
3990
3991 source0 += pitch;
3992 source1 += pitch;
3993 source2 += pitch;
3994 source3 += pitch;
3995 source4 += pitch;
3996 source5 += pitch;
3997 source6 += pitch;
3998 source7 += pitch;
3999 source8 += pitch;
4000 source9 += pitch;
4001 sourceA += pitch;
4002 sourceB += pitch;
4003 sourceC += pitch;
4004 sourceD += pitch;
4005 sourceE += pitch;
4006 sourceF += pitch;
4007 }
4008 }
4009 else ASSERT(false);
4010 }
4011 else
4012 {
4013 if(internal.depth == 2)
4014 {
4015 for(int y = 0; y < height; y++)
4016 {
4017 for(int x = 0; x < width; x++)
4018 {
4019 float c0 = *(float*)(source0 + 4 * x);
4020 float c1 = *(float*)(source1 + 4 * x);
4021
4022 c0 = c0 + c1;
4023 c0 *= 1.0f / 2.0f;
4024
4025 *(float*)(source0 + 4 * x) = c0;
4026 }
4027
4028 source0 += pitch;
4029 source1 += pitch;
4030 }
4031 }
4032 else if(internal.depth == 4)
4033 {
4034 for(int y = 0; y < height; y++)
4035 {
4036 for(int x = 0; x < width; x++)
4037 {
4038 float c0 = *(float*)(source0 + 4 * x);
4039 float c1 = *(float*)(source1 + 4 * x);
4040 float c2 = *(float*)(source2 + 4 * x);
4041 float c3 = *(float*)(source3 + 4 * x);
4042
4043 c0 = c0 + c1;
4044 c2 = c2 + c3;
4045 c0 = c0 + c2;
4046 c0 *= 1.0f / 4.0f;
4047
4048 *(float*)(source0 + 4 * x) = c0;
4049 }
4050
4051 source0 += pitch;
4052 source1 += pitch;
4053 source2 += pitch;
4054 source3 += pitch;
4055 }
4056 }
4057 else if(internal.depth == 8)
4058 {
4059 for(int y = 0; y < height; y++)
4060 {
4061 for(int x = 0; x < width; x++)
4062 {
4063 float c0 = *(float*)(source0 + 4 * x);
4064 float c1 = *(float*)(source1 + 4 * x);
4065 float c2 = *(float*)(source2 + 4 * x);
4066 float c3 = *(float*)(source3 + 4 * x);
4067 float c4 = *(float*)(source4 + 4 * x);
4068 float c5 = *(float*)(source5 + 4 * x);
4069 float c6 = *(float*)(source6 + 4 * x);
4070 float c7 = *(float*)(source7 + 4 * x);
4071
4072 c0 = c0 + c1;
4073 c2 = c2 + c3;
4074 c4 = c4 + c5;
4075 c6 = c6 + c7;
4076 c0 = c0 + c2;
4077 c4 = c4 + c6;
4078 c0 = c0 + c4;
4079 c0 *= 1.0f / 8.0f;
4080
4081 *(float*)(source0 + 4 * x) = c0;
4082 }
4083
4084 source0 += pitch;
4085 source1 += pitch;
4086 source2 += pitch;
4087 source3 += pitch;
4088 source4 += pitch;
4089 source5 += pitch;
4090 source6 += pitch;
4091 source7 += pitch;
4092 }
4093 }
4094 else if(internal.depth == 16)
4095 {
4096 for(int y = 0; y < height; y++)
4097 {
4098 for(int x = 0; x < width; x++)
4099 {
4100 float c0 = *(float*)(source0 + 4 * x);
4101 float c1 = *(float*)(source1 + 4 * x);
4102 float c2 = *(float*)(source2 + 4 * x);
4103 float c3 = *(float*)(source3 + 4 * x);
4104 float c4 = *(float*)(source4 + 4 * x);
4105 float c5 = *(float*)(source5 + 4 * x);
4106 float c6 = *(float*)(source6 + 4 * x);
4107 float c7 = *(float*)(source7 + 4 * x);
4108 float c8 = *(float*)(source8 + 4 * x);
4109 float c9 = *(float*)(source9 + 4 * x);
4110 float cA = *(float*)(sourceA + 4 * x);
4111 float cB = *(float*)(sourceB + 4 * x);
4112 float cC = *(float*)(sourceC + 4 * x);
4113 float cD = *(float*)(sourceD + 4 * x);
4114 float cE = *(float*)(sourceE + 4 * x);
4115 float cF = *(float*)(sourceF + 4 * x);
4116
4117 c0 = c0 + c1;
4118 c2 = c2 + c3;
4119 c4 = c4 + c5;
4120 c6 = c6 + c7;
4121 c8 = c8 + c9;
4122 cA = cA + cB;
4123 cC = cC + cD;
4124 cE = cE + cF;
4125 c0 = c0 + c2;
4126 c4 = c4 + c6;
4127 c8 = c8 + cA;
4128 cC = cC + cE;
4129 c0 = c0 + c4;
4130 c8 = c8 + cC;
4131 c0 = c0 + c8;
4132 c0 *= 1.0f / 16.0f;
4133
4134 *(float*)(source0 + 4 * x) = c0;
4135 }
4136
4137 source0 += pitch;
4138 source1 += pitch;
4139 source2 += pitch;
4140 source3 += pitch;
4141 source4 += pitch;
4142 source5 += pitch;
4143 source6 += pitch;
4144 source7 += pitch;
4145 source8 += pitch;
4146 source9 += pitch;
4147 sourceA += pitch;
4148 sourceB += pitch;
4149 sourceC += pitch;
4150 sourceD += pitch;
4151 sourceE += pitch;
4152 sourceF += pitch;
4153 }
4154 }
4155 else ASSERT(false);
4156 }
4157 }
4158 else if(internal.format == FORMAT_G32R32F)
4159 {
4160 if(CPUID::supportsSSE() && (width % 2) == 0)
4161 {
4162 if(internal.depth == 2)
4163 {
4164 for(int y = 0; y < height; y++)
4165 {
4166 for(int x = 0; x < width; x += 2)
4167 {
4168 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4169 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4170
4171 c0 = _mm_add_ps(c0, c1);
4172 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4173
4174 _mm_store_ps((float*)(source0 + 8 * x), c0);
4175 }
4176
4177 source0 += pitch;
4178 source1 += pitch;
4179 }
4180 }
4181 else if(internal.depth == 4)
4182 {
4183 for(int y = 0; y < height; y++)
4184 {
4185 for(int x = 0; x < width; x += 2)
4186 {
4187 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4188 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4189 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4190 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4191
4192 c0 = _mm_add_ps(c0, c1);
4193 c2 = _mm_add_ps(c2, c3);
4194 c0 = _mm_add_ps(c0, c2);
4195 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4196
4197 _mm_store_ps((float*)(source0 + 8 * x), c0);
4198 }
4199
4200 source0 += pitch;
4201 source1 += pitch;
4202 source2 += pitch;
4203 source3 += pitch;
4204 }
4205 }
4206 else if(internal.depth == 8)
4207 {
4208 for(int y = 0; y < height; y++)
4209 {
4210 for(int x = 0; x < width; x += 2)
4211 {
4212 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4213 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4214 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4215 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4216 __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4217 __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4218 __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4219 __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4220
4221 c0 = _mm_add_ps(c0, c1);
4222 c2 = _mm_add_ps(c2, c3);
4223 c4 = _mm_add_ps(c4, c5);
4224 c6 = _mm_add_ps(c6, c7);
4225 c0 = _mm_add_ps(c0, c2);
4226 c4 = _mm_add_ps(c4, c6);
4227 c0 = _mm_add_ps(c0, c4);
4228 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4229
4230 _mm_store_ps((float*)(source0 + 8 * x), c0);
4231 }
4232
4233 source0 += pitch;
4234 source1 += pitch;
4235 source2 += pitch;
4236 source3 += pitch;
4237 source4 += pitch;
4238 source5 += pitch;
4239 source6 += pitch;
4240 source7 += pitch;
4241 }
4242 }
4243 else if(internal.depth == 16)
4244 {
4245 for(int y = 0; y < height; y++)
4246 {
4247 for(int x = 0; x < width; x += 2)
4248 {
4249 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4250 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4251 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4252 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4253 __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4254 __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4255 __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4256 __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4257 __m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4258 __m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4259 __m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4260 __m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4261 __m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4262 __m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4263 __m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4264 __m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4265
4266 c0 = _mm_add_ps(c0, c1);
4267 c2 = _mm_add_ps(c2, c3);
4268 c4 = _mm_add_ps(c4, c5);
4269 c6 = _mm_add_ps(c6, c7);
4270 c8 = _mm_add_ps(c8, c9);
4271 cA = _mm_add_ps(cA, cB);
4272 cC = _mm_add_ps(cC, cD);
4273 cE = _mm_add_ps(cE, cF);
4274 c0 = _mm_add_ps(c0, c2);
4275 c4 = _mm_add_ps(c4, c6);
4276 c8 = _mm_add_ps(c8, cA);
4277 cC = _mm_add_ps(cC, cE);
4278 c0 = _mm_add_ps(c0, c4);
4279 c8 = _mm_add_ps(c8, cC);
4280 c0 = _mm_add_ps(c0, c8);
4281 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4282
4283 _mm_store_ps((float*)(source0 + 8 * x), c0);
4284 }
4285
4286 source0 += pitch;
4287 source1 += pitch;
4288 source2 += pitch;
4289 source3 += pitch;
4290 source4 += pitch;
4291 source5 += pitch;
4292 source6 += pitch;
4293 source7 += pitch;
4294 source8 += pitch;
4295 source9 += pitch;
4296 sourceA += pitch;
4297 sourceB += pitch;
4298 sourceC += pitch;
4299 sourceD += pitch;
4300 sourceE += pitch;
4301 sourceF += pitch;
4302 }
4303 }
4304 else ASSERT(false);
4305 }
4306 else
4307 {
4308 if(internal.depth == 2)
4309 {
4310 for(int y = 0; y < height; y++)
4311 {
4312 for(int x = 0; x < 2 * width; x++)
4313 {
4314 float c0 = *(float*)(source0 + 4 * x);
4315 float c1 = *(float*)(source1 + 4 * x);
4316
4317 c0 = c0 + c1;
4318 c0 *= 1.0f / 2.0f;
4319
4320 *(float*)(source0 + 4 * x) = c0;
4321 }
4322
4323 source0 += pitch;
4324 source1 += pitch;
4325 }
4326 }
4327 else if(internal.depth == 4)
4328 {
4329 for(int y = 0; y < height; y++)
4330 {
4331 for(int x = 0; x < 2 * width; x++)
4332 {
4333 float c0 = *(float*)(source0 + 4 * x);
4334 float c1 = *(float*)(source1 + 4 * x);
4335 float c2 = *(float*)(source2 + 4 * x);
4336 float c3 = *(float*)(source3 + 4 * x);
4337
4338 c0 = c0 + c1;
4339 c2 = c2 + c3;
4340 c0 = c0 + c2;
4341 c0 *= 1.0f / 4.0f;
4342
4343 *(float*)(source0 + 4 * x) = c0;
4344 }
4345
4346 source0 += pitch;
4347 source1 += pitch;
4348 source2 += pitch;
4349 source3 += pitch;
4350 }
4351 }
4352 else if(internal.depth == 8)
4353 {
4354 for(int y = 0; y < height; y++)
4355 {
4356 for(int x = 0; x < 2 * width; x++)
4357 {
4358 float c0 = *(float*)(source0 + 4 * x);
4359 float c1 = *(float*)(source1 + 4 * x);
4360 float c2 = *(float*)(source2 + 4 * x);
4361 float c3 = *(float*)(source3 + 4 * x);
4362 float c4 = *(float*)(source4 + 4 * x);
4363 float c5 = *(float*)(source5 + 4 * x);
4364 float c6 = *(float*)(source6 + 4 * x);
4365 float c7 = *(float*)(source7 + 4 * x);
4366
4367 c0 = c0 + c1;
4368 c2 = c2 + c3;
4369 c4 = c4 + c5;
4370 c6 = c6 + c7;
4371 c0 = c0 + c2;
4372 c4 = c4 + c6;
4373 c0 = c0 + c4;
4374 c0 *= 1.0f / 8.0f;
4375
4376 *(float*)(source0 + 4 * x) = c0;
4377 }
4378
4379 source0 += pitch;
4380 source1 += pitch;
4381 source2 += pitch;
4382 source3 += pitch;
4383 source4 += pitch;
4384 source5 += pitch;
4385 source6 += pitch;
4386 source7 += pitch;
4387 }
4388 }
4389 else if(internal.depth == 16)
4390 {
4391 for(int y = 0; y < height; y++)
4392 {
4393 for(int x = 0; x < 2 * width; x++)
4394 {
4395 float c0 = *(float*)(source0 + 4 * x);
4396 float c1 = *(float*)(source1 + 4 * x);
4397 float c2 = *(float*)(source2 + 4 * x);
4398 float c3 = *(float*)(source3 + 4 * x);
4399 float c4 = *(float*)(source4 + 4 * x);
4400 float c5 = *(float*)(source5 + 4 * x);
4401 float c6 = *(float*)(source6 + 4 * x);
4402 float c7 = *(float*)(source7 + 4 * x);
4403 float c8 = *(float*)(source8 + 4 * x);
4404 float c9 = *(float*)(source9 + 4 * x);
4405 float cA = *(float*)(sourceA + 4 * x);
4406 float cB = *(float*)(sourceB + 4 * x);
4407 float cC = *(float*)(sourceC + 4 * x);
4408 float cD = *(float*)(sourceD + 4 * x);
4409 float cE = *(float*)(sourceE + 4 * x);
4410 float cF = *(float*)(sourceF + 4 * x);
4411
4412 c0 = c0 + c1;
4413 c2 = c2 + c3;
4414 c4 = c4 + c5;
4415 c6 = c6 + c7;
4416 c8 = c8 + c9;
4417 cA = cA + cB;
4418 cC = cC + cD;
4419 cE = cE + cF;
4420 c0 = c0 + c2;
4421 c4 = c4 + c6;
4422 c8 = c8 + cA;
4423 cC = cC + cE;
4424 c0 = c0 + c4;
4425 c8 = c8 + cC;
4426 c0 = c0 + c8;
4427 c0 *= 1.0f / 16.0f;
4428
4429 *(float*)(source0 + 4 * x) = c0;
4430 }
4431
4432 source0 += pitch;
4433 source1 += pitch;
4434 source2 += pitch;
4435 source3 += pitch;
4436 source4 += pitch;
4437 source5 += pitch;
4438 source6 += pitch;
4439 source7 += pitch;
4440 source8 += pitch;
4441 source9 += pitch;
4442 sourceA += pitch;
4443 sourceB += pitch;
4444 sourceC += pitch;
4445 sourceD += pitch;
4446 sourceE += pitch;
4447 sourceF += pitch;
4448 }
4449 }
4450 else ASSERT(false);
4451 }
4452 }
4453 else if(internal.format == FORMAT_A32B32G32R32F)
4454 {
4455 if(CPUID::supportsSSE())
4456 {
4457 if(internal.depth == 2)
4458 {
4459 for(int y = 0; y < height; y++)
4460 {
4461 for(int x = 0; x < width; x++)
4462 {
4463 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4464 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4465
4466 c0 = _mm_add_ps(c0, c1);
4467 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4468
4469 _mm_store_ps((float*)(source0 + 16 * x), c0);
4470 }
4471
4472 source0 += pitch;
4473 source1 += pitch;
4474 }
4475 }
4476 else if(internal.depth == 4)
4477 {
4478 for(int y = 0; y < height; y++)
4479 {
4480 for(int x = 0; x < width; x++)
4481 {
4482 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4483 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4484 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4485 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4486
4487 c0 = _mm_add_ps(c0, c1);
4488 c2 = _mm_add_ps(c2, c3);
4489 c0 = _mm_add_ps(c0, c2);
4490 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4491
4492 _mm_store_ps((float*)(source0 + 16 * x), c0);
4493 }
4494
4495 source0 += pitch;
4496 source1 += pitch;
4497 source2 += pitch;
4498 source3 += pitch;
4499 }
4500 }
4501 else if(internal.depth == 8)
4502 {
4503 for(int y = 0; y < height; y++)
4504 {
4505 for(int x = 0; x < width; x++)
4506 {
4507 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4508 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4509 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4510 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4511 __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
4512 __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
4513 __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
4514 __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
4515
4516 c0 = _mm_add_ps(c0, c1);
4517 c2 = _mm_add_ps(c2, c3);
4518 c4 = _mm_add_ps(c4, c5);
4519 c6 = _mm_add_ps(c6, c7);
4520 c0 = _mm_add_ps(c0, c2);
4521 c4 = _mm_add_ps(c4, c6);
4522 c0 = _mm_add_ps(c0, c4);
4523 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4524
4525 _mm_store_ps((float*)(source0 + 16 * x), c0);
4526 }
4527
4528 source0 += pitch;
4529 source1 += pitch;
4530 source2 += pitch;
4531 source3 += pitch;
4532 source4 += pitch;
4533 source5 += pitch;
4534 source6 += pitch;
4535 source7 += pitch;
4536 }
4537 }
4538 else if(internal.depth == 16)
4539 {
4540 for(int y = 0; y < height; y++)
4541 {
4542 for(int x = 0; x < width; x++)
4543 {
4544 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4545 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4546 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4547 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4548 __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
4549 __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
4550 __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
4551 __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
4552 __m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
4553 __m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
4554 __m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
4555 __m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
4556 __m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
4557 __m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
4558 __m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
4559 __m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
4560
4561 c0 = _mm_add_ps(c0, c1);
4562 c2 = _mm_add_ps(c2, c3);
4563 c4 = _mm_add_ps(c4, c5);
4564 c6 = _mm_add_ps(c6, c7);
4565 c8 = _mm_add_ps(c8, c9);
4566 cA = _mm_add_ps(cA, cB);
4567 cC = _mm_add_ps(cC, cD);
4568 cE = _mm_add_ps(cE, cF);
4569 c0 = _mm_add_ps(c0, c2);
4570 c4 = _mm_add_ps(c4, c6);
4571 c8 = _mm_add_ps(c8, cA);
4572 cC = _mm_add_ps(cC, cE);
4573 c0 = _mm_add_ps(c0, c4);
4574 c8 = _mm_add_ps(c8, cC);
4575 c0 = _mm_add_ps(c0, c8);
4576 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4577
4578 _mm_store_ps((float*)(source0 + 16 * x), c0);
4579 }
4580
4581 source0 += pitch;
4582 source1 += pitch;
4583 source2 += pitch;
4584 source3 += pitch;
4585 source4 += pitch;
4586 source5 += pitch;
4587 source6 += pitch;
4588 source7 += pitch;
4589 source8 += pitch;
4590 source9 += pitch;
4591 sourceA += pitch;
4592 sourceB += pitch;
4593 sourceC += pitch;
4594 sourceD += pitch;
4595 sourceE += pitch;
4596 sourceF += pitch;
4597 }
4598 }
4599 else ASSERT(false);
4600 }
4601 else
4602 {
4603 if(internal.depth == 2)
4604 {
4605 for(int y = 0; y < height; y++)
4606 {
4607 for(int x = 0; x < 4 * width; x++)
4608 {
4609 float c0 = *(float*)(source0 + 4 * x);
4610 float c1 = *(float*)(source1 + 4 * x);
4611
4612 c0 = c0 + c1;
4613 c0 *= 1.0f / 2.0f;
4614
4615 *(float*)(source0 + 4 * x) = c0;
4616 }
4617
4618 source0 += pitch;
4619 source1 += pitch;
4620 }
4621 }
4622 else if(internal.depth == 4)
4623 {
4624 for(int y = 0; y < height; y++)
4625 {
4626 for(int x = 0; x < 4 * width; x++)
4627 {
4628 float c0 = *(float*)(source0 + 4 * x);
4629 float c1 = *(float*)(source1 + 4 * x);
4630 float c2 = *(float*)(source2 + 4 * x);
4631 float c3 = *(float*)(source3 + 4 * x);
4632
4633 c0 = c0 + c1;
4634 c2 = c2 + c3;
4635 c0 = c0 + c2;
4636 c0 *= 1.0f / 4.0f;
4637
4638 *(float*)(source0 + 4 * x) = c0;
4639 }
4640
4641 source0 += pitch;
4642 source1 += pitch;
4643 source2 += pitch;
4644 source3 += pitch;
4645 }
4646 }
4647 else if(internal.depth == 8)
4648 {
4649 for(int y = 0; y < height; y++)
4650 {
4651 for(int x = 0; x < 4 * width; x++)
4652 {
4653 float c0 = *(float*)(source0 + 4 * x);
4654 float c1 = *(float*)(source1 + 4 * x);
4655 float c2 = *(float*)(source2 + 4 * x);
4656 float c3 = *(float*)(source3 + 4 * x);
4657 float c4 = *(float*)(source4 + 4 * x);
4658 float c5 = *(float*)(source5 + 4 * x);
4659 float c6 = *(float*)(source6 + 4 * x);
4660 float c7 = *(float*)(source7 + 4 * x);
4661
4662 c0 = c0 + c1;
4663 c2 = c2 + c3;
4664 c4 = c4 + c5;
4665 c6 = c6 + c7;
4666 c0 = c0 + c2;
4667 c4 = c4 + c6;
4668 c0 = c0 + c4;
4669 c0 *= 1.0f / 8.0f;
4670
4671 *(float*)(source0 + 4 * x) = c0;
4672 }
4673
4674 source0 += pitch;
4675 source1 += pitch;
4676 source2 += pitch;
4677 source3 += pitch;
4678 source4 += pitch;
4679 source5 += pitch;
4680 source6 += pitch;
4681 source7 += pitch;
4682 }
4683 }
4684 else if(internal.depth == 16)
4685 {
4686 for(int y = 0; y < height; y++)
4687 {
4688 for(int x = 0; x < 4 * width; x++)
4689 {
4690 float c0 = *(float*)(source0 + 4 * x);
4691 float c1 = *(float*)(source1 + 4 * x);
4692 float c2 = *(float*)(source2 + 4 * x);
4693 float c3 = *(float*)(source3 + 4 * x);
4694 float c4 = *(float*)(source4 + 4 * x);
4695 float c5 = *(float*)(source5 + 4 * x);
4696 float c6 = *(float*)(source6 + 4 * x);
4697 float c7 = *(float*)(source7 + 4 * x);
4698 float c8 = *(float*)(source8 + 4 * x);
4699 float c9 = *(float*)(source9 + 4 * x);
4700 float cA = *(float*)(sourceA + 4 * x);
4701 float cB = *(float*)(sourceB + 4 * x);
4702 float cC = *(float*)(sourceC + 4 * x);
4703 float cD = *(float*)(sourceD + 4 * x);
4704 float cE = *(float*)(sourceE + 4 * x);
4705 float cF = *(float*)(sourceF + 4 * x);
4706
4707 c0 = c0 + c1;
4708 c2 = c2 + c3;
4709 c4 = c4 + c5;
4710 c6 = c6 + c7;
4711 c8 = c8 + c9;
4712 cA = cA + cB;
4713 cC = cC + cD;
4714 cE = cE + cF;
4715 c0 = c0 + c2;
4716 c4 = c4 + c6;
4717 c8 = c8 + cA;
4718 cC = cC + cE;
4719 c0 = c0 + c4;
4720 c8 = c8 + cC;
4721 c0 = c0 + c8;
4722 c0 *= 1.0f / 16.0f;
4723
4724 *(float*)(source0 + 4 * x) = c0;
4725 }
4726
4727 source0 += pitch;
4728 source1 += pitch;
4729 source2 += pitch;
4730 source3 += pitch;
4731 source4 += pitch;
4732 source5 += pitch;
4733 source6 += pitch;
4734 source7 += pitch;
4735 source8 += pitch;
4736 source9 += pitch;
4737 sourceA += pitch;
4738 sourceB += pitch;
4739 sourceC += pitch;
4740 sourceD += pitch;
4741 sourceE += pitch;
4742 sourceF += pitch;
4743 }
4744 }
4745 else ASSERT(false);
4746 }
4747 }
4748 else
4749 {
4750 // UNIMPLEMENTED();
4751 }
4752 }
4753}