ES31: Support atomic functions on D3D11 - Part I

This patch is the first one of the implementation of atomic
functions in D3D11.

There are mainly two differences in the usage of GLSL and HLSL
atomic functions:
1. All GLSL atomic functions have return values, which all
   represent the original value of the shared or ssbo variable;
   while all HLSL atomic functions don't, and the original value
   can be stored in the last parameter of the function call.
2. For HLSL atomic functions, the last parameter that stores the
   original value is optional except for InterlockedExchange and
   InterlockedCompareExchange. Missing original_value in the call
   of InterlockedExchange and InterlockedCompareExchange results
   in a compile error from HLSL compiler.

To handle these differences, we plan to implement the translation
in two steps:
1. Support direct translations from GLSL atomic functions to HLSL
   ones.
Direct translation can only handle the following two situations:
(1) The statement is a GLSL atomic function call whose return value
    is unused, and it is not atomicExchange or atomicCompSwap:
    e.g.
    GLSL: atomicAdd(mem, value);
 -> HLSL: InterlockedAdd(mem, value);
(2) The statement is a simple assignment expression: its right-hand
    side is a GLSL atomic function call and its left-hand side is a
    previously declared variable.
    e.g.
    GLSL: oldValue = atomicAdd(mem, value);
 -> HLSL: InterlockedAdd(mem, value, oldValue);

2. Support atomic functions in the situations that direct translation
   cannot handle.
We will modify the intermediate tree to make direct translation work
on all these situations.
e.g.
   atomicExchange(mem, value);
-> int oldValue;
   oldValue = atomicExchange(mem, value);

   int oldValue = atomicAdd(mem, value);
-> int oldValue;
   oldValue = atomicAdd(mem, value);

   return atomicAdd(mem, value);
-> int temp;
   temp = atomicAdd(mem, value);
   return temp;

   for (i = 0; i < atomicAdd(mem, value); ++i)
-> int temp;
   temp = atomicAdd(mem, value);
   for (i = 0; i < temp; ++i)
   {
       ...
       temp = atomicAdd(mem, value);
   }

   int result = isTrue ? atomicAdd(mem, value) : 0;
-> int result;
   if (isTrue)
   {
       result = atomicAdd(mem, value);
   }
   else
   {
       result = 0;
   }

This patch completes Step 1, which mainly focuses on the direct
translation from GLSL atomic functions to HLSL ones.

BUG=angleproject:2682
TEST=angle_end2end_tests

Change-Id: I3b655b6e286dad4fd97f255f7fe87521c94db30c
Reviewed-on: https://chromium-review.googlesource.com/1121835
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Olli Etuaho <oetuaho@nvidia.com>
diff --git a/src/tests/gl_tests/ComputeShaderTest.cpp b/src/tests/gl_tests/ComputeShaderTest.cpp
index bade1f5..a22eaad 100644
--- a/src/tests/gl_tests/ComputeShaderTest.cpp
+++ b/src/tests/gl_tests/ComputeShaderTest.cpp
@@ -20,34 +20,35 @@
   protected:
     ComputeShaderTest() {}
 
-    template <GLint kWidth, GLint kHeight>
+    template <class T, GLint kWidth, GLint kHeight>
     void runSharedMemoryTest(const char *csSource,
-                             const std::array<GLuint, kWidth * kHeight> &inputData,
-                             const std::array<GLuint, kWidth * kHeight> &expectedValues)
+                             GLenum internalFormat,
+                             GLenum format,
+                             const std::array<T, kWidth * kHeight> &inputData,
+                             const std::array<T, kWidth * kHeight> &expectedValues)
     {
         GLTexture texture[2];
         GLFramebuffer framebuffer;
 
         glBindTexture(GL_TEXTURE_2D, texture[0]);
-        glTexStorage2D(GL_TEXTURE_2D, 1, GL_R32UI, kWidth, kHeight);
-        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, kWidth, kHeight, GL_RED_INTEGER, GL_UNSIGNED_INT,
+        glTexStorage2D(GL_TEXTURE_2D, 1, internalFormat, kWidth, kHeight);
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, kWidth, kHeight, GL_RED_INTEGER, format,
                         inputData.data());
         EXPECT_GL_NO_ERROR();
 
-        constexpr GLuint initData[kWidth * kHeight] = {};
+        constexpr T initData[kWidth * kHeight] = {};
         glBindTexture(GL_TEXTURE_2D, texture[1]);
-        glTexStorage2D(GL_TEXTURE_2D, 1, GL_R32UI, kWidth, kHeight);
-        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, kWidth, kHeight, GL_RED_INTEGER, GL_UNSIGNED_INT,
-                        initData);
+        glTexStorage2D(GL_TEXTURE_2D, 1, internalFormat, kWidth, kHeight);
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, kWidth, kHeight, GL_RED_INTEGER, format, initData);
         EXPECT_GL_NO_ERROR();
 
         ANGLE_GL_COMPUTE_PROGRAM(program, csSource);
         glUseProgram(program.get());
 
-        glBindImageTexture(0, texture[0], 0, GL_FALSE, 0, GL_READ_ONLY, GL_R32UI);
+        glBindImageTexture(0, texture[0], 0, GL_FALSE, 0, GL_READ_ONLY, internalFormat);
         EXPECT_GL_NO_ERROR();
 
-        glBindImageTexture(1, texture[1], 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32UI);
+        glBindImageTexture(1, texture[1], 0, GL_FALSE, 0, GL_WRITE_ONLY, internalFormat);
         EXPECT_GL_NO_ERROR();
 
         glDispatchCompute(1, 1, 1);
@@ -55,14 +56,14 @@
 
         glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
 
-        GLuint outputValues[kWidth * kHeight];
+        T outputValues[kWidth * kHeight] = {};
         glUseProgram(0);
         glBindFramebuffer(GL_READ_FRAMEBUFFER, framebuffer);
 
         glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, texture[1],
                                0);
         EXPECT_GL_NO_ERROR();
-        glReadPixels(0, 0, kWidth, kHeight, GL_RED_INTEGER, GL_UNSIGNED_INT, outputValues);
+        glReadPixels(0, 0, kWidth, kHeight, GL_RED_INTEGER, format, outputValues);
         EXPECT_GL_NO_ERROR();
 
         for (int i = 0; i < kWidth * kHeight; i++)
@@ -1386,7 +1387,8 @@
 
     const std::array<GLuint, 4> inputData      = {{250, 200, 150, 100}};
     const std::array<GLuint, 4> expectedValues = {{250, 200, 150, 250}};
-    runSharedMemoryTest<2, 2>(kCSShader, inputData, expectedValues);
+    runSharedMemoryTest<GLuint, 2, 2>(kCSShader, GL_R32UI, GL_UNSIGNED_INT, inputData,
+                                      expectedValues);
 }
 
 // Verify shared non-struct array variables can work correctly.
@@ -1410,7 +1412,8 @@
 
     const std::array<GLuint, 4> inputData      = {{250, 200, 150, 100}};
     const std::array<GLuint, 4> expectedValues = {{250, 150, 200, 100}};
-    runSharedMemoryTest<2, 2>(kCSShader, inputData, expectedValues);
+    runSharedMemoryTest<GLuint, 2, 2>(kCSShader, GL_R32UI, GL_UNSIGNED_INT, inputData,
+                                      expectedValues);
 }
 
 // Verify shared struct array variables work correctly.
@@ -1438,7 +1441,111 @@
 
     const std::array<GLuint, 4> inputData      = {{250, 200, 150, 100}};
     const std::array<GLuint, 4> expectedValues = {{250, 150, 200, 100}};
-    runSharedMemoryTest<2, 2>(kCSShader, inputData, expectedValues);
+    runSharedMemoryTest<GLuint, 2, 2>(kCSShader, GL_R32UI, GL_UNSIGNED_INT, inputData,
+                                      expectedValues);
+}
+
+// Verify using atomic functions without return value can work correctly.
+// TODO(jiawei.shao@intel.com): add test on atomicExchange and atomicCompSwap.
+TEST_P(ComputeShaderTest, AtomicFunctionsNoReturnValue)
+{
+    // TODO(jiawei.shao@intel.com): find out why this shader causes a link error on Android Nexus 5
+    // bot.
+    ANGLE_SKIP_TEST_IF(IsAndroid());
+
+    const char kCSShader[] =
+        R"(#version 310 es
+        layout (local_size_x = 6, local_size_y = 1, local_size_z = 1) in;
+        layout (r32ui, binding = 0) readonly uniform highp uimage2D srcImage;
+        layout (r32ui, binding = 1) writeonly uniform highp uimage2D dstImage;
+
+        const uint kSumIndex = 0u;
+        const uint kMinIndex = 1u;
+        const uint kMaxIndex = 2u;
+        const uint kOrIndex = 3u;
+        const uint kAndIndex = 4u;
+        const uint kXorIndex = 5u;
+
+        shared highp uint results[6];
+
+        void main()
+        {
+            if (gl_LocalInvocationID.x == kMinIndex || gl_LocalInvocationID.x == kAndIndex)
+            {
+                results[gl_LocalInvocationID.x] = 0xFFFFu;
+            }
+            else
+            {
+                results[gl_LocalInvocationID.x] = 0u;
+            }
+            memoryBarrierShared();
+            barrier();
+
+            uint value = imageLoad(srcImage, ivec2(gl_LocalInvocationID.xy)).x;
+            atomicAdd(results[kSumIndex], value);
+            atomicMin(results[kMinIndex], value);
+            atomicMax(results[kMaxIndex], value);
+            atomicOr(results[kOrIndex], value);
+            atomicAnd(results[kAndIndex], value);
+            atomicXor(results[kXorIndex], value);
+            memoryBarrierShared();
+            barrier();
+
+            imageStore(dstImage, ivec2(gl_LocalInvocationID.xy),
+                       uvec4(results[gl_LocalInvocationID.x]));
+        })";
+
+    const std::array<GLuint, 6> inputData      = {{1, 2, 4, 8, 16, 32}};
+    const std::array<GLuint, 6> expectedValues = {{63, 1, 32, 63, 0, 63}};
+    runSharedMemoryTest<GLuint, 6, 1>(kCSShader, GL_R32UI, GL_UNSIGNED_INT, inputData,
+                                      expectedValues);
+}
+
+// Verify using atomic functions in a non-initializer single assignment can work correctly.
+TEST_P(ComputeShaderTest, AtomicFunctionsInNonInitializerSingleAssignment)
+{
+    const char kCSShader[] =
+        R"(#version 310 es
+        layout (local_size_x = 9, local_size_y = 1, local_size_z = 1) in;
+        layout (r32i, binding = 0) readonly uniform highp iimage2D srcImage;
+        layout (r32i, binding = 1) writeonly uniform highp iimage2D dstImage;
+
+        shared highp int sharedVariable;
+
+        shared highp int inputData[9];
+        shared highp int outputData[9];
+
+        void main()
+        {
+            int inputValue = imageLoad(srcImage, ivec2(gl_LocalInvocationID.xy)).x;
+            inputData[gl_LocalInvocationID.x] = inputValue;
+            memoryBarrierShared();
+            barrier();
+
+            if (gl_LocalInvocationID.x == 0u)
+            {
+                sharedVariable = 0;
+
+                outputData[0] = atomicAdd(sharedVariable, inputData[0]);
+                outputData[1] = atomicMin(sharedVariable, inputData[1]);
+                outputData[2] = atomicMax(sharedVariable, inputData[2]);
+                outputData[3] = atomicAnd(sharedVariable, inputData[3]);
+                outputData[4] = atomicOr(sharedVariable, inputData[4]);
+                outputData[5] = atomicXor(sharedVariable, inputData[5]);
+                outputData[6] = atomicExchange(sharedVariable, inputData[6]);
+                outputData[7] = atomicCompSwap(sharedVariable, 64, inputData[7]);
+                outputData[8] = atomicAdd(sharedVariable, inputData[8]);
+            }
+            memoryBarrierShared();
+            barrier();
+
+            imageStore(dstImage, ivec2(gl_LocalInvocationID.xy),
+                       ivec4(outputData[gl_LocalInvocationID.x]));
+        })";
+
+    const std::array<GLint, 9> inputData      = {{1, 2, 4, 8, 16, 32, 64, 128, 1}};
+    const std::array<GLint, 9> expectedValues = {{0, 1, 1, 4, 0, 16, 48, 64, 128}};
+    runSharedMemoryTest<GLint, 9, 1>(kCSShader, GL_R32I, GL_INT, inputData, expectedValues);
 }
 
 // Check that it is not possible to create a compute shader when the context does not support ES