D3D: Rework varying packing code.

In D3D we pack varyings by making a register map, and using the
recommended GLSL ES algorithm to reserve register space. We use
this map to assign row and column slots to each varying and then
produce a semantic index value.

The existing scheme had a number of bugs, and was failing several
angle_end2end_tests. The new design cleans up the code somewhat
and uses a different counting scheme for the semantic indexes:
just sort the varyings in packing order and use a simple
incrementing semantic index per varying. In SM4+, the HLSL compiler
sorts and packs the varyings correctly itself; in SM3, we handle
the cases we don't support by returning an error instead of failing
with a D3D compiler link error.

Also refactor how we store varying information for Transform
Feedback/StreamOut. Only store the necessary D3D information,
instead of extra information like the name and type.

This fixes several tests in GLSLTest/*. It will also allow us to
fix interpolation qualifier packing and structure packing in HLSL,
which seems to work differently from the packing of the other
varying types.

BUG=angleproject:1202
TEST=bots,dEQP-GLES3.functional.transform_feedback.*

Change-Id: Ie5bfbb4f71d8bf97f39115fc46d2e61b131df639
Reviewed-on: https://chromium-review.googlesource.com/311241
Reviewed-by: Geoff Lang <geofflang@chromium.org>
Tested-by: Jamie Madill <jmadill@chromium.org>
diff --git a/src/libANGLE/renderer/d3d/ProgramD3D.cpp b/src/libANGLE/renderer/d3d/ProgramD3D.cpp
index d937441..58ac5f8 100644
--- a/src/libANGLE/renderer/d3d/ProgramD3D.cpp
+++ b/src/libANGLE/renderer/d3d/ProgramD3D.cpp
@@ -101,6 +101,12 @@
     const ProgramD3D::SemanticIndexArray *originalIndices;
 };
 
+// true if varying x has a higher priority in packing than y
+bool ComparePackedVarying(const PackedVarying &x, const PackedVarying &y)
+{
+    return gl::CompareVarying(*x.varying, *y.varying);
+}
+
 std::vector<PackedVarying> MergeVaryings(const gl::Shader &vertexShader,
                                          const gl::Shader &fragmentShader,
                                          const std::vector<std::string> &tfVaryings)
@@ -142,6 +148,8 @@
         }
     }
 
+    std::sort(packedVaryings.begin(), packedVaryings.end(), ComparePackedVarying);
+
     return packedVaryings;
 }
 
@@ -344,22 +352,18 @@
 
 // D3DVarying Implementation
 
-D3DVarying::D3DVarying()
+D3DVarying::D3DVarying() : semanticIndex(0), componentCount(0), outputSlot(0)
 {
 }
 
-D3DVarying::D3DVarying(const std::string &name,
-                       GLenum type,
-                       GLsizei size,
-                       const std::string &semanticName,
-                       unsigned int semanticIndex,
-                       unsigned int semanticIndexCount)
-    : name(name),
-      type(type),
-      size(size),
-      semanticName(semanticName),
-      semanticIndex(semanticIndex),
-      semanticIndexCount(semanticIndexCount)
+D3DVarying::D3DVarying(const std::string &semanticNameIn,
+                       unsigned int semanticIndexIn,
+                       unsigned int componentCountIn,
+                       unsigned int outputSlotIn)
+    : semanticName(semanticNameIn),
+      semanticIndex(semanticIndexIn),
+      componentCount(componentCountIn),
+      outputSlot(outputSlotIn)
 {
 }
 
@@ -781,19 +785,16 @@
         mD3DUniformBlocks.push_back(uniformBlock);
     }
 
-    const unsigned int transformFeedbackVaryingCount = stream->readInt<unsigned int>();
-    mTransformFeedbackD3DVaryings.resize(transformFeedbackVaryingCount);
-    for (unsigned int varyingIndex = 0; varyingIndex < transformFeedbackVaryingCount;
-         varyingIndex++)
+    const unsigned int streamOutVaryingCount = stream->readInt<unsigned int>();
+    mStreamOutVaryings.resize(streamOutVaryingCount);
+    for (unsigned int varyingIndex = 0; varyingIndex < streamOutVaryingCount; ++varyingIndex)
     {
-        D3DVarying *varying = &mTransformFeedbackD3DVaryings[varyingIndex];
+        D3DVarying *varying = &mStreamOutVaryings[varyingIndex];
 
-        stream->readString(&varying->name);
-        stream->readInt(&varying->type);
-        stream->readInt(&varying->size);
         stream->readString(&varying->semanticName);
         stream->readInt(&varying->semanticIndex);
-        stream->readInt(&varying->semanticIndexCount);
+        stream->readInt(&varying->componentCount);
+        stream->readInt(&varying->outputSlot);
     }
 
     stream->readString(&mVertexHLSL);
@@ -839,7 +840,7 @@
         ShaderExecutableD3D *shaderExecutable = nullptr;
 
         gl::Error error = mRenderer->loadExecutable(
-            vertexShaderFunction, vertexShaderSize, SHADER_VERTEX, mTransformFeedbackD3DVaryings,
+            vertexShaderFunction, vertexShaderSize, SHADER_VERTEX, mStreamOutVaryings,
             (mData.getTransformFeedbackBufferMode() == GL_SEPARATE_ATTRIBS), &shaderExecutable);
         if (error.isError())
         {
@@ -878,7 +879,7 @@
         ShaderExecutableD3D *shaderExecutable    = nullptr;
 
         gl::Error error = mRenderer->loadExecutable(
-            pixelShaderFunction, pixelShaderSize, SHADER_PIXEL, mTransformFeedbackD3DVaryings,
+            pixelShaderFunction, pixelShaderSize, SHADER_PIXEL, mStreamOutVaryings,
             (mData.getTransformFeedbackBufferMode() == GL_SEPARATE_ATTRIBS), &shaderExecutable);
         if (error.isError())
         {
@@ -911,8 +912,8 @@
         bool splitAttribs                           = (mData.getTransformFeedbackBufferMode() == GL_SEPARATE_ATTRIBS);
 
         gl::Error error = mRenderer->loadExecutable(
-            geometryShaderFunction, geometryShaderSize, SHADER_GEOMETRY,
-            mTransformFeedbackD3DVaryings, splitAttribs, &mGeometryExecutables[geometryExeIndex]);
+            geometryShaderFunction, geometryShaderSize, SHADER_GEOMETRY, mStreamOutVaryings,
+            splitAttribs, &mGeometryExecutables[geometryExeIndex]);
         if (error.isError())
         {
             return LinkResult(false, error);
@@ -985,17 +986,13 @@
         stream->writeInt(uniformBlock.vsRegisterIndex);
     }
 
-    stream->writeInt(mTransformFeedbackD3DVaryings.size());
-    for (size_t i = 0; i < mTransformFeedbackD3DVaryings.size(); i++)
+    stream->writeInt(mStreamOutVaryings.size());
+    for (const auto &varying : mStreamOutVaryings)
     {
-        const D3DVarying &varying = mTransformFeedbackD3DVaryings[i];
-
-        stream->writeString(varying.name);
-        stream->writeInt(varying.type);
-        stream->writeInt(varying.size);
         stream->writeString(varying.semanticName);
         stream->writeInt(varying.semanticIndex);
-        stream->writeInt(varying.semanticIndexCount);
+        stream->writeInt(varying.componentCount);
+        stream->writeInt(varying.outputSlot);
     }
 
     stream->writeString(mVertexHLSL);
@@ -1130,7 +1127,7 @@
     gl::InfoLog *currentInfoLog = infoLog ? infoLog : &tempInfoLog;
 
     gl::Error error = mRenderer->compileToExecutable(
-        *currentInfoLog, finalPixelHLSL, SHADER_PIXEL, mTransformFeedbackD3DVaryings,
+        *currentInfoLog, finalPixelHLSL, SHADER_PIXEL, mStreamOutVaryings,
         (mData.getTransformFeedbackBufferMode() == GL_SEPARATE_ATTRIBS), mPixelWorkarounds,
         &pixelExecutable);
     if (error.isError())
@@ -1179,7 +1176,7 @@
     gl::InfoLog *currentInfoLog = infoLog ? infoLog : &tempInfoLog;
 
     gl::Error error = mRenderer->compileToExecutable(
-        *currentInfoLog, finalVertexHLSL, SHADER_VERTEX, mTransformFeedbackD3DVaryings,
+        *currentInfoLog, finalVertexHLSL, SHADER_VERTEX, mStreamOutVaryings,
         (mData.getTransformFeedbackBufferMode() == GL_SEPARATE_ATTRIBS), mVertexWorkarounds,
         &vertexExecutable);
     if (error.isError())
@@ -1238,7 +1235,7 @@
     gl::InfoLog *currentInfoLog = infoLog ? infoLog : &tempInfoLog;
 
     gl::Error error = mRenderer->compileToExecutable(
-        *currentInfoLog, geometryHLSL, SHADER_GEOMETRY, mTransformFeedbackD3DVaryings,
+        *currentInfoLog, geometryHLSL, SHADER_GEOMETRY, mStreamOutVaryings,
         (mData.getTransformFeedbackBufferMode() == GL_SEPARATE_ATTRIBS), D3DCompilerWorkarounds(),
         &mGeometryExecutables[geometryShaderType]);
 
@@ -1353,9 +1350,9 @@
         MergeVaryings(*vertexShader, *fragmentShader, mData.getTransformFeedbackVaryingNames());
 
     // Map the varyings to the register file
-    unsigned int registerCount = 0;
-    if (!PackVaryings(*data.caps, infoLog, &packedVaryings,
-                      mData.getTransformFeedbackVaryingNames(), &registerCount))
+    VaryingPacking varyingPacking(data.caps->maxVaryingVectors);
+    if (!varyingPacking.packVaryings(infoLog, packedVaryings,
+                                     mData.getTransformFeedbackVaryingNames()))
     {
         return LinkResult(false, gl::Error(GL_NO_ERROR));
     }
@@ -1364,10 +1361,27 @@
                                 usesInstancedPointSpriteEmulation(), vertexShaderD3D,
                                 fragmentShaderD3D);
 
-    std::vector<D3DVarying> d3dVaryings;
-    if (!mDynamicHLSL->generateShaderLinkHLSL(data, mData, metadata, infoLog, registerCount,
-                                              &mPixelHLSL, &mVertexHLSL, packedVaryings,
-                                              &d3dVaryings))
+    varyingPacking.enableBuiltins(SHADER_VERTEX, metadata);
+    varyingPacking.enableBuiltins(SHADER_PIXEL, metadata);
+
+    if (static_cast<GLuint>(varyingPacking.getRegisterCount()) > data.caps->maxVaryingVectors)
+    {
+        infoLog << "No varying registers left to support gl_FragCoord/gl_PointCoord";
+        return LinkResult(false, gl::Error(GL_NO_ERROR));
+    }
+
+    // TODO(jmadill): Implement more sophisticated component packing in D3D9.
+    // We can fail here because we use one semantic per GLSL varying. D3D11 can pack varyings
+    // intelligently, but D3D9 assumes one semantic per register.
+    if (mRenderer->getRendererClass() == RENDERER_D3D9 &&
+        varyingPacking.getMaxSemanticIndex() > data.caps->maxVaryingVectors)
+    {
+        infoLog << "Cannot pack these varyings on D3D9.";
+        return LinkResult(false, gl::Error(GL_NO_ERROR));
+    }
+
+    if (!mDynamicHLSL->generateShaderLinkHLSL(data, mData, metadata, varyingPacking, &mPixelHLSL,
+                                              &mVertexHLSL))
     {
         return LinkResult(false, gl::Error(GL_NO_ERROR));
     }
@@ -1388,15 +1402,15 @@
 
     if (mRenderer->getMajorShaderModel() >= 4)
     {
-        mGeometryShaderPreamble = mDynamicHLSL->generateGeometryShaderPreamble(
-            data, mData, metadata, registerCount, packedVaryings);
+        varyingPacking.enableBuiltins(SHADER_GEOMETRY, metadata);
+        mGeometryShaderPreamble = mDynamicHLSL->generateGeometryShaderPreamble(varyingPacking);
     }
 
     initSemanticIndex();
 
     defineUniformsAndAssignRegisters();
 
-    gatherTransformFeedbackVaryings(d3dVaryings);
+    gatherTransformFeedbackVaryings(varyingPacking);
 
     LinkResult result = compileProgramExecutables(data, infoLog);
     if (result.error.isError() || !result.linkSuccess)
@@ -2113,7 +2127,7 @@
     std::fill(mSemanticIndexes, mSemanticIndexes + ArraySize(mSemanticIndexes), -1);
     std::fill(mAttributesByLayout, mAttributesByLayout + ArraySize(mAttributesByLayout), -1);
 
-    mTransformFeedbackD3DVaryings.clear();
+    mStreamOutVaryings.clear();
 
     mGeometryShaderPreamble.clear();
 }
@@ -2195,18 +2209,60 @@
     }
 }
 
-void ProgramD3D::gatherTransformFeedbackVaryings(const std::vector<D3DVarying> &d3dVaryings)
+void ProgramD3D::gatherTransformFeedbackVaryings(const VaryingPacking &varyingPacking)
 {
+    const auto &builtins = varyingPacking.builtins(SHADER_VERTEX);
+
+    const std::string &varyingSemantic =
+        GetVaryingSemantic(mRenderer->getMajorShaderModel(), usesPointSize());
+
     // Gather the linked varyings that are used for transform feedback, they should all exist.
-    mTransformFeedbackD3DVaryings.clear();
-    for (const std::string &tfVaryingName : mData.getTransformFeedbackVaryingNames())
+    mStreamOutVaryings.clear();
+
+    const auto &tfVaryingNames = mData.getTransformFeedbackVaryingNames();
+    for (unsigned int outputSlot = 0; outputSlot < static_cast<unsigned int>(tfVaryingNames.size());
+         ++outputSlot)
     {
-        for (const D3DVarying &d3dVarying : d3dVaryings)
+        const auto &tfVaryingName = tfVaryingNames[outputSlot];
+        if (tfVaryingName == "gl_Position")
         {
-            if (tfVaryingName == d3dVarying.name)
+            if (builtins.glPosition.enabled)
             {
-                mTransformFeedbackD3DVaryings.push_back(d3dVarying);
-                break;
+                mStreamOutVaryings.push_back(D3DVarying(builtins.glPosition.semantic,
+                                                        builtins.glPosition.index, 4, outputSlot));
+            }
+        }
+        else if (tfVaryingName == "gl_FragCoord")
+        {
+            if (builtins.glFragCoord.enabled)
+            {
+                mStreamOutVaryings.push_back(D3DVarying(builtins.glFragCoord.semantic,
+                                                        builtins.glFragCoord.index, 4, outputSlot));
+            }
+        }
+        else if (tfVaryingName == "gl_PointSize")
+        {
+            if (builtins.glPointSize.enabled)
+            {
+                mStreamOutVaryings.push_back(D3DVarying("PSIZE", 0, 1, outputSlot));
+            }
+        }
+        else
+        {
+            for (const PackedVaryingRegister &registerInfo : varyingPacking.getRegisterList())
+            {
+                const sh::Varying &varying = *registerInfo.packedVarying->varying;
+                GLenum transposedType      = gl::TransposeMatrixType(varying.type);
+                int componentCount = gl::VariableColumnCount(transposedType);
+                ASSERT(!varying.isBuiltIn());
+
+                // There can be more than one register assigned to a particular varying, and each
+                // register needs its own stream out entry.
+                if (tfVaryingName == varying.name)
+                {
+                    mStreamOutVaryings.push_back(D3DVarying(
+                        varyingSemantic, registerInfo.semanticIndex, componentCount, outputSlot));
+                }
             }
         }
     }