Split VAO dirty bits to speed iteration.

Using more than 64 dirty bits (we had over 90) forces a much slower
dirty bit iteration. Speed this up by splitting the dirty bits into two
levels.
The top level has only a single dirty bit per attrib, a single dirty
bit per binding, and one bit for the element array buffer. The second
level has a separate set of dirty bits for each attrib and each binding.

The D3D11 back-end doesn't actually care about individual dirty bits
of attribs or bindings, since it resets entire attributes at a time,
but the GL back-end only refreshes the necessary info.

Improves the score of a simple state change microbenchmark by 15% on
the D3D11 and GL back-ends with a no-op driver. Real-world impact will
be smaller.

Also includes a test suppression for an NVIDIA bug that surfaced when
we changed the order in which GL commands were sent to the driver.

BUG=angleproject:2389

Change-Id: If8d5e5eb0b27e2a77e20535e33626183d372d311
Reviewed-on: https://chromium-review.googlesource.com/556799
Reviewed-by: Geoff Lang <geofflang@chromium.org>
Reviewed-by: Yuly Novikov <ynovikov@chromium.org>
Commit-Queue: Jamie Madill <jmadill@chromium.org>
diff --git a/src/libANGLE/VertexArray.cpp b/src/libANGLE/VertexArray.cpp
index d738065..1013774 100644
--- a/src/libANGLE/VertexArray.cpp
+++ b/src/libANGLE/VertexArray.cpp
@@ -111,7 +111,19 @@
     static_assert(gl::MAX_VERTEX_ATTRIBS == gl::MAX_VERTEX_ATTRIB_BINDINGS,
                   "The stride of vertex attributes should equal to that of vertex bindings.");
     ASSERT(dirtyBit > DIRTY_BIT_ELEMENT_ARRAY_BUFFER);
-    return (dirtyBit - DIRTY_BIT_ATTRIB_0_ENABLED) % gl::MAX_VERTEX_ATTRIBS;
+    return (dirtyBit - DIRTY_BIT_ATTRIB_0) % gl::MAX_VERTEX_ATTRIBS;
+}
+
+void VertexArray::setDirtyAttribBit(size_t attribIndex, DirtyAttribBitType dirtyAttribBit)
+{
+    mDirtyBits.set(DIRTY_BIT_ATTRIB_0 + attribIndex);
+    mDirtyAttribBits[attribIndex].set(dirtyAttribBit);
+}
+
+void VertexArray::setDirtyBindingBit(size_t bindingIndex, DirtyBindingBitType dirtyBindingBit)
+{
+    mDirtyBits.set(DIRTY_BIT_BINDING_0 + bindingIndex);
+    mDirtyBindingBits[bindingIndex].set(dirtyBindingBit);
 }
 
 void VertexArray::bindVertexBufferImpl(const Context *context,
@@ -137,8 +149,7 @@
                                    GLsizei stride)
 {
     bindVertexBufferImpl(context, bindingIndex, boundBuffer, offset, stride);
-
-    mDirtyBits.set(DIRTY_BIT_BINDING_0_BUFFER + bindingIndex);
+    setDirtyBindingBit(bindingIndex, DIRTY_BINDING_BUFFER);
 }
 
 void VertexArray::setVertexAttribBinding(const Context *context,
@@ -153,8 +164,9 @@
         ASSERT(context->getClientVersion() >= ES_3_1);
         mState.mVertexAttributes[attribIndex].bindingIndex = bindingIndex;
 
-        mDirtyBits.set(DIRTY_BIT_ATTRIB_0_BINDING + attribIndex);
+        setDirtyAttribBit(attribIndex, DIRTY_ATTRIB_BINDING);
     }
+    mState.mVertexAttributes[attribIndex].bindingIndex = static_cast<GLuint>(bindingIndex);
 }
 
 void VertexArray::setVertexBindingDivisor(size_t bindingIndex, GLuint divisor)
@@ -162,8 +174,7 @@
     ASSERT(bindingIndex < getMaxBindings());
 
     mState.mVertexBindings[bindingIndex].setDivisor(divisor);
-
-    mDirtyBits.set(DIRTY_BIT_BINDING_0_DIVISOR + bindingIndex);
+    setDirtyBindingBit(bindingIndex, DIRTY_BINDING_DIVISOR);
 }
 
 void VertexArray::setVertexAttribFormatImpl(size_t attribIndex,
@@ -194,8 +205,7 @@
                                         GLuint relativeOffset)
 {
     setVertexAttribFormatImpl(attribIndex, size, type, normalized, pureInteger, relativeOffset);
-
-    mDirtyBits.set(DIRTY_BIT_ATTRIB_0_FORMAT + attribIndex);
+    setDirtyAttribBit(attribIndex, DIRTY_ATTRIB_FORMAT);
 }
 
 void VertexArray::setVertexAttribDivisor(const Context *context, size_t attribIndex, GLuint divisor)
@@ -214,7 +224,7 @@
     mState.mVertexAttributesTypeMask.setIndex(
         GetVertexAttributeBaseType(mState.mVertexAttributes[attribIndex]), attribIndex);
 
-    mDirtyBits.set(DIRTY_BIT_ATTRIB_0_ENABLED + attribIndex);
+    setDirtyAttribBit(attribIndex, DIRTY_ATTRIB_ENABLED);
 
     // Update state cache
     mState.mEnabledAttributesMask.set(attribIndex, enabledState);
@@ -246,7 +256,7 @@
 
     bindVertexBufferImpl(context, attribIndex, boundBuffer, offset, effectiveStride);
 
-    mDirtyBits.set(DIRTY_BIT_ATTRIB_0_POINTER + attribIndex);
+    setDirtyAttribBit(attribIndex, DIRTY_ATTRIB_POINTER);
 }
 
 void VertexArray::setElementArrayBuffer(const Context *context, Buffer *buffer)
@@ -264,8 +274,15 @@
 {
     if (mDirtyBits.any())
     {
-        mVertexArray->syncState(context, mDirtyBits);
+        mVertexArray->syncState(context, mDirtyBits, mDirtyAttribBits, mDirtyBindingBits);
         mDirtyBits.reset();
+
+        // This is a bit of an implementation hack - but since we know the implementation
+        // details of the dirty bit class it should always have the same effect as iterating
+        // individual attribs. We could also look into schemes where iterating the dirty
+        // bit set also resets it as you pass through it.
+        memset(&mDirtyAttribBits, 0, sizeof(mDirtyAttribBits));
+        memset(&mDirtyBindingBits, 0, sizeof(mDirtyBindingBits));
     }
 }