Optimize glDrawElements performance

A call to glDrawElements results in a call depth of up to 4:
* glDrawElements
* gl::Context::drawElements
* rx::ContextGL::drawElements
* VertexArrayGL::syncDrawState

Each function call has to save and restore a lot of registers, which
results in a stall in the prologue of rx::ContextGL::drawElements
due to memory bandwidth limitations.

The main change is that gl::Context::drawElements is now inlined,
which reduces the call depth by one. In addition, the call to
ContextGL::syncDrawElementsState is now guarded so that it is made
only when required. Finally, a few small getter functions have been
inlined where the calling code was bigger than the function itself.

In total, this change improves the performance of the
DrawElementsPerfBenchmark.Run/gl benchmark by 16%.

Bug: angleproject:2966

Change-Id: I423d18452f2f5b520ab52850fda2054e1da86991
Reviewed-on: https://chromium-review.googlesource.com/c/1389988
Reviewed-by: Jamie Madill <jmadill@chromium.org>
Commit-Queue: Markus Tavenrath <matavenrath@nvidia.com>
diff --git a/src/libANGLE/Context.cpp b/src/libANGLE/Context.cpp
index 5a62efe..b9b1b4e 100644
--- a/src/libANGLE/Context.cpp
+++ b/src/libANGLE/Context.cpp
@@ -8,6 +8,7 @@
 // rendering operations. It is the GLES2 specific implementation of EGLContext.
 
 #include "libANGLE/Context.h"
+#include "libANGLE/Context.inl.h"
 
 #include <string.h>
 #include <iterator>
@@ -25,7 +26,6 @@
 #include "libANGLE/Fence.h"
 #include "libANGLE/Framebuffer.h"
 #include "libANGLE/FramebufferAttachment.h"
-#include "libANGLE/GLES1Renderer.h"
 #include "libANGLE/Path.h"
 #include "libANGLE/Program.h"
 #include "libANGLE/ProgramPipeline.h"
@@ -42,7 +42,6 @@
 #include "libANGLE/queryconversions.h"
 #include "libANGLE/queryutils.h"
 #include "libANGLE/renderer/BufferImpl.h"
-#include "libANGLE/renderer/ContextImpl.h"
 #include "libANGLE/renderer/EGLImplFactory.h"
 #include "libANGLE/renderer/Format.h"
 #include "libANGLE/validationES.h"
@@ -2241,21 +2240,6 @@
     MarkTransformFeedbackBufferUsage(this, count, instanceCount);
 }
 
-void Context::drawElements(PrimitiveMode mode,
-                           GLsizei count,
-                           DrawElementsType type,
-                           const void *indices)
-{
-    // No-op if count draws no primitives for given mode
-    if (noopDraw(mode, count))
-    {
-        return;
-    }
-
-    ANGLE_CONTEXT_TRY(prepareForDraw(mode));
-    ANGLE_CONTEXT_TRY(mImplementation->drawElements(this, mode, count, type, indices));
-}
-
 void Context::drawElementsInstanced(PrimitiveMode mode,
                                     GLsizei count,
                                     DrawElementsType type,
@@ -3516,40 +3500,6 @@
     return (instanceCount == 0) || noopDraw(mode, count);
 }
 
-ANGLE_INLINE angle::Result Context::syncDirtyBits()
-{
-    const State::DirtyBits &dirtyBits = mGLState.getDirtyBits();
-    ANGLE_TRY(mImplementation->syncState(this, dirtyBits, mAllDirtyBits));
-    mGLState.clearDirtyBits();
-    return angle::Result::Continue;
-}
-
-ANGLE_INLINE angle::Result Context::syncDirtyBits(const State::DirtyBits &bitMask)
-{
-    const State::DirtyBits &dirtyBits = (mGLState.getDirtyBits() & bitMask);
-    ANGLE_TRY(mImplementation->syncState(this, dirtyBits, bitMask));
-    mGLState.clearDirtyBits(dirtyBits);
-    return angle::Result::Continue;
-}
-
-ANGLE_INLINE angle::Result Context::syncDirtyObjects(const State::DirtyObjects &objectMask)
-{
-    return mGLState.syncDirtyObjects(this, objectMask);
-}
-
-ANGLE_INLINE angle::Result Context::prepareForDraw(PrimitiveMode mode)
-{
-    if (mGLES1Renderer)
-    {
-        ANGLE_TRY(mGLES1Renderer->prepareForDraw(mode, this, &mGLState));
-    }
-
-    ANGLE_TRY(syncDirtyObjects(mDrawDirtyObjects));
-    ASSERT(!isRobustResourceInitEnabled() ||
-           !mGLState.getDrawFramebuffer()->hasResourceThatNeedsInit());
-    return syncDirtyBits();
-}
-
 angle::Result Context::prepareForClear(GLbitfield mask)
 {
     ANGLE_TRY(syncDirtyObjects(mClearDirtyObjects));
@@ -8495,7 +8445,7 @@
 
 void StateCache::updateTransformFeedbackActiveUnpaused(Context *context)
 {
-    TransformFeedback *xfb = context->getGLState().getCurrentTransformFeedback();
+    TransformFeedback *xfb                 = context->getGLState().getCurrentTransformFeedback();
     mCachedTransformFeedbackActiveUnpaused = xfb && xfb->isActive() && !xfb->isPaused();
 }
 }  // namespace gl