Micro-optimize math in IndexDataManager

Use bit shifts instead of division and multiplication by the index element
size, since integer division is expensive on multiple CPU architectures.
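
As a rough sketch of the equivalence this relies on (the values below are
illustrative only, not taken from the ANGLE sources): for a power-of-two
element size, dividing or multiplying an offset by the size gives the same
result as shifting by log2 of the size.

    // Hypothetical example: GLushort indices are 2 bytes wide, so the
    // corresponding shift amount is log2(2) == 1.
    const unsigned int bytes = 2;
    const unsigned int bytesShift = 1;
    unsigned int offset = 20;
    unsigned int byDivision = offset / bytes;        // 10
    unsigned int byShift    = offset >> bytesShift;  // 10, same result

Because the element size comes from run-time type info rather than a
compile-time constant, the compiler cannot perform this strength reduction
on its own, hence the precomputed bytesShift.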

BUG=angleproject:956
TEST=angle_end2end_tests

Change-Id: I57ab540d447c03dae5a96bafb4975fc37e310261
Reviewed-on: https://chromium-review.googlesource.com/262181
Tested-by: Olli Etuaho <oetuaho@nvidia.com>
Reviewed-by: Nicolas Capens <capn@chromium.org>
Tested-by: Jamie Madill <jmadill@chromium.org>
diff --git a/src/libANGLE/renderer/d3d/IndexDataManager.cpp b/src/libANGLE/renderer/d3d/IndexDataManager.cpp
index f29a6f2..74d1022 100644
--- a/src/libANGLE/renderer/d3d/IndexDataManager.cpp
+++ b/src/libANGLE/renderer/d3d/IndexDataManager.cpp
@@ -86,6 +86,8 @@
 
         storage = GetImplAs<BufferD3D>(buffer);
 
+        // We'll trust that the compiler will optimize the % below:
+        // the operands are unsigned and the divisor is a constant.
         switch (type)
         {
           case GL_UNSIGNED_BYTE:  alignedOffset = (offset % sizeof(GLubyte) == 0);  break;
@@ -127,7 +129,8 @@
 
         if (!staticBuffer->getIndexRangeCache()->findRange(type, offset, count, NULL, &streamOffset))
         {
-            streamOffset = (offset / typeInfo.bytes) * gl::GetTypeInfo(destinationIndexType).bytes;
+            // Using bit-shift here is faster than using division.
+            streamOffset = (offset >> typeInfo.bytesShift) << gl::GetTypeInfo(destinationIndexType).bytesShift;
             staticBuffer->getIndexRangeCache()->addRange(type, offset, count, translated->indexRange, streamOffset);
         }
         if (!buffer->getIndexRangeCache()->findRange(type, offset, count, nullptr, nullptr))
@@ -162,7 +165,8 @@
             if (staticBuffer->getBufferSize() == 0 && alignedOffset)
             {
                 indexBuffer = staticBuffer;
-                convertCount = storage->getSize() / typeInfo.bytes;
+                // Using bit-shift here is faster than using division.
+                convertCount = storage->getSize() >> typeInfo.bytesShift;
             }
             else
             {
@@ -173,13 +177,14 @@
 
         ASSERT(indexBuffer);
 
-        if (convertCount > std::numeric_limits<unsigned int>::max() / destTypeInfo.bytes)
+        // Using bit-shift here is faster than using division.
+        if (convertCount > (std::numeric_limits<unsigned int>::max() >> destTypeInfo.bytesShift))
         {
             return gl::Error(GL_OUT_OF_MEMORY, "Reserving %u indices of %u bytes each exceeds the maximum buffer size.",
                              convertCount, destTypeInfo.bytes);
         }
 
-        unsigned int bufferSizeRequired = convertCount * destTypeInfo.bytes;
+        unsigned int bufferSizeRequired = convertCount << destTypeInfo.bytesShift;
         error = indexBuffer->reserveBufferSpace(bufferSizeRequired, type);
         if (error.isError())
         {
@@ -212,7 +217,8 @@
 
         if (staticBuffer)
         {
-            streamOffset = (offset / typeInfo.bytes) * destTypeInfo.bytes;
+            // Using bit-shift here is faster than using division.
+            streamOffset = (offset >> typeInfo.bytesShift) << destTypeInfo.bytesShift;
             staticBuffer->getIndexRangeCache()->addRange(type, offset, count, translated->indexRange, streamOffset);
         }
     }
@@ -220,13 +226,14 @@
     translated->storage = directStorage ? storage : NULL;
     translated->indexBuffer = indexBuffer ? indexBuffer->getIndexBuffer() : NULL;
     translated->serial = directStorage ? storage->getSerial() : indexBuffer->getSerial();
-    translated->startIndex = streamOffset / destTypeInfo.bytes;
+    // Using bit-shift here is faster than using division.
+    translated->startIndex = (streamOffset >> destTypeInfo.bytesShift);
     translated->startOffset = streamOffset;
     translated->indexType = destinationIndexType;
 
     if (storage)
     {
-        storage->promoteStaticUsage(count * typeInfo.bytes);
+        storage->promoteStaticUsage(count << typeInfo.bytesShift);
     }
 
     return gl::Error(GL_NO_ERROR);