Micro-optimize math in IndexDataManager
Replace divisions and multiplications by the index type size with bit shifts
(using the type's bytesShift); integer division is expensive on many CPU
architectures.
BUG=angleproject:956
TEST=angle_end2end_tests
Change-Id: I57ab540d447c03dae5a96bafb4975fc37e310261
Reviewed-on: https://chromium-review.googlesource.com/262181
Tested-by: Olli Etuaho <oetuaho@nvidia.com>
Reviewed-by: Nicolas Capens <capn@chromium.org>
Tested-by: Jamie Madill <jmadill@chromium.org>
diff --git a/src/libANGLE/renderer/d3d/IndexDataManager.cpp b/src/libANGLE/renderer/d3d/IndexDataManager.cpp
index f29a6f2..74d1022 100644
--- a/src/libANGLE/renderer/d3d/IndexDataManager.cpp
+++ b/src/libANGLE/renderer/d3d/IndexDataManager.cpp
@@ -86,6 +86,8 @@
storage = GetImplAs<BufferD3D>(buffer);
+ // We'll trust that the compiler will optimize the % below:
+ // the operands are unsigned and the divisor is a constant.
switch (type)
{
case GL_UNSIGNED_BYTE: alignedOffset = (offset % sizeof(GLubyte) == 0); break;
@@ -127,7 +129,8 @@
if (!staticBuffer->getIndexRangeCache()->findRange(type, offset, count, NULL, &streamOffset))
{
- streamOffset = (offset / typeInfo.bytes) * gl::GetTypeInfo(destinationIndexType).bytes;
+ // Using bit-shift here is faster than using division.
+ streamOffset = (offset >> typeInfo.bytesShift) << gl::GetTypeInfo(destinationIndexType).bytesShift;
staticBuffer->getIndexRangeCache()->addRange(type, offset, count, translated->indexRange, streamOffset);
}
if (!buffer->getIndexRangeCache()->findRange(type, offset, count, nullptr, nullptr))
@@ -162,7 +165,8 @@
if (staticBuffer->getBufferSize() == 0 && alignedOffset)
{
indexBuffer = staticBuffer;
- convertCount = storage->getSize() / typeInfo.bytes;
+ // Using bit-shift here is faster than using division.
+ convertCount = storage->getSize() >> typeInfo.bytesShift;
}
else
{
@@ -173,13 +177,14 @@
ASSERT(indexBuffer);
- if (convertCount > std::numeric_limits<unsigned int>::max() / destTypeInfo.bytes)
+ // Using bit-shift here is faster than using division.
+ if (convertCount > (std::numeric_limits<unsigned int>::max() >> destTypeInfo.bytesShift))
{
return gl::Error(GL_OUT_OF_MEMORY, "Reserving %u indices of %u bytes each exceeds the maximum buffer size.",
convertCount, destTypeInfo.bytes);
}
- unsigned int bufferSizeRequired = convertCount * destTypeInfo.bytes;
+ unsigned int bufferSizeRequired = convertCount << destTypeInfo.bytesShift;
error = indexBuffer->reserveBufferSpace(bufferSizeRequired, type);
if (error.isError())
{
@@ -212,7 +217,8 @@
if (staticBuffer)
{
- streamOffset = (offset / typeInfo.bytes) * destTypeInfo.bytes;
+ // Using bit-shift here is faster than using division.
+ streamOffset = (offset >> typeInfo.bytesShift) << destTypeInfo.bytesShift;
staticBuffer->getIndexRangeCache()->addRange(type, offset, count, translated->indexRange, streamOffset);
}
}
@@ -220,13 +226,14 @@
translated->storage = directStorage ? storage : NULL;
translated->indexBuffer = indexBuffer ? indexBuffer->getIndexBuffer() : NULL;
translated->serial = directStorage ? storage->getSerial() : indexBuffer->getSerial();
- translated->startIndex = streamOffset / destTypeInfo.bytes;
+ // Using bit-shift here is faster than using division.
+ translated->startIndex = (streamOffset >> destTypeInfo.bytesShift);
translated->startOffset = streamOffset;
translated->indexType = destinationIndexType;
if (storage)
{
- storage->promoteStaticUsage(count * typeInfo.bytes);
+ storage->promoteStaticUsage(count << typeInfo.bytesShift);
}
return gl::Error(GL_NO_ERROR);