Merge commit 'f5ecac92' into manualmerge

Conflicts:
	driver/rsdBcc.cpp

Change-Id: I04a06c157165a77f3c5edffa637b9e42a02ef617
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 09981d2..ddcaac8 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -343,16 +343,39 @@
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
 
     if ((dc->mWorkers.mCount >= 1) && isThreadable && !dc->mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
         dc->mInForEach = true;
         if (mtls->fep.dimY > 1) {
-            mtls->mSliceSize = mtls->fep.dimY / (dc->mWorkers.mCount * 4);
+            uint32_t s1 = mtls->fep.dimY / ((dc->mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // Cap each slice at ~16KB of reads/writes (by output stride, or
+            // input stride if that is zero) to rate-limit the atomic ops.
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
             if(mtls->mSliceSize < 1) {
                 mtls->mSliceSize = 1;
             }
 
             rsdLaunchThreads(mrsc, wc_xy, mtls);
         } else {
-            mtls->mSliceSize = mtls->fep.dimX / (dc->mWorkers.mCount * 4);
+            uint32_t s1 = mtls->fep.dimX / ((dc->mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // Cap each slice at ~16KB of reads/writes (by output stride, or
+            // input stride if that is zero) to rate-limit the atomic ops.
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
             if(mtls->mSliceSize < 1) {
                 mtls->mSliceSize = 1;
             }