Add x86 server support.

Change-Id: I674acaf15b67afa48bc736f72942a11e2e38e940
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index c78b238..de71112 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -24,10 +24,20 @@
 #include <sys/types.h>
 #include <sys/resource.h>
 #include <sched.h>
-#include <cutils/properties.h>
 #include <sys/syscall.h>
 #include <string.h>
+
+#ifndef RS_SERVER
+#include <cutils/properties.h>
 #include "utils/StopWatch.h"
+#endif
+
+#ifdef RS_SERVER
+// Android exposes gettid(); standard Linux does not, so RS_SERVER builds wrap the raw syscall here.
+static pid_t gettid() {
+    return syscall(SYS_gettid);  // NOTE(review): newer glibc may declare gettid() itself - confirm no clash
+}
+#endif  // RS_SERVER
 
 using namespace android;
 using namespace android::renderscript;
@@ -102,8 +112,7 @@
 void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
     RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
 
-
-    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);
+    uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);
 
     //ALOGV("RS helperThread starting %p idx=%i", dc, idx);
 
@@ -132,7 +141,7 @@
            // idx +1 is used because the calling thread is always worker 0.
            dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
         }
-        android_atomic_dec(&dc->mWorkers.mRunningCount);
+        __sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
         dc->mWorkers.mCompleteSignal.set();
     }
 
@@ -153,7 +162,9 @@
         return;
     }
 
-    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    mWorkers.mRunningCount = mWorkers.mCount;
+    __sync_synchronize();
+
     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
         mWorkers.mLaunchSignals[ct].set();
     }
@@ -164,7 +175,7 @@
         mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
     }
 
-    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
+    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
         mWorkers.mCompleteSignal.wait();
     }
 }
@@ -224,8 +235,9 @@
 
     mWorkers.mCompleteSignal.init();
 
-    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
-    android_atomic_release_store(0, &mWorkers.mLaunchCount);
+    mWorkers.mRunningCount = mWorkers.mCount;
+    mWorkers.mLaunchCount = 0;
+    __sync_synchronize();
 
     pthread_attr_t threadAttr;
     status = pthread_attr_init(&threadAttr);
@@ -242,7 +254,7 @@
             break;
         }
     }
-    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
+    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
         usleep(100);
     }
 
@@ -261,7 +273,8 @@
     mExit = true;
     mWorkers.mLaunchData = NULL;
     mWorkers.mLaunchCallback = NULL;
-    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    mWorkers.mRunningCount = mWorkers.mCount;
+    __sync_synchronize();
     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
         mWorkers.mLaunchSignals[ct].set();
     }
@@ -269,7 +282,7 @@
     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
         pthread_join(mWorkers.mThreadId[ct], &res);
     }
-    rsAssert(android_atomic_acquire_load(&mWorkers.mRunningCount) == 0);
+    rsAssert(__sync_fetch_and_or(&mWorkers.mRunningCount, 0) == 0);
 
     // Global structure cleanup.
     lockMutex();
@@ -292,7 +305,7 @@
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
     while (1) {
-        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
         uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
         uint32_t yEnd = yStart + mtls->mSliceSize;
         yEnd = rsMin(yEnd, mtls->yEnd);
@@ -322,7 +335,7 @@
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
     while (1) {
-        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
         uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
         uint32_t xEnd = xStart + mtls->mSliceSize;
         xEnd = rsMin(xEnd, mtls->xEnd);
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index 2eca373..03f24d8 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -135,7 +135,7 @@
         uint4 v2 = (v + 0x7f) >> (int4)8;
 
         uchar4 ret = convert_uchar4(v2);
-        ret.a = in->a;
+        ret.w = in->w;
 
         #if 0
         if (!x1) {
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index d7b01b6..4e9470e 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -143,7 +143,7 @@
         for (;x1 < x2; x1++, out++, in++) {
             short4 in_s = convert_short4(*in);
             short4 out_s = convert_short4(*out);
-            in_s = in_s + ((out_s * (short4)(255 - in_s.a)) >> (short4)8);
+            in_s = in_s + ((out_s * (short4)(255 - in_s.w)) >> (short4)8);
             *out = convert_uchar4(in_s);
         }
         break;
@@ -160,7 +160,7 @@
         for (;x1 < x2; x1++, out++, in++) {
             short4 in_s = convert_short4(*in);
             short4 out_s = convert_short4(*out);
-            in_s = out_s + ((in_s * (short4)(255 - out_s.a)) >> (short4)8);
+            in_s = out_s + ((in_s * (short4)(255 - out_s.w)) >> (short4)8);
             *out = convert_uchar4(in_s);
         }
         break;
@@ -176,7 +176,7 @@
 #endif
         for (;x1 < x2; x1++, out++, in++) {
             short4 in_s = convert_short4(*in);
-            in_s = (in_s * out->a) >> (short4)8;
+            in_s = (in_s * out->w) >> (short4)8;
             *out = convert_uchar4(in_s);
         }
         break;
@@ -192,7 +192,7 @@
 #endif
         for (;x1 < x2; x1++, out++, in++) {
             short4 out_s = convert_short4(*out);
-            out_s = (out_s * in->a) >> (short4)8;
+            out_s = (out_s * in->w) >> (short4)8;
             *out = convert_uchar4(out_s);
         }
         break;
@@ -208,7 +208,7 @@
 #endif
         for (;x1 < x2; x1++, out++, in++) {
             short4 in_s = convert_short4(*in);
-            in_s = (in_s * (short4)(255 - out->a)) >> (short4)8;
+            in_s = (in_s * (short4)(255 - out->w)) >> (short4)8;
             *out = convert_uchar4(in_s);
         }
         break;
@@ -224,7 +224,7 @@
 #endif
         for (;x1 < x2; x1++, out++, in++) {
             short4 out_s = convert_short4(*out);
-            out_s = (out_s * (short4)(255 - in->a)) >> (short4)8;
+            out_s = (out_s * (short4)(255 - in->w)) >> (short4)8;
             *out = convert_uchar4(out_s);
         }
         break;
@@ -241,8 +241,8 @@
         for (;x1 < x2; x1++, out++, in++) {
             short4 in_s = convert_short4(*in);
             short4 out_s = convert_short4(*out);
-            out_s.rgb = ((in_s.rgb * out_s.a) +
-              (out_s.rgb * ((short3)255 - (short3)in_s.a))) >> (short3)8;
+            out_s.xyz = ((in_s.xyz * out_s.w) +
+              (out_s.xyz * ((short3)255 - (short3)in_s.w))) >> (short3)8;
             *out = convert_uchar4(out_s);
         }
         break;
@@ -259,8 +259,8 @@
         for (;x1 < x2; x1++, out++, in++) {
             short4 in_s = convert_short4(*in);
             short4 out_s = convert_short4(*out);
-            out_s.rgb = ((out_s.rgb * in_s.a) +
-              (in_s.rgb * ((short3)255 - (short3)out_s.a))) >> (short3)8;
+            out_s.xyz = ((out_s.xyz * in_s.w) +
+              (in_s.xyz * ((short3)255 - (short3)out_s.w))) >> (short3)8;
             *out = convert_uchar4(out_s);
         }
         break;
@@ -388,12 +388,12 @@
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
-            uint32_t iR = in->r, iG = in->g, iB = in->b, iA = in->a,
-                oR = out->r, oG = out->g, oB = out->b, oA = out->a;
-            out->r = (oR + iR) > 255 ? 255 : oR + iR;
-            out->g = (oG + iG) > 255 ? 255 : oG + iG;
-            out->b = (oB + iB) > 255 ? 255 : oB + iB;
-            out->a = (oA + iA) > 255 ? 255 : oA + iA;
+            uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
+                oR = out->x, oG = out->y, oB = out->z, oA = out->w;
+            out->x = (oR + iR) > 255 ? 255 : oR + iR;
+            out->y = (oG + iG) > 255 ? 255 : oG + iG;
+            out->z = (oB + iB) > 255 ? 255 : oB + iB;
+            out->w = (oA + iA) > 255 ? 255 : oA + iA;
         }
         break;
     case BLEND_SUBTRACT:
@@ -407,12 +407,12 @@
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
-            int32_t iR = in->r, iG = in->g, iB = in->b, iA = in->a,
-                oR = out->r, oG = out->g, oB = out->b, oA = out->a;
-            out->r = (oR - iR) < 0 ? 0 : oR - iR;
-            out->g = (oG - iG) < 0 ? 0 : oG - iG;
-            out->b = (oB - iB) < 0 ? 0 : oB - iB;
-            out->a = (oA - iA) < 0 ? 0 : oA - iA;
+            int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
+                oR = out->x, oG = out->y, oB = out->z, oA = out->w;
+            out->x = (oR - iR) < 0 ? 0 : oR - iR;
+            out->y = (oG - iG) < 0 ? 0 : oG - iG;
+            out->z = (oB - iB) < 0 ? 0 : oB - iB;
+            out->w = (oA - iA) < 0 ? 0 : oA - iA;
         }
         break;
     case BLEND_STAMP:
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index bcd5ffd..112f377 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -113,7 +113,7 @@
         //ALOGE("x %i  %i,%i,%i,%i  %i,%i,%i,%i", x, o.x, o.y, o.z, o.w, out[0].x, out[0].y, out[0].z, out[0].w);
     //}
     //o.w = 0xff;
-    out->rgba = o.rgba;
+    out->xyzw = o.xyzw;
 }
 
 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index bb8cde1..3a49c0d 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -61,30 +61,30 @@
     short V = ((short)v) - 128;
 
     short4 p;
-    p.r = (Y * 298 + V * 409 + 128) >> 8;
-    p.g = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.b = (Y * 298 + U * 516 + 128) >> 8;
-    p.a = 255;
-    if(p.r < 0) {
-        p.r = 0;
+    p.x = (Y * 298 + V * 409 + 128) >> 8;
+    p.y = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
+    p.z = (Y * 298 + U * 516 + 128) >> 8;
+    p.w = 255;
+    if(p.x < 0) {
+        p.x = 0;
     }
-    if(p.r > 255) {
-        p.r = 255;
+    if(p.x > 255) {
+        p.x = 255;
     }
-    if(p.g < 0) {
-        p.g = 0;
+    if(p.y < 0) {
+        p.y = 0;
     }
-    if(p.g > 255) {
-        p.g = 255;
+    if(p.y > 255) {
+        p.y = 255;
     }
-    if(p.b < 0) {
-        p.b = 0;
+    if(p.z < 0) {
+        p.z = 0;
     }
-    if(p.b > 255) {
-        p.b = 255;
+    if(p.z > 255) {
+        p.z = 255;
     }
 
-    return (uchar4){p.r, p.g, p.b, p.a};
+    return (uchar4){p.x, p.y, p.z, p.w};
 }
 
 
diff --git a/cpu_ref/rsCpuRuntimeMath.cpp b/cpu_ref/rsCpuRuntimeMath.cpp
index f66677b..6c02303 100644
--- a/cpu_ref/rsCpuRuntimeMath.cpp
+++ b/cpu_ref/rsCpuRuntimeMath.cpp
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#ifndef RS_SERVER
 #include <cutils/compiler.h>
+#endif
 
 #include "rsContext.h"
 #include "rsScriptC.h"
diff --git a/cpu_ref/rsCpuRuntimeStubs.cpp b/cpu_ref/rsCpuRuntimeStubs.cpp
index ceea9c4..7b8d557 100644
--- a/cpu_ref/rsCpuRuntimeStubs.cpp
+++ b/cpu_ref/rsCpuRuntimeStubs.cpp
@@ -21,12 +21,15 @@
 #include "rsMatrix2x2.h"
 #include "rsRuntime.h"
 
-#include "utils/Timers.h"
 #include "rsCpuCore.h"
 #include "rsCpuScript.h"
 
 #include <time.h>
 
+#ifndef RS_SERVER
+#include "utils/Timers.h"
+#endif
+
 using namespace android;
 using namespace android::renderscript;
 
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 2ae4d83..7887474 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -23,10 +23,11 @@
 //#include "rsdAllocation.h"
 //#include "rsCpuIntrinsics.h"
 
-
+#ifndef RS_SERVER
 #include "utils/Vector.h"
 #include "utils/Timers.h"
 #include "utils/StopWatch.h"
+#endif
 
 #ifdef RS_COMPATIBILITY_LIB
     #include <dlfcn.h>
@@ -177,10 +178,14 @@
 
 #else
 
+#ifndef RS_SERVER
     String8 scriptSOName(cacheDir);
     scriptSOName = scriptSOName.getPathDir();
     scriptSOName.appendPath("lib");
     scriptSOName.append("/librs.");
+#else
+    String8 scriptSOName("lib");
+#endif
     scriptSOName.append(resName);
     scriptSOName.append(".so");
 
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index d418c22..b3b5bf9 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -18,10 +18,12 @@
 #include "rsCpuScript.h"
 #include "rsCpuScriptGroup.h"
 
+#ifndef RS_SERVER
 #include <bcc/BCCContext.h>
 #include <bcc/Renderscript/RSCompilerDriver.h>
 #include <bcc/Renderscript/RSExecutable.h>
 #include <bcc/Renderscript/RSInfo.h>
+#endif
 
 #include "rsScript.h"
 #include "rsScriptGroup.h"