Fix clz implementation for types with size < 4 bytes.

Bug: 12911231

Change-Id: I2b612b34ae7d69c2793ba44d2b43ab96432e3828
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index 83327bc..b39cb7e 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -746,24 +746,27 @@
 
 /**
  * clz
+ * __builtin_clz only accepts a 32-bit unsigned int, so every input will be
+ * expanded to 32 bits. For our smaller data types, we need to subtract off
+ * these unused top bits (that will always be composed of zeros).
  */
 extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
     return __builtin_clz(v);
 }
 extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
-    return (uint16_t)__builtin_clz(v);
+    return __builtin_clz(v) - 16;
 }
 extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
-    return (uint8_t)__builtin_clz(v);
+    return __builtin_clz(v) - 24;
 }
 extern int32_t __attribute__((overloadable)) clz(int32_t v) {
-    return (int32_t)__builtin_clz((uint32_t)v);
+    return __builtin_clz(v);
 }
 extern int16_t __attribute__((overloadable)) clz(int16_t v) {
-    return (int16_t)__builtin_clz(v);
+    return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16;
 }
 extern int8_t __attribute__((overloadable)) clz(int8_t v) {
-    return (int8_t)__builtin_clz(v);
+    return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24;
 }