Merge "Add librsrt_x86_64.bc and modify the script to update x86_64 prebuilts." into nyc-dev
diff --git a/api/rs_convert.spec b/api/rs_convert.spec
index c1539b5..6ae7b40 100644
--- a/api/rs_convert.spec
+++ b/api/rs_convert.spec
@@ -17,7 +17,7 @@
 header:
 summary: Conversion Functions
 description:
- The functions below convert from a numerical vector type to another, of from one color
+ The functions below convert from a numerical vector type to another, or from one color
  representation to another.
 end:
 
@@ -78,10 +78,9 @@
 attrib: const
 w: 2, 3, 4
 t: f16
-t: u8, u16, u32, u64, i8, i16, i32, i64, f32, f64
+t: u8, u16, u32, u64, i8, i16, i32, i64, f16, f32, f64
 ret: #3#1
 arg: #2#1 v, compatible(#3)
-test: none
 end:
 
 function: convert_#3#1
@@ -92,7 +91,6 @@
 t: f16
 ret: #3#1
 arg: #2#1 v, compatible(#3)
-test: none
 end:
 
 function: rsPackColorTo8888
diff --git a/api/rs_math.spec b/api/rs_math.spec
index ef66f7d..dda9867 100644
--- a/api/rs_math.spec
+++ b/api/rs_math.spec
@@ -183,8 +183,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: acosh
@@ -208,7 +207,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: acospi
@@ -233,8 +231,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: asin
@@ -257,8 +254,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: asinh
@@ -282,7 +278,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: asinpi
@@ -307,8 +302,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: atan
@@ -331,8 +325,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: atan2
@@ -358,7 +351,6 @@
 ret: #2#1
 arg: #2#1 numerator
 arg: #2#1 denominator
-test: none
 end:
 
 function: atan2pi
@@ -386,7 +378,6 @@
 ret: #2#1
 arg: #2#1 numerator
 arg: #2#1 denominator
-test: none
 end:
 
 function: atanh
@@ -409,8 +400,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: atanpi
@@ -435,8 +425,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: cbrt
@@ -460,7 +449,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: ceil
@@ -486,7 +474,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: clamp
@@ -605,7 +592,6 @@
 ret: #2#1
 arg: #2#1 magnitude_value
 arg: #2#1 sign_value
-test: none
 end:
 
 function: cos
@@ -652,7 +638,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: cospi
@@ -678,7 +663,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: degrees
@@ -700,7 +684,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: erf
@@ -722,7 +705,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: erfc
@@ -744,7 +726,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: exp
@@ -768,7 +749,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: exp10
@@ -792,7 +772,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: exp2
@@ -816,7 +795,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: expm1
@@ -840,7 +818,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: fabs
@@ -864,7 +841,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: fdim
@@ -890,7 +866,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: floor
@@ -945,7 +920,6 @@
 arg: #2#1 multiplicand1
 arg: #2#1 multiplicand2
 arg: #2#1 offset
-test: none
 end:
 
 function: fmax
@@ -971,7 +945,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: fmax
@@ -992,7 +965,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2 b
-test: none
 end:
 
 function: fmin
@@ -1018,7 +990,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: fmin
@@ -1039,7 +1010,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2 b
-test: none
 end:
 
 function: fmod
@@ -1067,10 +1037,8 @@
 ret: #2#1
 arg: #2#1 numerator
 arg: #2#1 denominator
-test: none
 end:
 
-# TODO Add (both variants) of fract for f16
 function: fract
 version: 9
 w: 1, 2, 3, 4
@@ -1113,7 +1081,6 @@
 ret: #2#1
 arg: #2#1 v
 arg: #2#1* floor
-test: none
 end:
 
 function: fract
@@ -1122,10 +1089,8 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
-# TODO Add f16 frexp
 function: frexp
 version: 9
 w: 1, 2, 3, 4
@@ -1223,7 +1188,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: ilogb
@@ -1367,7 +1331,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: log10
@@ -1391,7 +1354,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: log1p
@@ -1415,7 +1377,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: log2
@@ -1439,7 +1400,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: logb
@@ -1468,7 +1428,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: mad
@@ -1498,7 +1457,6 @@
 arg: #2#1 multiplicand1
 arg: #2#1 multiplicand2
 arg: #2#1 offset
-test: none
 end:
 
 function: max
@@ -1522,7 +1480,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: max
@@ -1543,7 +1500,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2 b
-test: none
 end:
 
 function: max
@@ -1637,7 +1593,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: min
@@ -1658,7 +1613,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2 b
-test: none
 end:
 
 function: min
@@ -1757,7 +1711,6 @@
 arg: #2#1 start
 arg: #2#1 stop
 arg: #2#1 fraction
-test: none
 end:
 
 function: mix
@@ -1780,7 +1733,6 @@
 arg: #2#1 start
 arg: #2#1 stop
 arg: #2 fraction
-test: none
 end:
 
 function: modf
@@ -1830,7 +1782,6 @@
 summary: Not a Number
 description:
   Returns a half-precision floating point NaN value (Not a Number).
-test: none
 end:
 
 function: native_acos
@@ -1857,9 +1808,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_acosh
@@ -1885,7 +1836,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_acospi
@@ -1914,9 +1864,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_asin
@@ -1943,9 +1893,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_asinh
@@ -1971,7 +1921,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_asinpi
@@ -2000,9 +1949,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_atan
@@ -2027,9 +1976,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1, 1)
 end:
 
 function: native_atan2
@@ -2057,7 +2004,6 @@
 ret: #2#1
 arg: #2#1 numerator
 arg: #2#1 denominator
-test: none
 end:
 
 function: native_atan2pi
@@ -2088,7 +2034,6 @@
 ret: #2#1
 arg: #2#1 numerator
 arg: #2#1 denominator
-test: none
 end:
 
 function: native_atanh
@@ -2113,9 +2058,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: native_atanpi
@@ -2142,9 +2085,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-1,1) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-1,1)
 end:
 
 function: native_cbrt
@@ -2168,7 +2109,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_cos
@@ -2191,8 +2131,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-314,314)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_cosh
@@ -2216,7 +2157,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_cospi
@@ -2241,8 +2181,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-100,100)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_divide
@@ -2266,7 +2207,6 @@
 ret: #2#1
 arg: #2#1 left_vector
 arg: #2#1 right_vector
-test: none
 end:
 
 function: native_exp
@@ -2293,9 +2233,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-86, 86) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-86,86)
 end:
 
 function: native_exp10
@@ -2322,9 +2260,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-37, 37) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-37,37)
 end:
 
 function: native_exp2
@@ -2351,9 +2287,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(-125, 125) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-125,125)
 end:
 
 function: native_expm1
@@ -2403,7 +2337,6 @@
 ret: #2#1
 arg: #2#1 a
 arg: #2#1 b
-test: none
 end:
 
 function: native_log
@@ -2429,9 +2362,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(10e-10,10e10) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(10e-5,65504)
 end:
 
 function: native_log10
@@ -2457,9 +2388,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(10e-10,10e10) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(10e-5,65504)
 end:
 
 function: native_log1p
@@ -2483,7 +2412,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_log2
@@ -2509,9 +2437,7 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(10e-10,10e10) here similar to the float version?
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(10e-5,65504)
 end:
 
 function: native_powr
@@ -2536,11 +2462,8 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(0, 256) here similar to the float version?
-arg: #2#1 base
-# TODO Need range(-15,15) here similar to the float version?
-arg: #2#1 exponent
-test: none
+arg: #2#1 base, range(0,256)
+arg: #2#1 exponent, range(-15,15)
 end:
 
 function: native_recip
@@ -2564,7 +2487,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_rootn
@@ -2614,7 +2536,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_sin
@@ -2637,8 +2558,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-314,314)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_sincos
@@ -2663,8 +2585,9 @@
 t: f16
 ret: #2#1
-arg: #2#1 v
+arg: #2#1 v, range(-314,314)
 arg: #2#1* cos
-test: none
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_sinh
@@ -2688,7 +2611,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_sinpi
@@ -2713,8 +2635,9 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-arg: #2#1 v
-test: none
+arg: #2#1 v, range(-100,100)
+# Absolute error of 2^-11, i.e. 0.00048828125
+test: limited(0.00048828125)
 end:
 
 function: native_sqrt
@@ -2738,7 +2661,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_tan
@@ -2784,7 +2706,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: native_tanpi
@@ -2864,7 +2785,6 @@
 ret: #2#1
 arg: #2#1 base
 arg: #2#1 exponent
-test: none
 end:
 
 function: pown
@@ -2891,7 +2811,6 @@
 ret: #2#1
 arg: #2#1 base
 arg: int#1 exponent
-test: none
 end:
 
 function: powr
@@ -2918,10 +2837,8 @@
 w: 1, 2, 3, 4
 t: f16
 ret: #2#1
-# TODO Need range(0,3000) here similar to the float version?
-arg: #2#1 base
+arg: #2#1 base, range(0,300)
 arg: #2#1 exponent
-test: none
 end:
 
 function: radians
@@ -2943,7 +2860,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: remainder
@@ -2972,7 +2888,6 @@
 ret: #2#1
 arg: #2#1 numerator
 arg: #2#1 denominator
-test: none
 end:
 
 function: remquo
@@ -3035,7 +2950,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: rootn
@@ -3089,7 +3003,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: rsqrt
@@ -3138,7 +3051,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: sin
@@ -3162,7 +3074,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: sincos
@@ -3186,7 +3097,6 @@
 ret: #2#1
 arg: #2#1 v
 arg: #2#1* cos
-test: none
 end:
 
 function: sinh
@@ -3210,7 +3120,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: sinpi
@@ -3236,7 +3145,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: sqrt
@@ -3260,7 +3168,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: step
@@ -3288,7 +3195,6 @@
 ret: #2#1
 arg: #2#1 edge
 arg: #2#1 v
-test: none
 end:
 
 function: step
@@ -3309,7 +3215,6 @@
 ret: #2#1
 arg: #2#1 edge
 arg: #2 v
-test: none
 end:
 
 function: step
@@ -3330,7 +3235,6 @@
 ret: #2#1
 arg: #2 edge
 arg: #2#1 v
-test: none
 end:
 
 function: tan
@@ -3354,7 +3258,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: tanh
@@ -3378,7 +3281,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: tanpi
@@ -3404,7 +3306,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: tgamma
@@ -3428,7 +3329,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: trunc
@@ -3454,7 +3354,6 @@
 t: f16
 ret: #2#1
 arg: #2#1 v
-test: none
 end:
 
 function: rsClamp
diff --git a/api/rs_vector_math.spec b/api/rs_vector_math.spec
index ae6d4df..c1d464d 100644
--- a/api/rs_vector_math.spec
+++ b/api/rs_vector_math.spec
@@ -210,6 +210,17 @@
 test: vector
 end:
 
+function: native_distance
+version: 24
+attrib: const
+w: 1, 2, 3, 4
+t: f16
+ret: #2
+arg: #2#1 left_vector
+arg: #2#1 right_vector
+test: vector
+end:
+
 function: native_length
 version: 21
 attrib: const
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index ad5d097..5a7194a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -49,8 +49,8 @@
     rsCpuIntrinsics_advsimd_Blur.S \
     rsCpuIntrinsics_advsimd_ColorMatrix.S \
     rsCpuIntrinsics_advsimd_Resize.S \
-    rsCpuIntrinsics_advsimd_YuvToRGB.S
-#    rsCpuIntrinsics_advsimd_Blend.S \
+    rsCpuIntrinsics_advsimd_YuvToRGB.S \
+    rsCpuIntrinsics_advsimd_Blend.S
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
     LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_NEON
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index b8b4838..9f9c429 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,6 +45,8 @@
 using namespace android;
 using namespace android::renderscript;
 
+#define REDUCE_NEW_ALOGV(...) /* ALOGV(__VA_ARGS__) */
+
 static pthread_key_t gThreadTLSKey = 0;
 static uint32_t gThreadTLSKeyCount = 0;
 static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -98,7 +100,7 @@
 
     version_major = 0;
     version_minor = 0;
-    mInForEach = false;
+    mInKernel = false;
     memset(&mWorkers, 0, sizeof(mWorkers));
     memset(&mTlsStruct, 0, sizeof(mTlsStruct));
     mExit = false;
@@ -239,6 +241,9 @@
         ALOGE("pthread_setspecific %i", status);
     }
 
+    mPageSize = sysconf(_SC_PAGE_SIZE);
+    REDUCE_NEW_ALOGV("page size = %ld", mPageSize);
+
     GetCpuInfo();
 
     int cpu = sysconf(_SC_NPROCESSORS_CONF);
@@ -435,7 +440,7 @@
     }
 }
 
-static void walk_1d(void *usr, uint32_t idx) {
+static void walk_1d_foreach(void *usr, uint32_t idx) {
     MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
@@ -458,6 +463,103 @@
     }
 }
 
+// The function format_bytes() is an auxiliary function to assist in logging.
+//
+// Bytes are read from an input (inBuf) and written (as pairs of hex digits)
+// to an output (outBuf).
+//
+// Output format:
+// - starts with ": "
+// - each input byte is translated to a pair of hex digits
+// - bytes are separated by "." except that every fourth separator is "|"
+// - if the input is sufficiently long, the output is truncated and terminated with "..."
+//
+// Arguments:
+// - outBuf  -- Pointer to buffer of type "FormatBuf" into which output is written
+// - inBuf   -- Pointer to bytes which are to be formatted into outBuf
+// - inBytes -- Number of bytes in inBuf
+//
+// Constant:
+// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read
+//                        from inBuf
+//
+// Return value:
+// - pointer (const char *) to output (which is part of outBuf)
+//
+static const int kFormatInBytesMax = 16;
+// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
+typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
+static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
+  char *const out = *outBuf;
+  // Every write below is bounded by the space that remains in the FormatBuf.
+  int pos = snprintf(out, sizeof(FormatBuf), ": ");
+  const int lim = std::min(kFormatInBytesMax, inBytes);
+  for (int i = 0; i < lim; ++i) {
+    // "|" before every fourth byte, "." before the others, nothing before the first.
+    if (i)
+      out[pos++] = (i % 4 ? '.' : '|');
+    // snprintf re-terminates the string after each pair of hex digits.
+    pos += snprintf(out + pos, sizeof(FormatBuf) - pos, "%02x", inBuf[i]);
+  }
+  if (kFormatInBytesMax < inBytes)
+    snprintf(out + pos, sizeof(FormatBuf) - pos, "...");  // mark truncated input
+  return out;
+}
+
+static void walk_1d_reduce_new(void *usr, uint32_t idx) {
+  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+  RsExpandKernelDriverInfo redp = mtls->redp;
+
+  // find accumulator
+  uint8_t *&accumPtr = mtls->accumPtr[idx];
+  if (!accumPtr) {
+    uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
+    if (mtls->outFunc) {
+      accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
+    } else {
+      if (accumIdx == 0) {
+        accumPtr = mtls->redp.outPtr[0];
+      } else {
+        accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
+      }
+    }
+    REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u got accumCount %u and accumPtr %p",
+                     mtls->accumFunc, idx, accumIdx, accumPtr);
+    // initialize accumulator
+    if (mtls->initFunc) {
+      mtls->initFunc(accumPtr);
+    } else {
+      memset(accumPtr, 0, mtls->accumSize);
+    }
+  }
+
+  // accumulate
+  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  while (1) {
+    uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+    uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
+    uint32_t xEnd   = xStart + mtls->mSliceSize;
+
+    xEnd = rsMin(xEnd, mtls->end.x);
+
+    if (xEnd <= xStart) {
+      return;
+    }
+
+    RedpPtrSetup(mtls, &redp, xStart, 0, 0);
+    fn(&redp, xStart, xEnd, accumPtr);
+
+    FormatBuf fmt;
+    if (mtls->logReduceAccum) {
+      format_bytes(&fmt, accumPtr, mtls->accumSize);
+    } else {
+      fmt[0] = 0;
+    }
+    REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u [%u, %u)%s",
+                     mtls->accumFunc, idx, xStart, xEnd, fmt);
+  }
+}
+
 // Launch a simple reduce-style kernel.
 // Inputs:
 //  ain:  The allocation that contains the input
@@ -486,6 +588,25 @@
                                           uint32_t inLen,
                                           Allocation * aout,
                                           MTLaunchStructReduceNew *mtls) {
+  mtls->logReduceAccum = mRSC->props.mLogReduceAccum;
+  if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
+    launchReduceNewParallel(ains, inLen, aout, mtls);
+  } else {
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+  }
+}
+
+// Launch a general reduce-style kernel, single-threaded.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains,
+                                                uint32_t inLen,
+                                                Allocation * aout,
+                                                MTLaunchStructReduceNew *mtls) {
+  ALOGV("launchReduceNewSerial(%p)", mtls->accumFunc);
+
   // In the presence of outconverter, we allocate temporary memory for
   // the accumulator.
   //
@@ -521,6 +642,112 @@
   }
 }
 
+// Launch a general reduce-style kernel, multi-threaded.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains,
+                                                  uint32_t inLen,
+                                                  Allocation * aout,
+                                                  MTLaunchStructReduceNew *mtls) {
+  // For now, we don't know how to go parallel beyond 1D, or in the absence of a combiner.
+  if ((mtls->redp.dim.y > 1) || (mtls->redp.dim.z > 1) || !mtls->combFunc) {
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+    return;
+  }
+
+  // Number of threads = "main thread" + number of other (worker) threads
+  const uint32_t numThreads = mWorkers.mCount + 1;
+
+  // In the absence of outconverter, we use the output allocation as
+  // an accumulator, and therefore need to allocate one fewer accumulator.
+  const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);
+
+  // If mDebugReduceSplitAccum, then we want each accumulator to start
+  // on a page boundary.  (TODO: Would some unit smaller than a page
+  // be sufficient to avoid false sharing?)
+  if (mRSC->props.mDebugReduceSplitAccum) {
+    // Round up accumulator size to an integral number of pages
+    mtls->accumStride =
+        (mtls->accumSize + size_t(mPageSize) - 1) &
+        ~(size_t(mPageSize) - 1);
+    // Each accumulator gets its own page.  Alternatively, if we just
+    // wanted to make sure no two accumulators are on the same page,
+    // we could instead do
+    //   allocSize = mtls->accumStride * (numAllocAccum - 1) + mtls->accumSize
+    const size_t allocSize = mtls->accumStride * numAllocAccum;
+    mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
+  } else {
+    mtls->accumStride = mtls->accumSize;
+    mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
+  }
+
+  const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads;
+  mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes));
+  memset(mtls->accumPtr, 0, accumPtrArrayBytes);
+
+  mtls->accumCount = 0;
+
+  rsAssert(!mInKernel);
+  mInKernel = true;
+  mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
+  ALOGV("launchReduceNewParallel(%p): %u threads, accumAlloc = %p",
+        mtls->accumFunc, numThreads, mtls->accumAlloc);
+  launchThreads(walk_1d_reduce_new, mtls);
+  mInKernel = false;
+
+  // Combine accumulators and identify final accumulator
+  uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
+  //   Loop over accumulators, combining into finalAccumPtr.  If finalAccumPtr
+  //   is null, then the first accumulator I find becomes finalAccumPtr.
+  for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
+    uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
+    if (finalAccumPtr) {
+      if (finalAccumPtr != thisAccumPtr) {
+        if (mtls->combFunc) {
+          if (mtls->logReduceAccum) {
+            FormatBuf fmt;
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulating into%s",
+                             mtls->accumFunc,
+                             format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p):    accumulator[%u]%s",
+                             mtls->accumFunc, idx,
+                             format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
+          }
+          mtls->combFunc(finalAccumPtr, thisAccumPtr);
+        } else {
+          rsAssert(!"expected combiner");
+        }
+      }
+    } else {
+      finalAccumPtr = thisAccumPtr;
+    }
+  }
+  rsAssert(finalAccumPtr != nullptr);
+  if (mtls->logReduceAccum) {
+    FormatBuf fmt;
+    REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final accumulator%s",
+                     mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+  }
+
+  // Outconvert
+  if (mtls->outFunc) {
+    mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
+    if (mtls->logReduceAccum) {
+      FormatBuf fmt;
+      REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final outconverted result%s",
+                       mtls->accumFunc,
+                       format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
+    }
+  }
+
+  // Clean up
+  free(mtls->accumPtr);
+  free(mtls->accumAlloc);
+}
+
+
 void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,
@@ -537,9 +764,9 @@
                      (mtls->start.array[2] != mtls->end.array[2]) ||
                      (mtls->start.array[3] != mtls->end.array[3]);
 
-    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
         const size_t targetByteChunk = 16 * 1024;
-        mInForEach = true;
+        mInKernel = true;  // NOTE: The guard immediately above ensures this was !mInKernel
 
         if (outerDims) {
             // No fancy logic for chunk size
@@ -588,9 +815,9 @@
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(walk_1d, mtls);
+            launchThreads(walk_1d_foreach, mtls);
         }
-        mInForEach = false;
+        mInKernel = false;
 
     } else {
         ForEachFunc_t fn = mtls->kernel;
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 939b7ae..c2a0864 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -34,6 +34,7 @@
 // Function types found in RenderScript code
 typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
 typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
 typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
 typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
@@ -44,6 +45,7 @@
 struct ReduceNewDescription {
     ReduceNewAccumulatorFunc_t  accumFunc;  // expanded accumulator function
     ReduceNewInitializerFunc_t  initFunc;   // user initializer function
+    ReduceNewCombinerFunc_t     combFunc;   // user combiner function
     ReduceNewOutConverterFunc_t outFunc;    // user outconverter function
     size_t                      accumSize;  // accumulator datum size, in bytes
 };
@@ -73,7 +75,8 @@
     RsLaunchDimensions start;
     RsLaunchDimensions end;
     // Points to MTLaunchStructForEach::fep::dim or
-    // MTLaunchStructReduce::inputDim.
+    // MTLaunchStructReduce::inputDim or
+    // MTLaunchStructReduceNew::redp::dim.
     RsLaunchDimensions *dimPtr;
 };
 
@@ -101,9 +104,51 @@
 
     ReduceNewAccumulatorFunc_t accumFunc;
     ReduceNewInitializerFunc_t initFunc;
+    ReduceNewCombinerFunc_t combFunc;
     ReduceNewOutConverterFunc_t outFunc;
 
     size_t accumSize;  // accumulator datum size in bytes
+
+    size_t accumStride;  // stride between accumulators in accumAlloc (below)
+
+    // These fields are used for managing accumulator data items in a
+    // multithreaded execution.
+    //
+    // Let the number of threads be N.
+    // Let Outc be true iff there is an outconverter.
+    //
+    // accumAlloc is a pointer to a single allocation of (N - !Outc)
+    // accumulators.  (If there is no outconverter, then the output
+    // allocation acts as an accumulator.)  It is created at kernel
+    // launch time.  Within that allocation, the distance between the
+    // start of adjacent accumulators is accumStride bytes -- this
+    // might be the same as accumSize, or it might be larger, if we
+    // are attempting to avoid false sharing.
+    //
+    // accumCount is an atomic counter of how many accumulators have
+    // been grabbed by threads.  It is initialized to zero at kernel
+    // launch time.  See accumPtr for further description.
+    //
+    // accumPtr is a pointer to an array of N pointers to accumulators.
+    // The array is created at kernel launch time, and each element is
+    // initialized to nullptr.  When a particular thread goes to work,
+    // that thread obtains its accumulator from its entry in this
+    // array.  If the entry is nullptr, that thread needs to obtain an
+    // accumulator, and initialize its entry in the array accordingly.
+    // It does so via atomic access (fetch-and-add) to accumCount.
+    // - If Outc, then the fetched value is used as an index into
+    //   accumAlloc.
+    // - If !Outc, then
+    //   - If the fetched value is zero, then this thread gets the
+    //     output allocation for its accumulator.
+    //   - If the fetched value is nonzero, then (fetched value - 1)
+    //     is used as an index into accumAlloc.
+    uint8_t *accumAlloc;
+    uint8_t **accumPtr;
+    uint32_t accumCount;
+
+    // Logging control
+    bool logReduceAccum;
 };
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
@@ -161,7 +206,7 @@
     virtual const char *getBccPluginName() const {
         return mBccPluginName.string();
     }
-    bool getInForEach() override { return mInForEach; }
+    bool getInKernel() override { return mInKernel; }
 
     // Set to true if we should embed global variable information in the code.
     void setEmbedGlobalInfo(bool v) override {
@@ -190,7 +235,7 @@
     uint32_t version_major;
     uint32_t version_minor;
     //bool mHasGraphics;
-    bool mInForEach;
+    bool mInKernel;  // Is a parallel kernel execution underway?
 
     struct Workers {
         volatile int mRunningCount;
@@ -222,6 +267,14 @@
     // when potentially embedding information about globals.
     // Defaults to true.
     bool mEmbedGlobalInfoSkipConstant;
+
+    long mPageSize;
+
+    // Launch a general reduce kernel
+    void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                               MTLaunchStructReduceNew *mtls);
+    void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                                 MTLaunchStructReduceNew *mtls);
 };
 
 
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index 5dd31ee..1a0e70e 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -538,8 +538,8 @@
             goto error;
         }
 
-        // The current implementation does not use the signature,
-        // reduce name, or combiner.
+        // The current implementation does not use the signature
+        // or reduce name.
 
         reduceNewDescriptions[i].accumSize = tmpSize;
 
@@ -565,6 +565,19 @@
             goto error;
         }
 
+        // Process the (optional) combiner.
+        if (strcmp(tmpNameCombiner, kNoName)) {
+          // Lookup the original user-written combiner.
+          if (!(reduceNewDescriptions[i].combFunc =
+                (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
+            ALOGE("Failed to find combiner function address for %s(): %s",
+                  tmpNameCombiner, dlerror());
+            goto error;
+          }
+        } else {
+          reduceNewDescriptions[i].combFunc = nullptr;
+        }
+
         // Process the (optional) outconverter.
         if (strcmp(tmpNameOutConverter, kNoName)) {
           // Lookup the original user-written outconverter.
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 34bc82d..131b357 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -120,7 +120,7 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM_USE_INTRINSICS)
     // Bug: 22047392 - Skip optimized version for BLEND_DST_ATOP until this
     // been fixed.
     if (gArchUseSIMD && info->slot != BLEND_DST_ATOP) {
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
index 5211bb3..90dbd73 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
@@ -579,6 +579,9 @@
     BLEND_LIST(BLEND_X)
 #undef BLEND_X
 
+#define BLEND_X(d, n) .set tablesize, d+1 ;
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
 
 /*  int rsdIntrinsicBlend_K(
  *          uchar4 *out,        // x0
@@ -589,7 +592,7 @@
  */
 ENTRY(rsdIntrinsicBlend_K)
     adr     x5, 2f
-    cmp     w2, #(3f - 2f) >> 1
+    cmp     w2, #tablesize          // tablesize is an entry count (last slot + 1), not a byte length: no halving
     bhs     1f
     ldrsh   x6, [x5, w2, uxtw #1]
     add     x0, x0, w3, uxtw #2
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a5fc96b..25dab00 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -646,9 +646,9 @@
 
     mtls->rs = mCtx;
 
-    // Currently not threaded.
-    mtls->isThreadable = false;
-    mtls->mSliceNum = -1;
+    mtls->mSliceNum    = 0;
+    mtls->mSliceSize   = 1;
+    mtls->isThreadable = mIsThreadable;
 
     // Set up output,
     mtls->redp.outLen = 1;
@@ -843,6 +843,7 @@
     const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
     mtls->accumFunc = desc->accumFunc;
     mtls->initFunc  = desc->initFunc;   // might legally be nullptr
+    mtls->combFunc  = desc->combFunc;   // might legally be nullptr
     mtls->outFunc   = desc->outFunc;    // might legally be nullptr
     mtls->accumSize = desc->accumSize;
 
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 49a999d..e226b93 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -131,7 +131,7 @@
                                      uint32_t flags) = 0;
     virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
     virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0;
-    virtual bool getInForEach() = 0;
+    virtual bool getInKernel() = 0;  // Is a parallel kernel execution underway?
 
     // Set to true if we should embed global variable information in the code.
     virtual void setEmbedGlobalInfo(bool v) = 0;
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index b4958e2..9fea491 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -136,7 +136,7 @@
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
     RsdCpuReference *impl = (RsdCpuReference *) dc->mCpuRef;
 
-    if (impl->getInForEach()) {
+    if (impl->getInKernel()) {
         char buf[256];
         snprintf(buf, sizeof(buf), "Error: Call to unsupported function %s "
                          "in kernel", funcName);
diff --git a/driver/runtime/build_bc_lib_internal.mk b/driver/runtime/build_bc_lib_internal.mk
index 8797abe..d3e594f 100644
--- a/driver/runtime/build_bc_lib_internal.mk
+++ b/driver/runtime/build_bc_lib_internal.mk
@@ -92,7 +92,7 @@
 	@mkdir -p $(dir $@)
 	$(hide) $(RELATIVE_PWD) $(RS_LLVM_AS) $< -o $@
 
--include $(c_bc_files:%.bc=%.P)
+$(foreach f,$(c_bc_files),$(call include-depfile,$(f:%.bc=%.P),$(f)))
 
 $(LOCAL_BUILT_MODULE): PRIVATE_BC_FILES := $(c_bc_files) $(ll_bc_files)
 $(LOCAL_BUILT_MODULE): $(c_bc_files) $(ll_bc_files)
diff --git a/java/tests/HealingBrush/Android.mk b/java/tests/HealingBrush/Android.mk
index b8d40fe..ed974e8 100644
--- a/java/tests/HealingBrush/Android.mk
+++ b/java/tests/HealingBrush/Android.mk
@@ -37,7 +37,4 @@
 LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=android.support.v8.renderscript
 LOCAL_JNI_SHARED_LIBRARIES := librsjni
 
-LOCAL_32_BIT_ONLY := true
-
 include $(BUILD_PACKAGE)
-
diff --git a/java/tests/ImageProcessing/Android.mk b/java/tests/ImageProcessing/Android.mk
index d7486e8..7ea6f6e 100644
--- a/java/tests/ImageProcessing/Android.mk
+++ b/java/tests/ImageProcessing/Android.mk
@@ -23,7 +23,6 @@
 
 LOCAL_SRC_FILES := $(call all-java-files-under, src) \
                    $(call all-renderscript-files-under, src)
-#LOCAL_STATIC_JAVA_LIBRARIES := android.renderscript
 
 LOCAL_PACKAGE_NAME := ImageProcessing
 
diff --git a/java/tests/ImageProcessing2/Android.mk b/java/tests/ImageProcessing2/Android.mk
index 9c53d91..ad2bc52 100644
--- a/java/tests/ImageProcessing2/Android.mk
+++ b/java/tests/ImageProcessing2/Android.mk
@@ -33,8 +33,5 @@
 
 LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=android.support.v8.renderscript
 LOCAL_JNI_SHARED_LIBRARIES := librsjni
-LOCAL_32_BIT_ONLY := true
 
 include $(BUILD_PACKAGE)
-
-#include $(call all-makefiles-under, $(LOCAL_PATH))
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
index 608de47..c1e9c40 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
@@ -119,39 +119,6 @@
 
     ///////////////////////////////////////////////////////////////////
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
     private Int2 findMinAndMax(float[] input) {
         float minVal = Float.POSITIVE_INFINITY;
         int minIdx = -1;
@@ -322,7 +289,6 @@
         boolean pass = true;
         pass &= addint1D(pRS, s);
         pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
         pass &= findMinAndMax(pRS, s);
         pass &= fz(pRS, s);
         pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
index 84d2c50..b998f51 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@
 
     ///////////////////////////////////////////////////////////////////
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
     private Int2 findMinAndMax(float[] input) {
         float minVal = Float.POSITIVE_INFINITY;
         int minIdx = -1;
@@ -322,7 +289,6 @@
         boolean pass = true;
         pass &= addint1D(pRS, s);
         pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
         pass &= findMinAndMax(pRS, s);
         pass &= fz(pRS, s);
         pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
index be09dfb..ec7be8b 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@
 
 /////////////////////////////////////////////////////////////////////////
 
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
 #pragma rs reduce(findMinAndMax) \
   initializer(fMMInit) accumulator(fMMAccumulator) \
   combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)
+    accum->min = val->min;
+  if (val->max.val > accum->max.val)
+    accum->max = val->max;
 }
 
 static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
index 419e709..41252c8 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@
 
 /////////////////////////////////////////////////////////////////////////
 
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
 typedef struct {
   float val;
   int idx;
@@ -56,8 +44,10 @@
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)
+    accum->min = val->min;
+  if (val->max.val > accum->max.val)
+    accum->max = val->max;
 }
 
 static void fMMOutConverter(int2 *result,
diff --git a/java/tests/Refocus/Android.mk b/java/tests/Refocus/Android.mk
index 13ccc22..887f894 100644
--- a/java/tests/Refocus/Android.mk
+++ b/java/tests/Refocus/Android.mk
@@ -37,6 +37,5 @@
 
 LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=android.support.v8.renderscript
 LOCAL_JNI_SHARED_LIBRARIES := librsjni
-LOCAL_32_BIT_ONLY := true
 
 include $(BUILD_PACKAGE)
diff --git a/java/tests/RsBLAS_Benchmark/libsgemm/Android.mk b/java/tests/RsBLAS_Benchmark/libsgemm/Android.mk
index 21194e9..9327869 100644
--- a/java/tests/RsBLAS_Benchmark/libsgemm/Android.mk
+++ b/java/tests/RsBLAS_Benchmark/libsgemm/Android.mk
@@ -22,6 +22,7 @@
 LOCAL_MODULE := libgemmdata
 LOCAL_MODULE_TAGS := optional
 LOCAL_SRC_FILES := test_data.cpp
+LOCAL_SDK_VERSION := 14
 
 LOCAL_C_INCLUDES := $(JNI_H_INCLUDE)
 
diff --git a/java/tests/RsTest/AndroidManifest.xml b/java/tests/RsTest/AndroidManifest.xml
index b660398..31da896 100644
--- a/java/tests/RsTest/AndroidManifest.xml
+++ b/java/tests/RsTest/AndroidManifest.xml
@@ -2,6 +2,7 @@
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="com.android.rs.test">
     <application 
+	android:largeHeap="true"
         android:label="_RS_Test"
         android:icon="@drawable/test_pattern">
         <activity android:name="RSTest"
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
index a244646..0769259 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
@@ -27,6 +27,7 @@
 import android.renderscript.*;
 import android.util.Log;
 import java.lang.Float;
+import java.util.Arrays;
 import java.util.Random;
 
 public class UT_reduce extends UnitTest {
@@ -36,6 +37,81 @@
         super(rstc, "reduce", ctx);
     }
 
+    private static class timing {
+        timing(long myJavaStart, long myJavaEnd, long myRsStart,
+               long myCopyStart, long myKernelStart, long myRsEnd,
+               Allocation... myInputs) {
+            javaStart = myJavaStart;
+            javaEnd = myJavaEnd;
+            rsStart = myRsStart;
+            copyStart = myCopyStart;
+            kernelStart = myKernelStart;
+            rsEnd = myRsEnd;
+
+            inputBytes = 0;
+            for (Allocation input : myInputs)
+                inputBytes += input.getBytesSize();
+
+            inputCells = (myInputs.length > 0) ? myInputs[0].getType().getCount() : 0;
+        }
+
+        timing(long myInputCells) {
+            inputCells = myInputCells;
+        }
+
+        private long javaStart = -1;
+        private long javaEnd = -1;
+        private long rsStart = -1;
+        private long copyStart = -1;
+        private long kernelStart = -1;
+        private long rsEnd = -1;
+        private long inputBytes = -1;
+        private long inputCells = -1;
+
+        public long javaTime() { return javaEnd - javaStart; }
+        public long rsTime() { return rsEnd - rsStart; }
+        public long kernelTime() { return rsEnd - kernelStart; }
+        public long overheadTime() { return kernelStart - rsStart; }
+        public long allocationTime() { return copyStart - rsStart; }
+        public long copyTime() { return kernelStart - copyStart; }
+
+        public static String string(long myJavaStart, long myJavaEnd, long myRsStart,
+                                    long myCopyStart, long myKernelStart, long myRsEnd,
+                                    Allocation... myInputs) {
+            return (new timing(myJavaStart, myJavaEnd, myRsStart,
+                               myCopyStart, myKernelStart, myRsEnd, myInputs)).string();
+        }
+
+        public static String string(long myInputCells) {
+            return (new timing(myInputCells)).string();
+        }
+
+        public String string() {
+            String result;
+            if (javaStart >= 0) {
+                result = "(java " + javaTime() + "ms, rs " + rsTime() + "ms = overhead " +
+                         overheadTime() + "ms (alloc " + allocationTime() + "ms + copy " +
+                         copyTime() + "ms) + kernel+get() " + kernelTime() + "ms)";
+                if (inputCells > 0)
+                    result += " ";
+            } else {
+                result = "";
+            }
+            if (inputCells > 0) {
+                result += "(" + fmt.format(inputCells) + " cells";
+                if (inputBytes > 0)
+                    result += ", " + fmt.format(inputBytes) + " bytes";
+                result += ")";
+            }
+            return result;
+        }
+
+        private static java.text.DecimalFormat fmt;
+        static {
+            fmt = new java.text.DecimalFormat("###,###");
+        }
+    };
+
     private byte[] createInputArrayByte(int len, int seed) {
         byte[] array = new byte[len];
         (new Random(seed)).nextBytes(array);
@@ -66,21 +142,60 @@
         return array;
     }
 
-    private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) {
+    private <T extends Number> boolean result(String testName, final timing t,
+                                              T javaRslt, T rsRslt) {
         final boolean success = javaRslt.equals(rsRslt);
-        Log.i(TAG,
-                testName + ": java " + javaRslt + ", rs " + rsRslt + ": " +
-                (success ? "PASSED" : "FAILED"));
+        String status = (success ? "PASSED" : "FAILED");
+        if (success && (t != null))
+            status += " " + t.string();
+        Log.i(TAG, testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + status);
         return success;
     }
 
-    private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) {
+    private boolean result(String testName, final timing t,
+                           final long[] javaRslt, final long[] rsRslt) {
+        if (javaRslt.length != rsRslt.length) {
+            Log.i(TAG, testName + ": java length " + javaRslt.length +
+                       ", rs length " + rsRslt.length + ": FAILED");
+            return false;
+        }
+        for (int i = 0; i < javaRslt.length; ++i) {
+            if (javaRslt[i] != rsRslt[i]) {
+                Log.i(TAG, testName + "[" + i + "]: java " + javaRslt[i] +
+                           ", rs " + rsRslt[i] + ": FAILED");
+                return false;
+            }
+        }
+        String status = "PASSED";
+        if (t != null)
+            status += " " + t.string();
+        Log.i(TAG, testName + ": " + status);
+        return true;
+    }
+
+    private boolean result(String testName, final timing t, Int2 javaRslt, Int2 rsRslt) {
         final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success && (t != null))
+            status += " " + t.string();
         Log.i(TAG,
                 testName +
                 ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
                 ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
-                ": " + (success ? "PASSED" : "FAILED"));
+                ": " + status);
+        return success;
+    }
+
+    private boolean result(String testName, final timing t, Float2 javaRslt, Float2 rsRslt) {
+        final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success && (t != null))
+            status += " " + t.string();
+        Log.i(TAG,
+                testName +
+                ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
+                ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
+                ": " + status);
         return success;
     }
 
@@ -93,61 +208,68 @@
         return rslt;
     }
 
-    private boolean addint1D(RenderScript RS, ScriptC_reduce s) {
-        final int[] input = createInputArrayInt(100000, 0, 1 << 13);
+    private boolean addint1D_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int[] input = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
 
         final int javaRslt = addint(input);
         final int rsRslt = s.reduce_addint(input).get();
 
-        return result("addint1D", javaRslt, rsRslt);
+        return result("addint1D_array", new timing(size[0]), javaRslt, rsRslt);
     }
 
-    private boolean addint2D(RenderScript RS, ScriptC_reduce s) {
-        final int dimX = 450, dimY = 225;
+    private boolean addint1D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int[] inputArray = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
 
-        final int[] inputArray = createInputArrayInt(dimX * dimY, 1, 1 << 13);
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRslt = addint(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final int rsRslt = s.reduce_addint(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        return result("addint1D",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaRslt, rsRslt);
+    }
+
+    private boolean addint2D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int dimX = size[0];
+        final int dimY = size[1];
+
+        final int[] inputArray = createInputArrayInt(dimX * dimY, seed, Integer.MAX_VALUE / (dimX * dimY));
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRslt = addint(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
         Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
         typeBuilder.setX(dimX).setY(dimY);
         Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
         inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
 
-        final int javaRslt = addint(inputArray);
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
         final int rsRslt = s.reduce_addint(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
 
-        return result("addint2D", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
+        return result("addint2D",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaRslt, rsRslt);
     }
 
     ///////////////////////////////////////////////////////////////////
@@ -172,79 +294,195 @@
         return new Int2(minIdx, maxIdx);
     }
 
-    private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) {
-        final float[] input = createInputArrayFloat(100000, 4);
+    private boolean findMinAndMax_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final float[] input = createInputArrayFloat(size[0], seed);
 
         final Int2 javaRslt = findMinAndMax(input);
         final Int2 rsRslt = s.reduce_findMinAndMax(input).get();
 
-        return result("findMinAndMax", javaRslt, rsRslt);
+        // Note that the Java and RenderScript algorithms are not
+        // guaranteed to find the same cells -- but they should
+        // find cells of the same value.
+        final Float2 javaVal = new Float2(input[javaRslt.x], input[javaRslt.y]);
+        final Float2 rsVal = new Float2(input[rsRslt.x], input[rsRslt.y]);
+
+        return result("findMinAndMax_array", new timing(size[0]), javaVal, rsVal);
+    }
+
+    private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final float[] inputArray = createInputArrayFloat(size[0], seed);
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final Int2 javaRslt = findMinAndMax(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.F32(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final Int2 rsRslt = s.reduce_findMinAndMax(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        // Note that the Java and RenderScript algorithms are not
+        // guaranteed to find the same cells -- but they should
+        // find cells of the same value.
+        final Float2 javaVal = new Float2(inputArray[javaRslt.x], inputArray[javaRslt.y]);
+        final Float2 rsVal = new Float2(inputArray[rsRslt.x], inputArray[rsRslt.y]);
+
+        return result("findMinAndMax",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaVal, rsVal);
     }
 
     ///////////////////////////////////////////////////////////////////
 
-    private boolean fz(RenderScript RS, ScriptC_reduce s) {
-        final int inputLen = 100000;
-        int[] input = createInputArrayInt(inputLen, 5);
+    private int fz(final int[] input) {
+        for (int i = 0; i < input.length; ++i)
+            if (input[i] == 0)
+                return i;
+        return -1;
+    }
+
+    private boolean fz_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final int inputLen = size[0];
+        int[] input = createInputArrayInt(inputLen, seed+0);
         // just in case we got unlucky
-        input[(new Random(6)).nextInt(inputLen)] = 0;
+        input[(new Random(seed+1)).nextInt(inputLen)] = 0;
 
         final int rsRslt = s.reduce_fz(input).get();
 
         final boolean success = (input[rsRslt] == 0);
         Log.i(TAG,
-                "fz: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
-                (success ? "PASSED" : "FAILED"));
+                "fz_array: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
+                (success ? "PASSED " + timing.string(size[0]) : "FAILED"));
+        return success;
+    }
+
+    private boolean fz(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final int inputLen = size[0];
+        int[] inputArray = createInputArrayInt(inputLen, seed+0);
+        // Force at least one zero cell, in case the random fill produced none.
+        inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRslt = fz(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final int rsRslt = s.reduce_fz(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+        // Any index of a zero cell is a correct answer; the RS index need not equal the Java one.
+        final boolean success = (inputArray[rsRslt] == 0);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success)
+            status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+                                          copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
+        Log.i(TAG,
+                "fz: java input[" + javaRslt + "] == " + inputArray[javaRslt] +
+                ", rs input[" + rsRslt + "] == " + inputArray[rsRslt] + ": " + status);
         return success;
     }
 
     ///////////////////////////////////////////////////////////////////
 
-    private boolean fz2(RenderScript RS, ScriptC_reduce s) {
-        final int dimX = 225, dimY = 450;
+    private boolean fz2(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // checks that reduce_fz2 returns the (x, y) of a zero-valued cell; size = {dimX, dimY}
+        final int dimX = size[0], dimY = size[1];
         final int inputLen = dimX * dimY;
 
-        int[] inputArray = createInputArrayInt(inputLen, 7);
+        int[] inputArray = createInputArrayInt(inputLen, seed+0);
         // just in case we got unlucky
-        inputArray[(new Random(8)).nextInt(inputLen)] = 0;
+        inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;  // seed+1 picks the cell that is forced to zero
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRsltLinear = fz(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final Int2 javaRslt = new Int2(javaRsltLinear % dimX, javaRsltLinear / dimX);  // linear index -> (x, y); x varies fastest
+        final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y];
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
 
         Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
         typeBuilder.setX(dimX).setY(dimY);
         Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
         inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
 
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
         final Int2 rsRslt = s.reduce_fz2(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
 
-        final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
-        final boolean success = (cellVal == 0);
+        final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
+        final boolean success = (rsCellVal == 0);  // pass iff the cell RenderScript reports holds zero; it need not be the cell Java found
+        String status = (success ? "PASSED" : "FAILED");
+        if (success)  // timing is appended to the log line only for passing runs
+            status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+                                          copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
         Log.i(TAG,
-                "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " + cellVal + ": " +
-                (success ? "PASSED" : "FAILED"));
+                "fz2: java input[" + javaRslt.x + ", " + javaRslt.y + "] == " + javaCellVal +
+                ", rs input[" + rsRslt.x + ", " + rsRslt.y + "] == " + rsCellVal + ": " + status);
         return success;
     }
 
     ///////////////////////////////////////////////////////////////////
 
-    private boolean fz3(RenderScript RS, ScriptC_reduce s) {
-        final int dimX = 59, dimY = 48, dimZ = 37;
+    private boolean fz3(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {  // checks that reduce_fz3 returns the (x, y, z) of a zero-valued cell; size = {dimX, dimY, dimZ}
+        final int dimX = size[0], dimY = size[1], dimZ = size[2];
         final int inputLen = dimX * dimY * dimZ;
 
-        int[] inputArray = createInputArrayInt(inputLen, 9);
+        int[] inputArray = createInputArrayInt(inputLen, seed+0);
         // just in case we got unlucky
-        inputArray[(new Random(10)).nextInt(inputLen)] = 0;
+        inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;  // seed+1 picks the cell that is forced to zero
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRsltLinear = fz(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final Int3 javaRslt = new Int3(  // linear index -> (x, y, z); x varies fastest, z slowest
+            javaRsltLinear % dimX,
+            (javaRsltLinear / dimX) % dimY,
+            javaRsltLinear / (dimX * dimY));
+        final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y + dimX * dimY * javaRslt.z];
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
 
         Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
         typeBuilder.setX(dimX).setY(dimY).setZ(dimZ);
         Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
         inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray);
 
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
         final Int3 rsRslt = s.reduce_fz3(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
 
-        final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
-        final boolean success = (cellVal == 0);
+        final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
+        final boolean success = (rsCellVal == 0);  // pass iff the cell RenderScript reports holds zero; it need not be the cell Java found
+        String status = (success ? "PASSED" : "FAILED");
+        if (success)  // timing is appended to the log line only for passing runs
+            status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+                                          copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
         Log.i(TAG,
-                "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + cellVal + ": " +
-                (success ? "PASSED" : "FAILED"));
+                "fz3: java input[" + javaRslt.x + ", " + javaRslt.y + ", " + javaRslt.z + "] == " + javaCellVal +
+                ", rs input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + rsCellVal + ": " + status);
         return success;
     }
 
@@ -271,24 +509,43 @@
         return outputArray;
     }
 
-    private boolean histogram(RenderScript RS, ScriptC_reduce s) {
-        final byte[] inputArray = createInputArrayByte(100000, 11);
+    private boolean histogram_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // array-input variant: hands the byte[] straight to reduce_histogram
+        final byte[] inputArray = createInputArrayByte(size[0], seed);
 
         final long[] javaRslt = histogram(RS, inputArray);
         _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
         final long[] rsRslt = s.reduce_histogram(inputArray).get();
         _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
 
-        for (int i = 0; i < histogramBucketCount; ++i) {
-            if (javaRslt[i] != rsRslt[i]) {
-                Log.i(TAG,
-                        "histogram[" + i + "]: java " + javaRslt[i] + ", rs " + rsRslt[i] + ": FAILED");
-                return false;
-            }
-        }
+        return result("histogram_array", new timing(size[0]), javaRslt, rsRslt);  // comparison and logging are delegated to result()
+    }
 
-        Log.i(TAG, "histogram: PASSED");
-        return true;
+    private boolean histogram(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // Allocation-input variant: copies the bytes into a U8 Allocation and timestamps each phase
+        final byte[] inputArray = createInputArrayByte(size[0], seed);
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final long[] javaRslt = histogram(RS, inputArray);  // reference result from the histogram(RS, byte[]) helper
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+        _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();  // RS timing starts before the Allocation is created
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final long[] rsRslt = s.reduce_histogram(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+        _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
+
+        // NOTE: The "java time" is actually for the RenderScript histogram intrinsic
+        return result("histogram",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaRslt, rsRslt);
     }
 
     //-----------------------------------------------------------------
@@ -302,17 +559,250 @@
         return new Int2(modeIdx, (int)hsg[modeIdx]);
     }
 
-    private boolean mode(RenderScript RS, ScriptC_reduce s) {
-        final byte[] inputArray = createInputArrayByte(100000, 12);
+    private boolean mode_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // array-input variant: hands the byte[] straight to reduce_mode
+        final byte[] inputArray = createInputArrayByte(size[0], seed);
 
         final Int2 javaRslt = mode(RS, inputArray);
         final Int2 rsRslt = s.reduce_mode(inputArray).get();
 
-        return result("mode", javaRslt, rsRslt);
+        return result("mode_array", new timing(size[0]), javaRslt, rsRslt);  // label matches the registered test name, as histogram_array does
     }
 
     ///////////////////////////////////////////////////////////////////
 
+    private long sumgcd(final int in1[], final int in2[]) {
+        _RS_ASSERT("sumgcd input length mismatch", in1.length == in2.length);
+
+        // Sum, over every index, the Euclidean-algorithm result for the
+        // pair (in1[idx], in2[idx]).
+        long total = 0;
+        for (int idx = 0; idx < in1.length; ++idx) {
+            int x = in1[idx];
+            int y = in2[idx];
+            // Replace (x, y) by (y, x % y) until the remainder is zero.
+            while (y != 0) {
+                final int rem = x % y;
+                x = y;
+                y = rem;
+            }
+            total += x;
+        }
+        return total;
+    }
+
+    private boolean sumgcd(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // compares reduce_sumgcd on two Allocations against the Java sumgcd(int[], int[])
+        final int len = size[0];
+
+        final int[] inputArrayA = createInputArrayInt(len, seed+0);  // the two inputs use distinct seeds: seed+0 and seed+1
+        final int[] inputArrayB = createInputArrayInt(len, seed+1);
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final long javaRslt = sumgcd(inputArrayA, inputArrayB);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();  // RS timing starts before the Allocations are created
+
+        Allocation inputAllocationA = Allocation.createSized(RS, Element.I32(RS), len);
+        Allocation inputAllocationB = Allocation.createSized(RS, Element.I32(RS), len);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();  // marks the start of the host-array -> Allocation copies
+
+        inputAllocationA.copyFrom(inputArrayA);
+        inputAllocationB.copyFrom(inputArrayB);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();  // the kernel interval includes the .get() on the result
+        final long rsRslt = s.reduce_sumgcd(inputAllocationA, inputAllocationB).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        return result("sumgcd",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart, copyTimeStart, kernelTimeStart, rsTimeEnd,
+                        inputAllocationA, inputAllocationB),
+                javaRslt, rsRslt);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    public static final int maxSeedsPerTest = 10;
+
+    static interface Test {
+        // A test execution is characterized by two properties: A seed
+        // and a size.
+        //
+        // The seed is used for generating pseudorandom input data.
+        // Ideally, we use different seeds for different tests and for
+        // different executions of the same test at different sizes.
+        // A test with multiple blocks of input data (i.e., for a
+        // reduction with multiple inputs) may want multiple seeds; it
+        // may use the seeds seed..seed+maxSeedsPerTest-1.
+        //
+        // The size indicates the amount of input data.  It is the number
+        // of cells in a particular dimension of the iteration space.
+        boolean run(RenderScript RS, ScriptC_reduce s, int seed, int[] size);
+    };
+
+    static class TestDescription {
+        // Name used when logging a run of this test.
+        public final String testName;
+
+        // Entry point that executes the test.
+        public final Test test;
+
+        // Base seed index.  Callers scale it before use (for example,
+        // by maxSeedsPerTest) to derive the actual seed.
+        public final int seed;
+
+        // Size to use when the test is run only once.
+        public final int[] defSize;
+
+        // log2 of the largest size to use when the test is run over
+        // a range of sizes; null if no range was given.
+        public final int[] log2MaxSize;
+
+        // Describes a test that has a default size but no size range.
+        public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize) {
+            this(myTestName, myTest, mySeed, myDefSize, null);
+        }
+
+        // Describes a test that has both a default size and a size range.
+        public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize, int[] myLog2MaxSize) {
+            testName    = myTestName;
+            test        = myTest;
+            seed        = mySeed;
+            defSize     = myDefSize;
+            log2MaxSize = myLog2MaxSize;
+        }
+    };
+
+    private boolean run(TestDescription td, RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        String arrayContent = "";
+        for (int i = 0; i < size.length; ++i) {
+            if (i != 0)
+                arrayContent += ", ";
+            arrayContent += size[i];
+        }
+        Log.i(TAG, "Running " + td.testName + "(seed = " + seed + ", size[] = {" + arrayContent + "})");
+        return td.test.run(RS, s, seed, size);
+    }
+
+    private final TestDescription[] correctnessTests = {
+        // alloc and array variants of the same test will use the same
+        // seed, in case results need to be compared.
+
+        new TestDescription("addint1D", this::addint1D, 0, new int[]{100000}, new int[]{20}),
+        new TestDescription("addint1D_array", this::addint1D_array, 0, new int[]{100000}, new int[]{20}),
+        new TestDescription("addint2D", this::addint2D, 1, new int[]{450, 225}),
+        new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000}, new int[]{20}),
+        new TestDescription("findMinAndMax_array", this::findMinAndMax_array, 3, new int[]{100000}, new int[]{20}),  // name follows the method, like the other *_array entries
+        new TestDescription("fz", this::fz, 4, new int[]{100000}, new int[]{20}),
+        new TestDescription("fz_array", this::fz_array, 4, new int[]{100000}, new int[]{20}),
+        new TestDescription("fz2", this::fz2, 5, new int[]{225, 450}),
+        new TestDescription("fz3", this::fz3, 6, new int[]{59, 48, 37}),
+        new TestDescription("histogram", this::histogram, 7, new int[]{100000}, new int[]{20}),
+        new TestDescription("histogram_array", this::histogram_array, 7, new int[]{100000}, new int[]{20}),
+        // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}, new int[]{20}),
+        new TestDescription("mode_array", this::mode_array, 8, new int[]{100000}, new int[]{20}),
+        new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 16}, new int[]{20})
+    };
+
+    private boolean runCorrectnessQuick(RenderScript RS, ScriptC_reduce s) {
+        boolean pass = true;
+
+        for (TestDescription td : correctnessTests) {
+            pass &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+        }
+
+        return pass;
+    }
+
+    private boolean runCorrectness(RenderScript RS, ScriptC_reduce s) {  // runs each size-ranged correctness test over a spread of sizes; returns false if any run failed
+        boolean pass = true;
+
+        for (TestDescription td : correctnessTests) {
+            if (td.log2MaxSize == null)  // TODO: Eventually this should never happen?
+                continue;
+
+            if (td.log2MaxSize.length == 1) {  // only one-dimensional size ranges are handled so far
+                final int log2MaxSize = td.log2MaxSize[0];
+                // We will execute the test with the following sizes:
+                // (a) Each power of 2 from zero (2**0) up to log2MaxSize (2**log2MaxSize)
+                // (b) Each size from (a) +/-1
+                // (c) 2 random sizes between adjacent points in (a)
+                int[] testSizes = new int[
+                    /* a */ (1 + log2MaxSize) +
+                    /* b */ 2*(1 + log2MaxSize) +
+                    /* c */ 2*log2MaxSize];
+
+                // NOTE: Each test execution gets maxSeedsPerTest, and
+                // there are up to 3 + 5*log2MaxSize test executions
+                // of a test, and we need a seed for (c).  Assuming
+                // log2MaxSize does not exceed 32, then it should be
+                // sufficient to reserve 1 + 5*32*maxSeedsPerTest seeds
+                // per TestDescription.
+                final int seedForPickingTestSizes = td.seed * (1 + 5*32*maxSeedsPerTest);
+
+                int nextTestIdx = 0;
+
+                // Fill in (a) and (b)
+                for (int i = 0; i <= log2MaxSize; ++i) {
+                    final int pwrOf2 = 1 << i;
+                    testSizes[nextTestIdx++] = pwrOf2;      /* a */
+                    testSizes[nextTestIdx++] = pwrOf2 - 1;  /* b */
+                    testSizes[nextTestIdx++] = pwrOf2 + 1;  /* b */
+                }
+
+                // Fill in (c)
+                Random r = new Random(seedForPickingTestSizes);
+                for (int i = 0; i < log2MaxSize; ++i) {
+                    final int lo = (1 << i) + 1;
+                    final int hi = 1 << (i + 1);
+
+                    if (lo < hi) {  // false only when i == 0 (lo == hi == 2); those two slots keep their initial 0
+                        for (int j = 0; j < 2; ++j) {
+                            testSizes[nextTestIdx++] = r.nextInt(hi - lo) + lo;
+                        }
+                    }
+                }
+
+                Arrays.sort(testSizes);  // sorting makes repeated sizes adjacent for the filter below
+
+                int[] lastTestSizeArg = new int[]{-1};  // one-element size[] reused for every run
+                for (int i = 0; i < testSizes.length; ++i) {
+                    if ((testSizes[i] > 0) && (testSizes[i] != lastTestSizeArg[0])) {  // skip zero/unused slots and repeated sizes
+                        lastTestSizeArg[0] = testSizes[i];
+                        final int seedForTestExecution = seedForPickingTestSizes + 1 + i*maxSeedsPerTest;  // i indexes the sorted array, so skipped slots still consume a seed range
+                        pass &= run(td, RS, s, seedForTestExecution, lastTestSizeArg);
+                    }
+                }
+            }
+            // TODO: lengths 2 and 3, and assert otherwise
+        }
+
+        return pass;
+    }
+
+    private final TestDescription[] performanceTests = {
+        new TestDescription("addint1D", this::addint1D, 0, new int[]{100000 << 10}),
+        new TestDescription("addint2D", this::addint2D, 1, new int[]{450 << 5, 225 << 5}),
+        new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000 << 9}),
+        new TestDescription("fz", this::fz, 4, new int[]{100000 << 10}),
+        new TestDescription("fz2", this::fz2, 5, new int[]{225 << 5, 450 << 5}),
+        new TestDescription("fz3", this::fz3, 6, new int[]{59 << 3, 48 << 3, 37 << 3}),
+        new TestDescription("histogram", this::histogram, 7, new int[]{100000 << 10}),
+        // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}),
+        new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 21})
+    };
+
+    private boolean runPerformanceQuick(RenderScript RS, ScriptC_reduce s) {
+        boolean pass = true;
+
+        for (TestDescription td : performanceTests) {
+            pass &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+        }
+
+        return pass;
+    }
+
+
     public void run() {
         RenderScript pRS = RenderScript.create(mCtx);
         ScriptC_reduce s = new ScriptC_reduce(pRS);
@@ -320,15 +810,10 @@
         s.set_posInf(Float.POSITIVE_INFINITY);
 
         boolean pass = true;
-        pass &= addint1D(pRS, s);
-        pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
-        pass &= findMinAndMax(pRS, s);
-        pass &= fz(pRS, s);
-        pass &= fz2(pRS, s);
-        pass &= fz3(pRS, s);
-        pass &= histogram(pRS, s);
-        pass &= mode(pRS, s);
+
+        pass &= runCorrectnessQuick(pRS, s);
+        pass &= runCorrectness(pRS, s);
+        // pass &= runPerformanceQuick(pRS, s);
 
         pRS.finish();
         pRS.destroy();
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
index 3a64a73..6a50d2b 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@
 
     ///////////////////////////////////////////////////////////////////
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
     private Int2 findMinAndMax(float[] input) {
         float minVal = Float.POSITIVE_INFINITY;
         int minIdx = -1;
@@ -322,7 +289,6 @@
         boolean pass = true;
         pass &= addint1D(pRS, s);
         pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
         pass &= findMinAndMax(pRS, s);
         pass &= fz(pRS, s);
         pass &= fz2(pRS, s);
diff --git a/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs b/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs
index 331a871..eef3a8a 100644
--- a/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs
@@ -88,6 +88,12 @@
     h1 = fn(h3);            \
     h1 = fn(h4);
 
+#define TEST_H_FUNC_HN_HN(fn) \
+    h1 = fn(h1, h1);          \
+    h1 = fn(h2, h2);          \
+    h1 = fn(h3, h3);          \
+    h1 = fn(h4, h4);
+
 static bool testAPI() {
     TEST_HN_FUNC_HN(acos);
     TEST_HN_FUNC_HN(acosh);
@@ -138,7 +144,6 @@
     TEST_IN_FUNC_HN(ilogb);
     TEST_HN_FUNC_HN_IN(ldexp);
     TEST_HN_FUNC_HN_I(ldexp);
-    TEST_H_FUNC_HN(length);
     TEST_HN_FUNC_HN(lgamma);
     TEST_HN_FUNC_HN_PIN(lgamma);
 
@@ -191,7 +196,6 @@
     TEST_HN_FUNC_HN(native_log1p);
     TEST_HN_FUNC_HN(native_log2);
 
-    TEST_HN_FUNC_HN(native_normalize);
     TEST_HN_FUNC_HN_HN(native_powr);
     TEST_HN_FUNC_HN(native_recip);
     TEST_HN_FUNC_HN_IN(native_rootn);
@@ -207,7 +211,6 @@
     TEST_HN_FUNC_HN(native_tanpi);
 
     TEST_HN_FUNC_HN_HN(nextafter);
-    TEST_HN_FUNC_HN(normalize);
     TEST_HN_FUNC_HN_HN(pow);
     TEST_HN_FUNC_HN_IN(pown);
     TEST_HN_FUNC_HN_HN(powr);
@@ -241,6 +244,14 @@
     // Vector math functions
     h3 = cross(h3, h3);
     h4 = cross(h4, h4);
+
+    TEST_H_FUNC_HN_HN(distance);
+    TEST_H_FUNC_HN_HN(dot);
+    TEST_H_FUNC_HN(length);
+    TEST_H_FUNC_HN_HN(native_distance);
+    TEST_H_FUNC_HN(native_length);
+    TEST_HN_FUNC_HN(native_normalize);
+    TEST_HN_FUNC_HN(normalize);
     return true;
 }
 
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
index be09dfb..97b45e0 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@
 
 /////////////////////////////////////////////////////////////////////////
 
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
 #pragma rs reduce(findMinAndMax) \
   initializer(fMMInit) accumulator(fMMAccumulator) \
   combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)  // a strictly smaller minimum replaces accum's; ties keep accum's entry
+    accum->min = val->min;
+  if (val->max.val > accum->max.val)  // a strictly larger maximum replaces accum's; ties keep accum's entry
+    accum->max = val->max;
 }
 
 static void fMMOutConverter(int2 *result,
@@ -160,3 +150,24 @@
   result->x = mode;
   result->y = (*h)[mode];
 }
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(sumgcd) accumulator(sgAccum) combiner(sgCombine)
+
+// Euclidean algorithm: replace (a, b) by (b, a % b) until b is zero.
+// Returns a unchanged when b is already zero on entry.
+static int gcd(int a, int b) {
+  while (b != 0) {
+    const int rem = a % b;
+    a = b;
+    b = rem;
+  }
+  return a;
+}
+
+// accumulator function: add the gcd of one input pair to the running sum
+static void sgAccum(long *accum, int a, int b) { *accum += gcd(a, b); }
+
+// combiner function: fold another partial sum into this one
+static void sgCombine(long *accum, const long *other) { *accum += *other; }
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
index 419e709..41252c8 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@
 
 /////////////////////////////////////////////////////////////////////////
 
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
 typedef struct {
   float val;
   int idx;
@@ -56,8 +44,10 @@
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)  // a strictly smaller minimum replaces accum's; ties keep accum's entry
+    accum->min = val->min;
+  if (val->max.val > accum->max.val)  // a strictly larger maximum replaces accum's; ties keep accum's entry
+    accum->max = val->max;
 }
 
 static void fMMOutConverter(int2 *result,
diff --git a/rs.spec b/rs.spec
index 608f324..16908dd 100644
--- a/rs.spec
+++ b/rs.spec
@@ -1,5 +1,4 @@
 ContextDestroy {
-    direct
 }
 
 ContextGetMessage {
diff --git a/rsContext.cpp b/rsContext.cpp
index 737d636..eab8bae 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -259,6 +259,8 @@
     rsc->props.mLogShadersAttr = getProp("debug.rs.shader.attributes") != 0;
     rsc->props.mLogShadersUniforms = getProp("debug.rs.shader.uniforms") != 0;
     rsc->props.mLogVisual = getProp("debug.rs.visual") != 0;
+    rsc->props.mLogReduceAccum = getProp("debug.rs.reduce-accum") != 0;
+    rsc->props.mDebugReduceSplitAccum = getProp("debug.rs.reduce-split-accum") != 0;
     rsc->props.mDebugMaxThreads = getProp("debug.rs.max-threads");
 
     if (getProp("debug.rs.debug") != 0) {
diff --git a/rsContext.h b/rsContext.h
index 890459d..e809792 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -227,6 +227,8 @@
         bool mLogShadersAttr;
         bool mLogShadersUniforms;
         bool mLogVisual;
+        bool mLogReduceAccum;
+        bool mDebugReduceSplitAccum;
         uint32_t mDebugMaxThreads;
     } props;
 
diff --git a/scriptc/rs_convert.rsh b/scriptc/rs_convert.rsh
index 9ffc183..4c318d4 100644
--- a/scriptc/rs_convert.rsh
+++ b/scriptc/rs_convert.rsh
@@ -19,7 +19,7 @@
 /*
  * rs_convert.rsh: Conversion Functions
  *
- * The functions below convert from a numerical vector type to another, of from one color
+ * The functions below convert from a numerical vector type to another, or from one color
  * representation to another.
  */
 
@@ -1247,6 +1247,21 @@
 #endif
 
 #if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
 extern float2 __attribute__((const, overloadable))
     convert_float2(half2 v);
 #endif
diff --git a/scriptc/rs_vector_math.rsh b/scriptc/rs_vector_math.rsh
index d611464..2f5e8e7 100644
--- a/scriptc/rs_vector_math.rsh
+++ b/scriptc/rs_vector_math.rsh
@@ -294,6 +294,26 @@
     native_distance(float4 left_vector, float4 right_vector);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half4 left_vector, half4 right_vector);
+#endif
+
 /*
  * native_length: Approximate length of a vector
  *