llvmpipe: fix snorm blending

The blend math gets a bit funky due to inverse blend factors being
in range [0,2] rather than [-1,1], our normalized math can't really
cover this.
src_alpha_saturate blend factor has a similar problem too.
(Note that piglit fbo-blending-formats test is mostly useless for
anything but unorm formats, since not just all src/dst values are
between [0,1], but the tests are crafted in a way that the results
are between [0,1] too.)

v2: some formatting fixes, and fix a fairly obscure (to debug)
issue with alpha-only formats (not related to snorm at all), where
blend optimization would think it could simplify the blend equation
if the blend factors were complementary, however was using the
completely unrelated rgb blend factors instead of the alpha ones...

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index a1edd34..321c6e4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -541,38 +541,38 @@
    assert(lp_check_value(type, a));
    assert(lp_check_value(type, b));
 
-   if(a == bld->zero)
+   if (a == bld->zero)
       return b;
-   if(b == bld->zero)
+   if (b == bld->zero)
       return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
       return bld->undef;
 
-   if(bld->type.norm) {
+   if (type.norm) {
       const char *intrinsic = NULL;
 
-      if(a == bld->one || b == bld->one)
+      if (!type.sign && (a == bld->one || b == bld->one))
         return bld->one;
 
       if (!type.floating && !type.fixed) {
          if (type.width * type.length == 128) {
-            if(util_cpu_caps.has_sse2) {
-              if(type.width == 8)
+            if (util_cpu_caps.has_sse2) {
+              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
             } else if (util_cpu_caps.has_altivec) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
             }
          }
          if (type.width * type.length == 256) {
-            if(util_cpu_caps.has_avx2) {
-              if(type.width == 8)
+            if (util_cpu_caps.has_avx2) {
+              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
             }
          }
@@ -842,38 +842,38 @@
    assert(lp_check_value(type, a));
    assert(lp_check_value(type, b));
 
-   if(b == bld->zero)
+   if (b == bld->zero)
       return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
       return bld->undef;
-   if(a == b)
+   if (a == b)
       return bld->zero;
 
-   if(bld->type.norm) {
+   if (type.norm) {
       const char *intrinsic = NULL;
 
-      if(b == bld->one)
+      if (!type.sign && b == bld->one)
         return bld->zero;
 
       if (!type.floating && !type.fixed) {
          if (type.width * type.length == 128) {
             if (util_cpu_caps.has_sse2) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
             } else if (util_cpu_caps.has_altivec) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
             }
          }
          if (type.width * type.length == 256) {
             if (util_cpu_caps.has_avx2) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
             }
          }
@@ -963,7 +963,7 @@
  * @sa Michael Herf, The "double blend trick", May 2000, 
  *     http://www.stereopsis.com/doubleblend.html
  */
-static LLVMValueRef
+LLVMValueRef
 lp_build_mul_norm(struct gallivm_state *gallivm,
                   struct lp_type wide_type,
                   LLVMValueRef a, LLVMValueRef b)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 2a4137a..f5b2800 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -71,6 +71,13 @@
              LLVMValueRef a,
              LLVMValueRef b);
 
+
+LLVMValueRef
+lp_build_mul_norm(struct gallivm_state *gallivm,
+                  struct lp_type wide_type,
+                  LLVMValueRef a,
+                  LLVMValueRef b);
+
 LLVMValueRef
 lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
index 1feb415..02ec55e 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
@@ -35,6 +35,7 @@
 #include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_pack.h"
 
 #include "lp_bld_blend.h"
 
@@ -65,11 +66,11 @@
 boolean
 lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
 {
-   if(rgb_func == alpha_func)
+   if (rgb_func == alpha_func)
       return FALSE;
-   if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
+   if (rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
       return TRUE;
-   if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
+   if (rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
       return TRUE;
    return FALSE;
 }
@@ -81,11 +82,65 @@
 static inline boolean
 lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
 {
+   STATIC_ASSERT((PIPE_BLENDFACTOR_ZERO ^ 0x10) == PIPE_BLENDFACTOR_ONE);
+   STATIC_ASSERT((PIPE_BLENDFACTOR_CONST_COLOR ^ 0x10) ==
+                 PIPE_BLENDFACTOR_INV_CONST_COLOR);
    return dst_factor == (src_factor ^ 0x10);
 }
 
 
 /**
+ * Whether this is a inverse blend factor
+ */
+static inline boolean
+is_inverse_factor(unsigned factor)
+{
+   STATIC_ASSERT(PIPE_BLENDFACTOR_ZERO == 0x11);
+   return factor > 0x11;
+}
+
+
+/**
+ * Calculates the (expanded to wider type) multiplication
+ * of 2 normalized numbers.
+ */
+static void
+lp_build_mul_norm_expand(struct lp_build_context *bld,
+                         LLVMValueRef a, LLVMValueRef b,
+                         LLVMValueRef *resl, LLVMValueRef *resh,
+                         boolean signedness_differs)
+{
+   const struct lp_type type = bld->type;
+   struct lp_type wide_type = lp_wider_type(type);
+   struct lp_type wide_type2 = wide_type;
+   struct lp_type type2 = type;
+   LLVMValueRef al, ah, bl, bh;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+   assert(!type.floating && !type.fixed && type.norm);
+
+   if (a == bld->zero || b == bld->zero) {
+      LLVMValueRef zero = LLVMConstNull(lp_build_vec_type(bld->gallivm, wide_type));
+      *resl = zero;
+      *resh = zero;
+      return;
+   }
+
+   if (signedness_differs) {
+      type2.sign = !type.sign;
+      wide_type2.sign = !wide_type2.sign;
+   }
+
+   lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+   lp_build_unpack2_native(bld->gallivm, type2, wide_type2, b, &bl, &bh);
+
+   *resl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
+   *resh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
+}
+
+
+/**
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
  */
 LLVMValueRef
@@ -155,7 +210,7 @@
             } else {
                return lp_build_lerp(bld, dst_factor, src, dst, 0);
             }
-         } else if(bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
+         } else if (bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
             result = lp_build_add(bld, src, dst);
 
             if (factor_src < factor_dst) {
@@ -165,7 +220,7 @@
                result = lp_build_mul(bld, result, dst_factor);
                return lp_build_sub(bld, src, result);
             }
-         } else if(bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
+         } else if (bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
             result = lp_build_add(bld, src, dst);
 
             if (factor_src < factor_dst) {
@@ -192,9 +247,72 @@
    if (optimise_only)
       return NULL;
 
-   src_term = lp_build_mul(bld, src, src_factor);
-   dst_term = lp_build_mul(bld, dst, dst_factor);
-   return lp_build_blend_func(bld, func, src_term, dst_term);
+   if ((bld->type.norm && bld->type.sign) &&
+       (is_inverse_factor(factor_src) || is_inverse_factor(factor_dst))) {
+      /*
+       * With snorm blending, the inverse blend factors range from [0,2]
+       * instead of [-1,1], so the ordinary signed normalized arithmetic
+       * doesn't quite work. Unpack must be unsigned, and the add/sub
+       * must be done with wider type.
+       * (Note that it's not quite obvious what the blend equation wrt to
+       * clamping should actually be based on GL spec in this case, but
+       * really the incoming src values are clamped to [-1,1] (the dst is
+       * always clamped already), and then NO further clamping occurs until
+       * the end.)
+       */
+      struct lp_build_context bldw;
+      struct lp_type wide_type = lp_wider_type(bld->type);
+      LLVMValueRef src_terml, src_termh, dst_terml, dst_termh;
+      LLVMValueRef resl, resh;
+
+      /*
+       * We don't need saturate math for the sub/add, since we have
+       * x+1 bit numbers in x*2 wide type (result is x+2 bits).
+       * (Doesn't really matter on x86 sse2 though as we use saturated
+       * intrinsics.)
+       */
+      wide_type.norm = 0;
+      lp_build_context_init(&bldw, bld->gallivm, wide_type);
+
+      /*
+       * XXX This is a bit hackish. Note that -128 really should
+       * be -1.0, the same as -127. However, we did not actually clamp
+       * things anywhere (relying on pack intrinsics instead) therefore
+       * we will get -128, and the inverted factor then 255. But the mul
+       * can overflow in this case (rather the rounding fixups for the mul,
+       * -128*255 will be positive).
+       * So we clamp the src and dst up here but only when necessary (we
+       * should do this before calculating blend factors but it's enough
+       * for avoiding overflow).
+       */
+      if (is_inverse_factor(factor_src)) {
+         src = lp_build_max(bld, src,
+                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
+      }
+      if (is_inverse_factor(factor_dst)) {
+         dst = lp_build_max(bld, dst,
+                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
+      }
+
+      lp_build_mul_norm_expand(bld, src, src_factor, &src_terml, &src_termh,
+                               is_inverse_factor(factor_src) ? TRUE : FALSE);
+      lp_build_mul_norm_expand(bld, dst, dst_factor, &dst_terml, &dst_termh,
+                               is_inverse_factor(factor_dst) ? TRUE : FALSE);
+      resl = lp_build_blend_func(&bldw, func, src_terml, dst_terml);
+      resh = lp_build_blend_func(&bldw, func, src_termh, dst_termh);
+
+      /*
+       * XXX pack2_native is not ok because the values have to be in dst
+       * range. We need native pack though for the correct order on avx2.
+       * Will break on everything not implementing clamping pack intrinsics
+       * (i.e. everything but sse2 and altivec).
+       */
+      return lp_build_pack2_native(bld->gallivm, wide_type, bld->type, resl, resh);
+   } else {
+      src_term = lp_build_mul(bld, src, src_factor);
+      dst_term = lp_build_mul(bld, dst, dst_factor);
+      return lp_build_blend_func(bld, func, src_term, dst_term);
+   }
 }
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index 45c5c2b..c16ef1a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -112,22 +112,34 @@
    case PIPE_BLENDFACTOR_DST_ALPHA:
       return bld->dst;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(alpha)
+      if (alpha)
          return bld->base.one;
       else {
          /*
           * If there's no dst alpha the complement is zero but for unclamped
-          * float inputs min can be non-zero (negative).
+          * float inputs (or snorm inputs) min can be non-zero (negative).
           */
-         if (!bld->has_dst_alpha) {
-            if (!bld->saturate)
+         if (!bld->saturate) {
+            if (!bld->has_dst_alpha) {
                bld->saturate = lp_build_min(&bld->base, src_alpha, bld->base.zero);
-         }
-         else {
-            if(!bld->inv_dst)
-               bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-            if(!bld->saturate)
+            }
+            else if (bld->base.type.norm && bld->base.type.sign) {
+               /*
+                * The complement/min totally doesn't work, since
+                * the complement is in range [0,2] but the other
+                * min input is [-1,1]. However, we can just clamp to 0
+                * before doing the complement...
+                */
+               LLVMValueRef inv_dst;
+               inv_dst = lp_build_max(&bld->base, bld->base.zero, bld->dst);
+               inv_dst = lp_build_comp(&bld->base, inv_dst);
+               bld->saturate = lp_build_min(&bld->base, src_alpha, inv_dst);
+            } else {
+               if (!bld->inv_dst) {
+                  bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
+               }
                bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst);
+            }
          }
          return bld->saturate;
       }
@@ -140,24 +152,24 @@
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
       return src1_alpha;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      if(!bld->inv_src)
+      if (!bld->inv_src)
          bld->inv_src = lp_build_comp(&bld->base, bld->src);
       return bld->inv_src;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src_alpha)
+      if (!bld->inv_src_alpha)
          bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha);
       return bld->inv_src_alpha;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst)
+      if (!bld->inv_dst)
          bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
       return bld->inv_dst;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      if(!bld->inv_const)
+      if (!bld->inv_const)
          bld->inv_const = lp_build_comp(&bld->base, bld->const_);
       return bld->inv_const;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_const_alpha)
+      if (!bld->inv_const_alpha)
          bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
       return bld->inv_const_alpha;
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
@@ -331,7 +343,7 @@
    bld.const_alpha = const_alpha;
    bld.has_dst_alpha = FALSE;
 
-   /* Find the alpha channel if not provided seperately */
+   /* Find the alpha channel if not provided separately */
    if (!src_alpha) {
       for (i = 0; i < 4; ++i) {
          if (swizzle[i] == 3) {
@@ -349,7 +361,7 @@
    }
 
    if (blend->logicop_enable) {
-      if(!type.floating) {
+      if (!type.floating) {
          result = lp_build_logicop(gallivm->builder, blend->logicop_func, src, dst);
       }
       else {
@@ -361,6 +373,7 @@
       boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor &&
                                 state->alpha_src_factor == state->alpha_dst_factor) ||
                                nr_channels == 1;
+      boolean alpha_only = nr_channels == 1 && alpha_swizzle == PIPE_SWIZZLE_X;
 
       src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
                                          state->alpha_src_factor,
@@ -374,8 +387,8 @@
 
       result = lp_build_blend(&bld.base,
                               state->rgb_func,
-                              state->rgb_src_factor,
-                              state->rgb_dst_factor,
+                              alpha_only ? state->alpha_src_factor : state->rgb_src_factor,
+                              alpha_only ? state->alpha_dst_factor : state->rgb_dst_factor,
                               src,
                               dst,
                               src_factor,
@@ -383,8 +396,8 @@
                               rgb_alpha_same,
                               false);
 
-      if(state->rgb_func != state->alpha_func && nr_channels > 1 &&
-                            alpha_swizzle != PIPE_SWIZZLE_NONE) {
+      if (state->rgb_func != state->alpha_func && nr_channels > 1 &&
+          alpha_swizzle != PIPE_SWIZZLE_NONE) {
          LLVMValueRef alpha;
 
          alpha = lp_build_blend(&bld.base,