NEON fast path for box blur

Calculate 8 channels in parallel by using 16 bits to store each channel. Due to the limitation of VQRDMULH, which computes (int16 * int16 * 2 + 0x8000) >> 16, the fast path can only support kernelSize < 128.
At least 8 significant bits are kept in each stage; the final error should be less than or equal to 1.

Pre-fetch memory for X-direction reads. In fact, pre-fetching memory doesn't help much for Y-direction reads, since it is wasteful to load a whole cache line to read only 8 bytes. (I left it there to keep the symmetry; pre-fetch is cheap :) )

bench data on Nexus 10
before:
running bench [640 480]      blur_image_filter_large_10.00_10.00   8888:  cmsecs =  25081.48
running bench [640 480]      blur_image_filter_small_10.00_10.00   8888:  cmsecs =  25038.04
running bench [640 480]        blur_image_filter_large_1.00_1.00   8888:  cmsecs =  25209.04
running bench [640 480]        blur_image_filter_small_1.00_1.00   8888:  cmsecs =  24928.01
running bench [640 480]        blur_image_filter_large_0.00_1.00   8888:  cmsecs =  17160.98
running bench [640 480]       blur_image_filter_large_0.00_10.00   8888:  cmsecs =  17924.11
running bench [640 480]        blur_image_filter_large_1.00_0.00   8888:  cmsecs =  14609.19
running bench [640 480]       blur_image_filter_large_10.00_0.00   8888:  cmsecs =  14625.91

after:
running bench [640 480]      blur_image_filter_large_10.00_10.00   8888:  cmsecs =  14848.42
running bench [640 480]      blur_image_filter_small_10.00_10.00   8888:  cmsecs =  16037.29
running bench [640 480]        blur_image_filter_large_1.00_1.00   8888:  cmsecs =  14819.55
running bench [640 480]        blur_image_filter_small_1.00_1.00   8888:  cmsecs =  14563.69
running bench [640 480]        blur_image_filter_large_0.00_1.00   8888:  cmsecs =  11905.34
running bench [640 480]       blur_image_filter_large_0.00_10.00   8888:  cmsecs =  11883.85
running bench [640 480]        blur_image_filter_large_1.00_0.00   8888:  cmsecs =   9576.51
running bench [640 480]       blur_image_filter_large_10.00_0.00   8888:  cmsecs =   9793.84

BUG=
R=senorblanco@chromium.org, mtklein@google.com, reed@google.com, kevin.petit@arm.com, kevin.petit.arm@gmail.com

Author: zheng.xu@arm.com

Review URL: https://codereview.chromium.org/105893003

git-svn-id: http://skia.googlecode.com/svn/trunk@13036 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/gm/imageblur.cpp b/gm/imageblur.cpp
index 0e50653..4c3f42d 100644
--- a/gm/imageblur.cpp
+++ b/gm/imageblur.cpp
@@ -16,13 +16,15 @@
 
 class ImageBlurGM : public GM {
 public:
-    ImageBlurGM() {
+    ImageBlurGM(SkScalar sigmaX, SkScalar sigmaY, const char* suffix)
+        : fSigmaX(sigmaX), fSigmaY(sigmaY) {
         this->setBGColor(0xFF000000);
+        fName.printf("imageblur%s", suffix);
     }
 
 protected:
     virtual SkString onShortName() {
-        return SkString("imageblur");
+        return fName;
     }
 
     virtual SkISize onISize() {
@@ -31,7 +33,7 @@
 
     virtual void onDraw(SkCanvas* canvas) {
         SkPaint paint;
-        paint.setImageFilter(new SkBlurImageFilter(24.0f, 0.0f))->unref();
+        paint.setImageFilter(new SkBlurImageFilter(fSigmaX, fSigmaY))->unref();
         canvas->saveLayer(NULL, &paint);
         const char* str = "The quick brown fox jumped over the lazy dog.";
 
@@ -50,12 +52,19 @@
     }
 
 private:
+    SkScalar fSigmaX;
+    SkScalar fSigmaY;
+    SkString fName;
+
     typedef GM INHERITED;
 };
 
 //////////////////////////////////////////////////////////////////////////////
 
-static GM* MyFactory(void*) { return new ImageBlurGM; }
-static GMRegistry reg(MyFactory);
+static GM* MyFactory1(void*) { return new ImageBlurGM(24.0f, 0.0f, ""); }
+static GMRegistry reg1(MyFactory1);
+
+static GM* MyFactory2(void*) { return new ImageBlurGM(80.0f, 80.0f, "_large"); }
+static GMRegistry reg2(MyFactory2);
 
 }