start on SkJumper lowp mode

Only three stages are implemented so far:

  load_8888
  swap_rb
  store_8888

That's enough to build the shortest non-trivial pipeline,
which is the one exercised by the new unit test.

Change-Id: Iabf90866ab452f7183d8c8dec1405ece2db695dc
Reviewed-on: https://skia-review.googlesource.com/18458
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index c5840d3..77a5856 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -29,6 +29,7 @@
 // We can't express the real types of most stage functions portably, so we use a stand-in.
 // We'll only ever call start_pipeline(), which then chains into the rest for us.
 using StageFn = void(void);
+using StartPipelineFn = void(size_t,size_t,size_t,void**,K*);
 
 // Some platforms expect C "name" maps to asm "_name", others to "name".
 #if defined(__APPLE__)
@@ -43,29 +44,31 @@
     // We'll just run portable code.
 
 #elif defined(__aarch64__)
-    void ASM(start_pipeline,aarch64)(size_t,size_t,size_t, void**, K*);
+    StartPipelineFn ASM(start_pipeline,aarch64);
     StageFn ASM(just_return,aarch64);
     #define M(st) StageFn ASM(st,aarch64);
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
 
 #elif defined(__arm__)
-    void ASM(start_pipeline,vfp4)(size_t,size_t,size_t, void**, K*);
+    StartPipelineFn ASM(start_pipeline,vfp4);
     StageFn ASM(just_return,vfp4);
     #define M(st) StageFn ASM(st,vfp4);
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
 
 #elif defined(__x86_64__) || defined(_M_X64)
-    void ASM(start_pipeline,hsw  )(size_t,size_t,size_t, void**, K*);
-    void ASM(start_pipeline,avx  )(size_t,size_t,size_t, void**, K*);
-    void ASM(start_pipeline,sse41)(size_t,size_t,size_t, void**, K*);
-    void ASM(start_pipeline,sse2 )(size_t,size_t,size_t, void**, K*);
+    StartPipelineFn ASM(start_pipeline,hsw       ),
+                    ASM(start_pipeline,avx       ),
+                    ASM(start_pipeline,sse41     ),
+                    ASM(start_pipeline,sse2      ),
+                    ASM(start_pipeline,ssse3_lowp);
 
     StageFn ASM(just_return,hsw),
             ASM(just_return,avx),
             ASM(just_return,sse41),
-            ASM(just_return,sse2);
+            ASM(just_return,sse2),
+            ASM(just_return,ssse3_lowp);
 
     #define M(st) StageFn ASM(st,hsw);
         SK_RASTER_PIPELINE_STAGES(M)
@@ -79,10 +82,14 @@
     #define M(st) StageFn ASM(st,sse2);
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
+
+    StageFn ASM(load_8888,  ssse3_lowp),
+            ASM(store_8888, ssse3_lowp),
+            ASM(swap_rb,    ssse3_lowp);
 #endif
 
     // Portable, single-pixel stages.
-    void sk_start_pipeline(size_t,size_t,size_t, void**, K*);
+    StartPipelineFn sk_start_pipeline;
     StageFn sk_just_return;
     #define M(st) StageFn sk_##st;
         SK_RASTER_PIPELINE_STAGES(M)
@@ -170,43 +177,72 @@
     return kPortable;
 }
 
-void SkRasterPipeline::build_pipeline(const SkJumper_Engine& engine, void** ip) const {
-    const StageList* st = fStages;
+StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
+#if defined(__x86_64__) || defined(_M_X64)
+    if (SkCpu::Supports(SkCpu::SSSE3)) {
+        void** reset_point = ip;
+
+        *--ip = (void*)ASM(just_return,ssse3_lowp);
+        for (const StageList* st = fStages; st; st = st->prev) {
+            StageFn* fn = nullptr;
+            switch (st->stage) {
+                case SkRasterPipeline::load_8888:  fn = ASM(load_8888, ssse3_lowp); break;
+                case SkRasterPipeline::store_8888: fn = ASM(store_8888,ssse3_lowp); break;
+                case SkRasterPipeline::swap_rb:    fn = ASM(swap_rb,   ssse3_lowp); break;
+                default:
+                    //SkDebugf("can't %d\n", st->stage);
+                    ip = reset_point;
+            }
+            if (ip == reset_point) {
+                break;
+            }
+            if (st->ctx) {
+                *--ip = st->ctx;
+            }
+            *--ip = (void*)fn;
+        }
+
+        if (ip != reset_point) {
+            return ASM(start_pipeline,ssse3_lowp);
+        }
+    }
+#endif
+    gChooseEngineOnce([]{ gEngine = choose_engine(); });
+
     // We're building the pipeline backwards, so we start with the final stage just_return.
-    *--ip = (void*)engine.just_return;
+    *--ip = (void*)gEngine.just_return;
 
     // Still going backwards, each stage's context pointer then its StageFn.
-    for (; st; st = st->prev) {
+    for (const StageList* st = fStages; st; st = st->prev) {
         if (st->ctx) {
             *--ip = st->ctx;
         }
-        *--ip = (void*)engine.stages[st->stage];
+        *--ip = (void*)gEngine.stages[st->stage];
     }
+    return gEngine.start_pipeline;
 }
 
 void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
     if (this->empty()) {
         return;
     }
-    gChooseEngineOnce([]{ gEngine = choose_engine(); });
 
     // Best to not use fAlloc here... we can't bound how often run() will be called.
     SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
 
-    this->build_pipeline(gEngine, program.get() + fSlotsNeeded);
-    gEngine.start_pipeline(x,y,x+n, program.get(), &kConstants);
+    auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
+    start_pipeline(x,y,x+n, program.get(), &kConstants);
 }
 
 std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
     if (this->empty()) {
         return [](size_t, size_t, size_t) {};
     }
-    gChooseEngineOnce([]{ gEngine = choose_engine(); });
 
     void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
-    this->build_pipeline(gEngine, program + fSlotsNeeded);
+    auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
 
     return [=](size_t x, size_t y, size_t n) {
-        gEngine.start_pipeline(x,y,x+n, program, &kConstants);
+        start_pipeline(x,y,x+n, program, &kConstants);
     };
 }