Switch back to stack-based SkSL interpreter

It's slower, but code size is quite a bit smaller, memory usage is
smaller, and we think that mapping it to SkVM is just as easy.

This effectively reverts all of the following commits:

"Fix gcc9 warning around size of memset."
https://skia-review.googlesource.com/c/skia/+/279861

"Remove unused (and misleading) 'instruction' from SkSLInterpreter.h"
https://skia-review.googlesource.com/c/skia/+/278177

"Interpreter: Fix intrinsics when called with vector types"
https://skia-review.googlesource.com/c/skia/+/272721

"Make it easier to add vector versions of byte code instructions"
https://skia-review.googlesource.com/c/skia/+/272527

"Interpreter: Support returns from runStriped"
https://skia-review.googlesource.com/c/skia/+/268941

"add SkSLInterpreter vector instructions"
https://skia-review.googlesource.com/c/skia/+/266560

"Fix crash when editing particle scripts"
https://skia-review.googlesource.com/c/skia/+/269487

"Revert "Revert "Complete rewrite of the SkSL interpreter"""
https://skia-review.googlesource.com/c/skia/+/266205

Change-Id: I4258596399c4ca94489d4faf8aacfec88afeee13
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281205
Reviewed-by: Mike Klein <mtklein@google.com>
Reviewed-by: Ethan Nicholas <ethannicholas@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/src/sksl/SkSLByteCodeGenerator.cpp b/src/sksl/SkSLByteCodeGenerator.cpp
index 3a65d8c..d2e960a 100644
--- a/src/sksl/SkSLByteCodeGenerator.cpp
+++ b/src/sksl/SkSLByteCodeGenerator.cpp
@@ -7,24 +7,50 @@
 
 #include "src/sksl/SkSLByteCodeGenerator.h"
 
+#include <algorithm>
+
 namespace SkSL {
 
-ByteCodeGenerator::ByteCodeGenerator(const Program* program, ErrorReporter* errors,
-                                     ByteCode* output)
+// Classifies 'type' as bool, signed, unsigned, or float, based on its (component) type name.
+static TypeCategory type_category(const Type& type) {
+    switch (type.kind()) {
+        case Type::Kind::kVector_Kind:
+        case Type::Kind::kMatrix_Kind:
+            return type_category(type.componentType());  // vectors/matrices: use the component
+        default:
+            if (type.fName == "bool") {
+                return TypeCategory::kBool;
+            } else if (type.fName == "int" ||
+                       type.fName == "short" ||
+                       type.fName == "$intLiteral") {
+                return TypeCategory::kSigned;
+            } else if (type.fName == "uint" ||
+                       type.fName == "ushort") {
+                return TypeCategory::kUnsigned;
+            } else {  // all remaining names are reported as float; the assert checks the guess
+                SkASSERT(type.fName == "float" ||
+                         type.fName == "half" ||
+                         type.fName == "$floatLiteral");
+                return TypeCategory::kFloat;
+            }
+    }
+}
+
+
+ByteCodeGenerator::ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
+                  ByteCode* output)
     : INHERITED(program, errors, nullptr)
+    , fContext(*context)
     , fOutput(output)
     , fIntrinsics {
-        // "Normal" intrinsics are all $genType f($genType), mapped to a single instruction
-        { "cos",     ByteCode::Instruction::kCos },
-        { "sin",     ByteCode::Instruction::kSin },
-        { "sqrt",    ByteCode::Instruction::kSqrt },
-        { "tan",     ByteCode::Instruction::kTan },
-
-        // Special intrinsics have other signatures, or non-standard code-gen
+        { "cos",     ByteCodeInstruction::kCos },
         { "dot",     SpecialIntrinsic::kDot },
-        { "inverse", SpecialIntrinsic::kInverse },
-        { "print",   SpecialIntrinsic::kPrint },
-    } {}
+        { "inverse", ByteCodeInstruction::kInverse2x2 },
+        { "sin",     ByteCodeInstruction::kSin },
+        { "sqrt",    ByteCodeInstruction::kSqrt },
+        { "tan",     ByteCodeInstruction::kTan },
+      } {}
+
 
 int ByteCodeGenerator::SlotCount(const Type& type) {
     if (type.kind() == Type::kOther_Kind) {
@@ -54,74 +80,89 @@
 static inline bool is_in(const SkSL::Variable& var) {
     return var.fModifiers.fFlags & Modifiers::kIn_Flag;
 }
-ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
-    // given that we seldom have more than a couple of variables, linear search is probably the most
-    // efficient way to handle lookups
-    switch (var.fStorage) {
-        case Variable::kLocal_Storage: {
-            for (int i = fLocals.size() - 1; i >= 0; --i) {
-                if (fLocals[i] == &var) {
-                    return ByteCode::Pointer{(uint16_t) (i + fParameterCount)};
-                }
-            }
-            int result = fLocals.size() + fParameterCount;
-            fLocals.push_back(&var);
-            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
-                fLocals.push_back(nullptr);
-            }
-            SkASSERT(result <= ByteCode::kPointerMax);
-            return ByteCode::Pointer{(uint16_t) result};
+
+void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
+    if (type.kind() == Type::kOther_Kind) {
+        return;
+    } else if (type.kind() == Type::kStruct_Kind) {
+        for (const auto& f : type.fields()) {
+            this->gatherUniforms(*f.fType, name + "." + f.fName);
         }
-        case Variable::kParameter_Storage: {
-            int offset = 0;
-            for (const auto& p : fFunction->fDeclaration.fParameters) {
-                if (p == &var) {
-                    SkASSERT(offset <= ByteCode::kPointerMax);
-                    return ByteCode::Pointer{(uint16_t) offset};
-                }
-                offset += SlotCount(p->fType);
-            }
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
+    } else if (type.kind() == Type::kArray_Kind) {
+        for (int i = 0; i < type.columns(); ++i) {
+            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
         }
-        case Variable::kGlobal_Storage: {
-            if (is_in(var)) {
-                // If you see this error, it means the program is using raw 'in' variables. You
-                // should either specialize the program (Compiler::specialize) to bake in the final
-                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
-                // 'uniform' instead?).
-                fErrors.error(var.fOffset,
-                              "'in' variable is not specialized or has unsupported type");
-                return ByteCode::Pointer{0};
+    } else {
+        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
+                                       fOutput->fUniformSlotCount });
+        fOutput->fUniformSlotCount += type.columns() * type.rows();
+    }
+}
+
+bool ByteCodeGenerator::generateCode() {
+    for (const auto& e : fProgram) {
+        switch (e.fKind) {
+            case ProgramElement::kFunction_Kind: {
+                std::unique_ptr<ByteCodeFunction> f = this->writeFunction((FunctionDefinition&) e);
+                if (!f) {
+                    return false;
+                }
+                fOutput->fFunctions.push_back(std::move(f));
+                fFunctions.push_back(&(FunctionDefinition&)e);
+                break;
             }
-            bool isUniform = is_uniform(var);
-            int offset = isUniform ? fOutput->getGlobalSlotCount() : 0;
-            for (const auto& e : fProgram) {
-                if (e.fKind == ProgramElement::kVar_Kind) {
-                    VarDeclarations& decl = (VarDeclarations&) e;
-                    for (const auto& v : decl.fVars) {
-                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                            continue;
-                        }
-                        if (isUniform != is_uniform(*declVar)) {
-                            continue;
-                        }
-                        if (declVar == &var) {
-                            SkASSERT(offset <= ByteCode::kPointerMax);
-                            return ByteCode::Pointer{(uint16_t) offset};
-                        }
-                        offset += SlotCount(declVar->fType);
+            case ProgramElement::kVar_Kind: {
+                VarDeclarations& decl = (VarDeclarations&) e;
+                for (const auto& v : decl.fVars) {
+                    const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                    if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                        continue;
+                    }
+                    if (is_uniform(*declVar)) {
+                        this->gatherUniforms(declVar->fType, declVar->fName);
+                    } else {
+                        fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
                     }
                 }
+                break;
             }
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
+            default:
+                ; // ignore
         }
-        default:
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
     }
+    return 0 == fErrors.errorCount();
+}
+
+std::unique_ptr<ByteCodeFunction> ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
+    fFunction = &f;  // compiles one function definition into a freshly created ByteCodeFunction
+    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
+    fParameterCount = result->fParameterCount;
+    fLoopCount = fMaxLoopCount = 0;
+    fConditionCount = fMaxConditionCount = 0;
+    fStackCount = fMaxStackCount = 0;
+    fCode = &result->fCode;  // subsequent writes append to this function's code
+    // Emit the body. If no errors were reported, every running counter must be back at zero.
+    this->writeStatement(*f.fBody);
+    if (0 == fErrors.errorCount()) {
+        SkASSERT(fLoopCount == 0);
+        SkASSERT(fConditionCount == 0);
+        SkASSERT(fStackCount == 0);
+    }
+    this->write(ByteCodeInstruction::kReturn, 0);  // always terminate the code with a kReturn
+    this->write8(0);  // NOTE(review): presumably kReturn's value-count operand (0) - confirm
+    // Copy the per-function totals tracked during code generation into the result.
+    result->fLocalCount     = fLocals.size();
+    result->fConditionCount = fMaxConditionCount;
+    result->fLoopCount      = fMaxLoopCount;
+    result->fStackCount     = fMaxStackCount;
+    // fReturnCount is only assigned here when the function returns something other than void.
+    const Type& returnType = f.fDeclaration.fReturnType;
+    if (returnType != *fContext.fVoid_Type) {
+        result->fReturnCount = SlotCount(returnType);
+    }
+    fLocals.clear();  // reset per-function state so the next function starts clean
+    fFunction = nullptr;
+    return result;
 }
 
 // A "simple" Swizzle is based on a variable (or a compound variable like a struct or array), and
@@ -145,439 +186,520 @@
     return true;
 }
 
+int ByteCodeGenerator::StackUsage(ByteCodeInstruction inst, int count_) {
+    // Returns the change in stack depth caused by 'inst'. The CountValue wrapper ensures that we
+    // use count iff we're passed a non-default value: most instructions have an implicit count,
+    // so callers shouldn't pass one. The asserts keep callers from thinking they're supplying
+    // useful information in that case, or from omitting the count for the ops that need one.
+    struct CountValue {
+        operator int() {  // reading the count records that it was used
+            SkASSERT(val != ByteCodeGenerator::kUnusedStackCount);
+            SkDEBUGCODE(used = true);
+            return val;
+        }
+        ~CountValue() {  // a count that was supplied must have been read by the switch below
+            SkASSERT(used || val == ByteCodeGenerator::kUnusedStackCount);
+        }
+        int val;
+        SkDEBUGCODE(bool used = false;)
+    } count = { count_ };
+
+    switch (inst) {
+        // Unary functions/operators that don't change stack depth at all:
+#define VECTOR_UNARY_OP(base)                \
+        case ByteCodeInstruction::base:      \
+        case ByteCodeInstruction::base ## 2: \
+        case ByteCodeInstruction::base ## 3: \
+        case ByteCodeInstruction::base ## 4: \
+            return 0;
+
+        VECTOR_UNARY_OP(kConvertFtoI)
+        VECTOR_UNARY_OP(kConvertStoF)
+        VECTOR_UNARY_OP(kConvertUtoF)
+
+        VECTOR_UNARY_OP(kCos)
+        VECTOR_UNARY_OP(kSin)
+        VECTOR_UNARY_OP(kSqrt)
+        VECTOR_UNARY_OP(kTan)
+
+        VECTOR_UNARY_OP(kNegateF)
+        VECTOR_UNARY_OP(kNegateI)
+
+        case ByteCodeInstruction::kInverse2x2:
+        case ByteCodeInstruction::kInverse3x3:
+        case ByteCodeInstruction::kInverse4x4: return 0;
+
+        case ByteCodeInstruction::kClampIndex: return 0;
+        case ByteCodeInstruction::kNotB: return 0;
+        case ByteCodeInstruction::kNegateFN: return 0;
+        case ByteCodeInstruction::kShiftLeft: return 0;
+        case ByteCodeInstruction::kShiftRightS: return 0;
+        case ByteCodeInstruction::kShiftRightU: return 0;
+
+#undef VECTOR_UNARY_OP
+
+        // Binary functions/operators that do a 2 -> 1 reduction (possibly N times)
+#define VECTOR_BINARY_OP(base)                          \
+        case ByteCodeInstruction::base:      return -1; \
+        case ByteCodeInstruction::base ## 2: return -2; \
+        case ByteCodeInstruction::base ## 3: return -3; \
+        case ByteCodeInstruction::base ## 4: return -4;
+
+#define VECTOR_MATRIX_BINARY_OP(base)                   \
+        VECTOR_BINARY_OP(base)                          \
+        case ByteCodeInstruction::base ## N: return -count;
+
+        case ByteCodeInstruction::kAndB: return -1;
+        case ByteCodeInstruction::kOrB:  return -1;
+        case ByteCodeInstruction::kXorB: return -1;
+
+        VECTOR_BINARY_OP(kAddI)
+        VECTOR_MATRIX_BINARY_OP(kAddF)
+
+        VECTOR_BINARY_OP(kCompareIEQ)
+        VECTOR_MATRIX_BINARY_OP(kCompareFEQ)
+        VECTOR_BINARY_OP(kCompareINEQ)
+        VECTOR_MATRIX_BINARY_OP(kCompareFNEQ)
+        VECTOR_BINARY_OP(kCompareSGT)
+        VECTOR_BINARY_OP(kCompareUGT)
+        VECTOR_BINARY_OP(kCompareFGT)
+        VECTOR_BINARY_OP(kCompareSGTEQ)
+        VECTOR_BINARY_OP(kCompareUGTEQ)
+        VECTOR_BINARY_OP(kCompareFGTEQ)
+        VECTOR_BINARY_OP(kCompareSLT)
+        VECTOR_BINARY_OP(kCompareULT)
+        VECTOR_BINARY_OP(kCompareFLT)
+        VECTOR_BINARY_OP(kCompareSLTEQ)
+        VECTOR_BINARY_OP(kCompareULTEQ)
+        VECTOR_BINARY_OP(kCompareFLTEQ)
+
+        VECTOR_BINARY_OP(kDivideS)
+        VECTOR_BINARY_OP(kDivideU)
+        VECTOR_MATRIX_BINARY_OP(kDivideF)
+        VECTOR_BINARY_OP(kMultiplyI)
+        VECTOR_MATRIX_BINARY_OP(kMultiplyF)
+        VECTOR_BINARY_OP(kRemainderF)
+        VECTOR_BINARY_OP(kRemainderS)
+        VECTOR_BINARY_OP(kRemainderU)
+        VECTOR_BINARY_OP(kSubtractI)
+        VECTOR_MATRIX_BINARY_OP(kSubtractF)
+
+#undef VECTOR_BINARY_OP
+#undef VECTOR_MATRIX_BINARY_OP
+
+        // Ops that push or load data to grow the stack:
+        case ByteCodeInstruction::kDup:
+        case ByteCodeInstruction::kLoad:
+        case ByteCodeInstruction::kLoadGlobal:
+        case ByteCodeInstruction::kLoadUniform:
+        case ByteCodeInstruction::kReadExternal:
+        case ByteCodeInstruction::kPushImmediate:
+            return 1;
+
+        case ByteCodeInstruction::kDup2:
+        case ByteCodeInstruction::kLoad2:
+        case ByteCodeInstruction::kLoadGlobal2:
+        case ByteCodeInstruction::kLoadUniform2:
+        case ByteCodeInstruction::kReadExternal2:
+            return 2;
+
+        case ByteCodeInstruction::kDup3:
+        case ByteCodeInstruction::kLoad3:
+        case ByteCodeInstruction::kLoadGlobal3:
+        case ByteCodeInstruction::kLoadUniform3:
+        case ByteCodeInstruction::kReadExternal3:
+            return 3;
+
+        case ByteCodeInstruction::kDup4:
+        case ByteCodeInstruction::kLoad4:
+        case ByteCodeInstruction::kLoadGlobal4:
+        case ByteCodeInstruction::kLoadUniform4:
+        case ByteCodeInstruction::kReadExternal4:
+            return 4;
+
+        case ByteCodeInstruction::kDupN:
+        case ByteCodeInstruction::kLoadSwizzle:
+        case ByteCodeInstruction::kLoadSwizzleGlobal:
+        case ByteCodeInstruction::kLoadSwizzleUniform:
+            return count;
+
+        // Pushes 'count' values, minus one for the 'address' that's consumed first
+        case ByteCodeInstruction::kLoadExtended:
+        case ByteCodeInstruction::kLoadExtendedGlobal:
+        case ByteCodeInstruction::kLoadExtendedUniform:
+            return count - 1;
+
+        // Ops that pop or store data to shrink the stack:
+        case ByteCodeInstruction::kPop:
+        case ByteCodeInstruction::kStore:
+        case ByteCodeInstruction::kStoreGlobal:
+        case ByteCodeInstruction::kWriteExternal:
+            return -1;
+
+        case ByteCodeInstruction::kPop2:
+        case ByteCodeInstruction::kStore2:
+        case ByteCodeInstruction::kStoreGlobal2:
+        case ByteCodeInstruction::kWriteExternal2:
+            return -2;
+
+        case ByteCodeInstruction::kPop3:
+        case ByteCodeInstruction::kStore3:
+        case ByteCodeInstruction::kStoreGlobal3:
+        case ByteCodeInstruction::kWriteExternal3:
+            return -3;
+
+        case ByteCodeInstruction::kPop4:
+        case ByteCodeInstruction::kStore4:
+        case ByteCodeInstruction::kStoreGlobal4:
+        case ByteCodeInstruction::kWriteExternal4:
+            return -4;
+
+        case ByteCodeInstruction::kPopN:
+        case ByteCodeInstruction::kStoreSwizzle:
+        case ByteCodeInstruction::kStoreSwizzleGlobal:
+            return -count;
+
+        // Consumes 'count' values, plus one for the 'address'
+        case ByteCodeInstruction::kStoreExtended:
+        case ByteCodeInstruction::kStoreExtendedGlobal:
+        case ByteCodeInstruction::kStoreSwizzleIndirect:
+        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal:
+            return -count - 1;
+
+        // Strange ops where the caller computes the delta for us:
+        case ByteCodeInstruction::kCallExternal:
+        case ByteCodeInstruction::kMatrixToMatrix:
+        case ByteCodeInstruction::kMatrixMultiply:
+        case ByteCodeInstruction::kReserve:
+        case ByteCodeInstruction::kReturn:
+        case ByteCodeInstruction::kScalarToMatrix:
+        case ByteCodeInstruction::kSwizzle:
+            return count;
+
+        // Miscellaneous
+
+        // kCall is net-zero. Max stack depth is adjusted in writeFunctionCall.
+        case ByteCodeInstruction::kCall:             return 0;
+        case ByteCodeInstruction::kBranch:           return 0;
+        case ByteCodeInstruction::kBranchIfAllFalse: return 0;
+
+        case ByteCodeInstruction::kMaskPush:         return -1;
+        case ByteCodeInstruction::kMaskPop:          return 0;
+        case ByteCodeInstruction::kMaskNegate:       return 0;
+        case ByteCodeInstruction::kMaskBlend:        return -count;
+
+        case ByteCodeInstruction::kLoopBegin:        return 0;
+        case ByteCodeInstruction::kLoopNext:         return 0;
+        case ByteCodeInstruction::kLoopMask:         return -1;
+        case ByteCodeInstruction::kLoopEnd:          return 0;
+        case ByteCodeInstruction::kLoopBreak:        return 0;
+        case ByteCodeInstruction::kLoopContinue:     return 0;
+
+        default:
+            ABORT("unsupported instruction %d\n", (int)inst);
+            return 0;
+    }
+}
+
+ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
+    // Resolves a variable to its slot and storage class. We seldom have more than a couple of
+    // variables, so a linear search is probably the most efficient way to handle lookups.
+    switch (var.fStorage) {
+        case Variable::kLocal_Storage: {
+            for (int i = fLocals.size() - 1; i >= 0; --i) {  // already assigned a slot?
+                if (fLocals[i] == &var) {
+                    SkASSERT(fParameterCount + i <= 255);
+                    return { fParameterCount + i, Storage::kLocal };
+                }
+            }
+            int result = fParameterCount + fLocals.size();  // first sighting: append new slots
+            fLocals.push_back(&var);
+            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
+                fLocals.push_back(nullptr);  // placeholder entry for each additional slot
+            }
+            SkASSERT(result <= 255);  // NOTE(review): presumably slots are one byte - confirm
+            return { result, Storage::kLocal };
+        }
+        case Variable::kParameter_Storage: {
+            int offset = 0;  // parameters occupy the slots below fParameterCount, in order
+            for (const auto& p : fFunction->fDeclaration.fParameters) {
+                if (p == &var) {
+                    SkASSERT(offset <= 255);
+                    return { offset, Storage::kLocal };
+                }
+                offset += SlotCount(p->fType);
+            }
+            SkASSERT(false);  // var is not a parameter of the current function
+            return Location::MakeInvalid();
+        }
+        case Variable::kGlobal_Storage: {
+            if (is_in(var)) {
+                // If you see this error, it means the program is using raw 'in' variables. You
+                // should either specialize the program (Compiler::specialize) to bake in the final
+                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
+                // 'uniform' instead?).
+                fErrors.error(var.fOffset,
+                              "'in' variable is not specialized or has unsupported type");
+                return Location::MakeInvalid();
+            }
+            int offset = 0;  // counted only among declarations of the same kind (uniform or not)
+            bool isUniform = is_uniform(var);
+            for (const auto& e : fProgram) {
+                if (e.fKind == ProgramElement::kVar_Kind) {
+                    VarDeclarations& decl = (VarDeclarations&) e;
+                    for (const auto& v : decl.fVars) {
+                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                            continue;  // builtins and 'in' variables do not advance the offset
+                        }
+                        if (isUniform != is_uniform(*declVar)) {
+                            continue;  // uniforms and other globals are numbered separately
+                        }
+                        if (declVar == &var) {
+                            SkASSERT(offset <= 255);
+                            return  { offset, isUniform ? Storage::kUniform : Storage::kGlobal };
+                        }
+                        offset += SlotCount(declVar->fType);
+                    }
+                }
+            }
+            SkASSERT(false);  // var was not found among the program's global declarations
+            return Location::MakeInvalid();
+        }
+        default:
+            SkASSERT(false);
+            return Location::MakeInvalid();
+    }
+}
+
 ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Expression& expr) {
     switch (expr.fKind) {
         case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&) expr;
-            Location result = this->getLocation(*f.fBase);
+            const FieldAccess& f = (const FieldAccess&)expr;
+            Location baseLoc = this->getLocation(*f.fBase);
             int offset = 0;
             for (int i = 0; i < f.fFieldIndex; ++i) {
                 offset += SlotCount(*f.fBase->fType.fields()[i].fType);
             }
-            return result.offset(*this, offset);
-        }
-        case Expression::kIndex_Kind: {
-            const IndexExpression& idx = (const IndexExpression&) expr;
-            int stride = SlotCount(idx.fType);
-            int length = idx.fBase->fType.columns();
-            Location result = this->getLocation(*idx.fBase);
-            if (idx.fIndex->isConstant()) {
-                int64_t index = idx.fIndex->getConstantInt();
-                if (index < 0 || index >= length) {
-                    fErrors.error(idx.fIndex->fOffset, "Array index out of bounds");
-                    return result;
+            if (baseLoc.isOnStack()) {
+                if (offset != 0) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(offset);
+                    this->write(ByteCodeInstruction::kAddI);
+                    this->write8(1);
                 }
-                return result.offset(*this, index * stride);
+                return baseLoc;
             } else {
-                ByteCode::Register index = this->next(1);
-                this->writeExpression(*idx.fIndex, index);
-                this->write(ByteCode::Instruction::kBoundsCheck);
-                this->write(index);
-                this->write(length);
-                ByteCode::Register imm = this->next(1);
-                this->write(ByteCode::Instruction::kImmediate);
-                this->write(imm);
-                this->write(ByteCode::Immediate{stride});
-                ByteCode::Register offset = this->next(1);
-                this->write(ByteCode::Instruction::kMultiplyI);
-                this->write(offset);
-                this->write(index);
-                this->write(imm);
-                return result.offset(*this, offset);
+                return baseLoc + offset;
             }
         }
+        case Expression::kIndex_Kind: {
+            const IndexExpression& i = (const IndexExpression&)expr;
+            int stride = SlotCount(i.fType);
+            int length = i.fBase->fType.columns();
+            SkASSERT(length <= 255);
+            int offset = -1;
+            if (i.fIndex->isConstant()) {
+                int64_t index = i.fIndex->getConstantInt();
+                if (index < 0 || index >= length) {
+                    fErrors.error(i.fIndex->fOffset, "Array index out of bounds.");
+                    return Location::MakeInvalid();
+                }
+                offset = index * stride;
+            } else {
+                if (i.fIndex->hasSideEffects()) {
+                    // Having a side-effect in an indexer is technically safe for an rvalue,
+                    // but with lvalues we have to evaluate the indexer twice, so make it an error.
+                    fErrors.error(i.fIndex->fOffset,
+                            "Index expressions with side-effects not supported in byte code.");
+                    return Location::MakeInvalid();
+                }
+                this->writeExpression(*i.fIndex);
+                this->write(ByteCodeInstruction::kClampIndex);
+                this->write8(length);
+                if (stride != 1) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(stride);
+                    this->write(ByteCodeInstruction::kMultiplyI);
+                    this->write8(1);
+                }
+            }
+            Location baseLoc = this->getLocation(*i.fBase);
+
+            // Are both components known statically?
+            if (!baseLoc.isOnStack() && offset >= 0) {
+                return baseLoc + offset;
+            }
+
+            // At least one component is dynamic (and on the stack).
+
+            // If the other component is zero, we're done
+            if (baseLoc.fSlot == 0 || offset == 0) {
+                return baseLoc.makeOnStack();
+            }
+
+            // Push the non-dynamic component (if any) to the stack, then add the two
+            if (!baseLoc.isOnStack()) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(baseLoc.fSlot);
+            }
+            if (offset >= 0) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(offset);
+            }
+            this->write(ByteCodeInstruction::kAddI);
+            this->write8(1);
+            return baseLoc.makeOnStack();
+        }
         case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) expr;
+            const Swizzle& s = (const Swizzle&)expr;
             SkASSERT(swizzle_is_simple(s));
-            return this->getLocation(*s.fBase).offset(*this, s.fComponents[0]);
+            Location baseLoc = this->getLocation(*s.fBase);
+            int offset = s.fComponents[0];
+            if (baseLoc.isOnStack()) {
+                if (offset != 0) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(offset);
+                    this->write(ByteCodeInstruction::kAddI);
+                    this->write8(1);
+                }
+                return baseLoc;
+            } else {
+                return baseLoc + offset;
+            }
         }
         case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&) expr).fVariable;
+            const Variable& var = ((const VariableReference&)expr).fVariable;
             return this->getLocation(var);
         }
         default:
             SkASSERT(false);
-            return ByteCode::Pointer{0};
+            return Location::MakeInvalid();
     }
 }
 
-Variable::Storage ByteCodeGenerator::getStorage(const Expression& expr) {
-    switch (expr.fKind) {
-        case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&) expr;
-            return this->getStorage(*f.fBase);
-        }
-        case Expression::kIndex_Kind: {
-            const IndexExpression& idx = (const IndexExpression&) expr;
-            return this->getStorage(*idx.fBase);
-        }
-        case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) expr;
-            return this->getStorage(*s.fBase);
-        }
-        case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&) expr).fVariable;
-            return var.fStorage;
-        }
-        default:
-            SkASSERT(false);
-            return Variable::kLocal_Storage;
-    }
+void ByteCodeGenerator::write8(uint8_t b) {
+    fCode->push_back(b);
 }
 
-ByteCode::Instruction ByteCodeGenerator::getLoadInstruction(ByteCodeGenerator::Location location,
-                                                            Variable::Storage storage) {
-    switch (storage) {
-        case Variable::kGlobal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoad;
-            }
-        case Variable::kParameter_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadParameterDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadParameter;
-            }
-        case Variable::kLocal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadStackDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadStack;
-            }
-        default:
-            break;
-    }
-    SkASSERT(false);
-    return ByteCode::Instruction::kNop;
+void ByteCodeGenerator::write16(uint16_t i) {
+    size_t n = fCode->size();
+    fCode->resize(n+2);
+    memcpy(fCode->data() + n, &i, 2);
 }
 
-ByteCode::Instruction ByteCodeGenerator::getStoreInstruction(ByteCodeGenerator::Location location,
-                                                             Variable::Storage storage) {
-    switch (storage) {
-        case Variable::kGlobal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStore;
-            }
-        case Variable::kParameter_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreParameterDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStoreParameter;
-            }
-        case Variable::kLocal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreStackDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStoreStack;
-            }
-        default:
-            break;
-    }
-    SkASSERT(false);
-    return ByteCode::Instruction::kNop;
+void ByteCodeGenerator::write32(uint32_t i) {  // Appends i to *fCode as four raw bytes.
+    size_t n = fCode->size();  // offset at which the new bytes will start
+    fCode->resize(n+4);
+    memcpy(fCode->data() + n, &i, 4);  // raw copy: bytes keep the host's in-memory order
 }
 
-#define VEC(inst) ((ByteCode::Instruction) ((uint16_t) inst + 1))
+void ByteCodeGenerator::write(ByteCodeInstruction i, int count) {  // Appends one opcode.
+    switch (i) {  // a few opcodes first call an enter/exit hook on the generator
+        case ByteCodeInstruction::kLoopBegin: this->enterLoop();      break;
+        case ByteCodeInstruction::kLoopEnd:   this->exitLoop();       break;
 
-class ByteCodeSimpleLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeSimpleLValue(ByteCodeGenerator* generator, ByteCodeGenerator::Location location,
-                         int count, ByteCode::Instruction load, ByteCode::Instruction store)
-        : INHERITED(*generator)
-        , fLocation(location)
-        , fCount((uint8_t) count)
-        , fLoad(load)
-        , fStore(store) {}
-
-    void load(ByteCode::Register result) override {
-        fGenerator.write(fLoad, fCount);
-        fGenerator.write(result);
-        fGenerator.write(fLocation);
+        case ByteCodeInstruction::kMaskPush:  this->enterCondition(); break;
+        case ByteCodeInstruction::kMaskPop:  // falls through: same hook as kMaskBlend
+        case ByteCodeInstruction::kMaskBlend: this->exitCondition();  break;
+        default: /* Do nothing */ break;
     }
-
-    void store(ByteCode::Register src) override {
-        fGenerator.write(fStore, fCount);
-        fGenerator.write(fLocation);
-        fGenerator.write(src);
-    }
-
-private:
-    ByteCodeGenerator::Location fLocation;
-
-    uint8_t fCount;
-
-    ByteCode::Instruction fLoad;
-
-    ByteCode::Instruction fStore;
-
-    typedef ByteCodeGenerator::LValue INHERITED;
-};
-
-class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle* swizzle)
-        : INHERITED(*generator)
-        , fSwizzle(*swizzle) {}
-
-    void load(ByteCode::Register result) override {
-        fGenerator.writeSwizzle(fSwizzle, result);
-    }
-
-    void store(ByteCode::Register src) override {
-        ByteCodeGenerator::Location target = fGenerator.getLocation(*fSwizzle.fBase);
-        ByteCode::Instruction inst = fGenerator.getStoreInstruction(
-                                                            target,
-                                                            fGenerator.getStorage(*fSwizzle.fBase));
-        for (size_t i = 0; i < fSwizzle.fComponents.size(); ++i) {
-            ByteCodeGenerator::Location final = target.offset(fGenerator, fSwizzle.fComponents[i]);
-            fGenerator.write(inst);
-            fGenerator.write(final);
-            fGenerator.write(src + i);
-        }
-    }
-
-private:
-    const Swizzle& fSwizzle;
-
-    typedef ByteCodeGenerator::LValue INHERITED;
-};
-
-class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
-        : INHERITED(*generator)
-        , fIndex(index)
-        , fSlotCount(ByteCodeGenerator::SlotCount(value.type())) {
-        SkASSERT(fSlotCount <= 4);
-    }
-
-    void load(ByteCode::Register result) override {
-        fGenerator.write(ByteCode::Instruction::kReadExternal);
-        fGenerator.write(result);
-        fGenerator.write((uint8_t) fSlotCount);
-        fGenerator.write((uint8_t) fIndex);
-    }
-
-    void store(ByteCode::Register src) override {
-        fGenerator.write(ByteCode::Instruction::kWriteExternal);
-        fGenerator.write((uint8_t) fIndex);
-        fGenerator.write((uint8_t) fSlotCount);
-        fGenerator.write(src);
-    }
-
-private:
-    typedef LValue INHERITED;
-
-    int fIndex;
-
-    int fSlotCount;
-};
-
-std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& expr) {
-    switch (expr.fKind) {
-        case Expression::kExternalValue_Kind: {
-            ExternalValue* value = ((ExternalValueReference&) expr).fValue;
-            int index = fOutput->fExternalValues.size();
-            fOutput->fExternalValues.push_back(value);
-            SkASSERT(index <= 255);
-            return std::unique_ptr<LValue>(new ByteCodeExternalValueLValue(this, *value, index));
-        }
-        case Expression::kFieldAccess_Kind:
-        case Expression::kIndex_Kind:
-        case Expression::kVariableReference_Kind: {
-            Location location = this->getLocation(expr);
-            Variable::Storage storage = this->getStorage(expr);
-            ByteCode::Instruction loadInst = this->getLoadInstruction(location, storage);
-            ByteCode::Instruction storeInst = this->getStoreInstruction(location, storage);
-            return std::unique_ptr<LValue>(new ByteCodeSimpleLValue(this, location,
-                                                                    SlotCount(expr.fType),
-                                                                    loadInst, storeInst));
-        }
-        case Expression::kSwizzle_Kind:
-            return std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, &(Swizzle&) expr));
-        default:
-            ABORT("unsupported lvalue\n");
-    }
+    instruction val = (instruction) i;  // the opcode is stored at the width of 'instruction'
+    size_t n = fCode->size();  // offset at which the opcode bytes will start
+    fCode->resize(n + sizeof(val));
+    memcpy(fCode->data() + n, &val, sizeof(val));
+    fStackCount += StackUsage(i, count);  // count is not emitted here, only passed to StackUsage
+    fMaxStackCount = std::max(fMaxStackCount, fStackCount);  // keep the running maximum
 }
 
-ByteCode::Register ByteCodeGenerator::next(int count) {
-    SkASSERT(fNextRegister + count <= ByteCode::kRegisterMax);
-    fNextRegister += count;
-    return ByteCode::Register{(uint16_t) (fNextRegister - count)};
+static ByteCodeInstruction vector_instruction(ByteCodeInstruction base, int count) {
+    SkASSERT(count >= 1 && count <= 4);  // selects an opcode by component count, 1 through 4
+    return ((ByteCodeInstruction) ((int) base + 1 - count));  // 1 -> base, ..., 4 -> base - 3
 }
 
-static TypeCategory type_category(const Type& type) {
-    switch (type.kind()) {
-        case Type::Kind::kVector_Kind:
-        case Type::Kind::kMatrix_Kind:
-            return type_category(type.componentType());
-        default:
-            String name = type.displayName();
-            if (name == "bool") {
-                return TypeCategory::kBool;
-            } else if (name == "int" || name == "short") {
-                return TypeCategory::kSigned;
-            } else if (name == "uint" || name == "ushort") {
-                return TypeCategory::kUnsigned;
-            } else {
-                SkASSERT(name == "float" || name == "half");
-                return TypeCategory::kFloat;
-            }
-            ABORT("unsupported type: %s\n", name.c_str());
-    }
-}
-
-void ByteCodeGenerator::write(ByteCode::Instruction inst, int count) {
-    SkASSERT(count <= 255);
-    if (count > 1) {
-        this->write(VEC(inst));
-        this->write((uint8_t) count);
-    }
-    else {
-        this->write(inst);
-    }
-}
-
-void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCode::Instruction s,
-                                              ByteCode::Instruction u, ByteCode::Instruction f) {
+void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCodeInstruction s,
+                                              ByteCodeInstruction u, ByteCodeInstruction f,
+                                              int count, bool writeCount) {
     switch (type_category(type)) {
         case TypeCategory::kSigned:
-            this->write(s);
+            this->write(vector_instruction(s, count));  // NOTE(review): no count > 4 path here
             break;
         case TypeCategory::kUnsigned:
-            this->write(u);
+            this->write(vector_instruction(u, count));  // NOTE(review): no count > 4 path here
             break;
         case TypeCategory::kFloat: {
-            this->write(f);
+            if (count > 4) {  // beyond vector_instruction's 1..4 range: emit opcode f + 1
+                this->write((ByteCodeInstruction)((int)f + 1), count);
+            } else {  // 1..4 components: emit the opcode chosen by vector_instruction
+                this->write(vector_instruction(f, count));
+            }
             break;
         }
         default:
             SkASSERT(false);
     }
-}
-
-void ByteCodeGenerator::writeVectorBinaryInstruction(const Type& operandType,
-                                                     ByteCode::Register left,
-                                                     ByteCode::Register right,
-                                                     ByteCode::Instruction s,
-                                                     ByteCode::Instruction u,
-                                                     ByteCode::Instruction f,
-                                                     ByteCode::Register result) {
-    uint8_t count = (uint8_t) SlotCount(operandType);
-    if (count == 1) {
-        this->writeTypedInstruction(operandType, s, u, f);
-    }
-    else {
-        this->writeTypedInstruction(operandType, VEC(s), VEC(u), VEC(f));
-        this->write(count);
-    }
-    this->write(result);
-    this->write(left);
-    this->write(right);
-}
-
-void ByteCodeGenerator::writeBinaryInstruction(const Type& operandType,
-                                               ByteCode::Register left,
-                                               ByteCode::Register right,
-                                               ByteCode::Instruction s,
-                                               ByteCode::Instruction u,
-                                               ByteCode::Instruction f,
-                                               ByteCode::Register result) {
-    for (int i = 0; i < SlotCount(operandType); ++i) {
-        this->writeTypedInstruction(operandType, s, u, f);
-        this->write(result + i);
-        this->write(left + i);
-        this->write(right + i);
+    if (writeCount) {  // caller asked for count to follow the opcode
+        this->write8(count);  // emitted via write8, after whichever opcode was written above
     }
 }
 
-void ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b,
-                                              ByteCode::Register result) {
+bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool discard) {
     if (b.fOperator == Token::Kind::EQ) {
         std::unique_ptr<LValue> lvalue = this->getLValue(*b.fLeft);
-        this->writeExpression(*b.fRight, result);
-        lvalue->store(result);
-        return;
+        this->writeExpression(*b.fRight);
+        lvalue->store(discard);
+        discard = false;
+        return discard;
     }
     const Type& lType = b.fLeft->fType;
     const Type& rType = b.fRight->fType;
     bool lVecOrMtx = (lType.kind() == Type::kVector_Kind || lType.kind() == Type::kMatrix_Kind);
     bool rVecOrMtx = (rType.kind() == Type::kVector_Kind || rType.kind() == Type::kMatrix_Kind);
-    const Type* operandType;
-    if (!lVecOrMtx && rVecOrMtx) {
-        operandType = &rType;
-    } else {
-        operandType = &lType;
-    }
     Token::Kind op;
     std::unique_ptr<LValue> lvalue;
-    ByteCode::Register left;
-    switch (b.fOperator) {
-        case Token::Kind::LOGICALAND:
-        case Token::Kind::LOGICALANDEQ:
-        case Token::Kind::LOGICALOR:
-        case Token::Kind::LOGICALOREQ:
-            left = result;
-            break;
-        default:
-            left = this->next(SlotCount(*operandType));
-    }
     if (is_assignment(b.fOperator)) {
         lvalue = this->getLValue(*b.fLeft);
-        lvalue->load(left);
+        lvalue->load();
         op = remove_assignment(b.fOperator);
     } else {
-        this->writeExpression(*b.fLeft, left);
+        this->writeExpression(*b.fLeft);
         op = b.fOperator;
         if (!lVecOrMtx && rVecOrMtx) {
-            this->write(ByteCode::Instruction::kSplat);
-            this->write((uint8_t) (SlotCount(rType) - 1));
-            this->write(left + 1);
-            this->write(left);
+            for (int i = SlotCount(rType); i > 1; --i) {
+                this->write(ByteCodeInstruction::kDup);
+                this->write8(1);
+            }
         }
     }
-    SkDEBUGCODE(TypeCategory tc = type_category(lType));
     int count = std::max(SlotCount(lType), SlotCount(rType));
+    SkDEBUGCODE(TypeCategory tc = type_category(lType));
     switch (op) {
         case Token::Kind::LOGICALAND: {
-            SkASSERT(left.fIndex == result.fIndex);
-            this->write(ByteCode::Instruction::kMaskPush);
-            ++fConditionCount;
-            this->write(left);
-            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+            this->write(ByteCodeInstruction::kMaskPush);
+            this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
-            SkASSERT(SlotCount(b.fRight->fType) == 1);
-            ByteCode::Register right = this->next(1);
-            this->writeExpression(*b.fRight, right);
-            this->write(ByteCode::Instruction::kAnd);
-            this->write(result);
-            this->write(left);
-            this->write(right);
+            this->writeExpression(*b.fRight);
+            this->write(ByteCodeInstruction::kAndB);
             falseLocation.set();
-            --fConditionCount;
-            this->write(ByteCode::Instruction::kMaskPop);
-            return;
+            this->write(ByteCodeInstruction::kMaskPop);
+            return false;
         }
         case Token::Kind::LOGICALOR: {
-            SkASSERT(left.fIndex == result.fIndex);
-            ByteCode::Register mask = this->next(1);
-            this->write(ByteCode::Instruction::kNot);
-            this->write(mask);
-            this->write(left);
-            this->write(ByteCode::Instruction::kMaskPush);
-            ++fConditionCount;
-            this->write(mask);
-            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+            this->write(ByteCodeInstruction::kNotB);
+            this->write(ByteCodeInstruction::kMaskPush);
+            this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
-            SkASSERT(SlotCount(b.fRight->fType) == 1);
-            ByteCode::Register right = this->next(1);
-            this->writeExpression(*b.fRight, right);
-            this->write(ByteCode::Instruction::kOr);
-            this->write(result);
-            this->write(left);
-            this->write(right);
+            this->writeExpression(*b.fRight);
+            this->write(ByteCodeInstruction::kOrB);
             falseLocation.set();
-            --fConditionCount;
-            this->write(ByteCode::Instruction::kMaskPop);
-            return;
+            this->write(ByteCodeInstruction::kMaskPop);
+            return false;
         }
         case Token::Kind::SHL:
         case Token::Kind::SHR: {
@@ -585,666 +707,775 @@
                                     tc == SkSL::TypeCategory::kUnsigned));
             if (!b.fRight->isConstant()) {
                 fErrors.error(b.fRight->fOffset, "Shift amounts must be constant");
-                return;
+                return false;
             }
             int64_t shift = b.fRight->getConstantInt();
             if (shift < 0 || shift > 31) {
                 fErrors.error(b.fRight->fOffset, "Shift amount out of range");
-                return;
+                return false;
             }
 
             if (op == Token::Kind::SHL) {
-                this->write(ByteCode::Instruction::kShiftLeft);
+                this->write(ByteCodeInstruction::kShiftLeft);
             } else {
                 this->write(type_category(lType) == TypeCategory::kSigned
-                                ? ByteCode::Instruction::kShiftRightS
-                                : ByteCode::Instruction::kShiftRightU);
+                                ? ByteCodeInstruction::kShiftRightS
+                                : ByteCodeInstruction::kShiftRightU);
             }
-            this->write(result);
-            this->write(left);
-            this->write((uint8_t) shift);
-            return;
+            this->write8(shift);
+            return false;
         }
-        case Token::Kind::STAR:
-            // Special case for M*V, V*M, M*M (but not V*V!)
-            if (lType.columns() > 1 && rType.columns() > 1 &&
-                (lType.rows() > 1 || rType.rows() > 1)) {
-                ByteCode::Register right = this->next(SlotCount(rType));
-                this->writeExpression(*b.fRight, right);
-                int rCols = rType.columns(),
-                    rRows = rType.rows(),
-                    lCols = lType.columns(),
-                    lRows = lType.rows();
-                // M*V treats the vector as a column
-                if (rType.kind() == Type::kVector_Kind) {
-                    std::swap(rCols, rRows);
-                }
-                SkASSERT(lCols == rRows);
-                SkASSERT(SlotCount(b.fType) == lRows * rCols);
-                this->write(ByteCode::Instruction::kMatrixMultiply);
-                this->write(result);
-                this->write(left);
-                this->write(right);
-                this->write((uint8_t) lCols);
-                this->write((uint8_t) lRows);
-                this->write((uint8_t) rCols);
-                return;
-            }
 
         default:
             break;
     }
-    ByteCode::Register right = this->next(SlotCount(*operandType));
-    this->writeExpression(*b.fRight, right);
+    this->writeExpression(*b.fRight);
     if (lVecOrMtx && !rVecOrMtx) {
-        this->write(ByteCode::Instruction::kSplat);
-        this->write((uint8_t) (SlotCount(*operandType) - 1));
-        this->write(right + 1);
-        this->write(right);
+        for (int i = SlotCount(lType); i > 1; --i) {
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+        }
     }
-    switch (op) {
-        case Token::Kind::EQEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareEQI,
-                                         ByteCode::Instruction::kCompareEQI,
-                                         ByteCode::Instruction::kCompareEQF,
-                                         result);
-            // Collapse to a single bool
-            for (int i = 1; i < count; ++i) {
-                this->write(ByteCode::Instruction::kAnd);
-                this->write(result);
-                this->write(result);
-                this->write(result + i);
-            }
-            break;
-        case Token::Kind::GT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareGTS,
-                                         ByteCode::Instruction::kCompareGTU,
-                                         ByteCode::Instruction::kCompareGTF,
-                                         result);
-            break;
-        case Token::Kind::GTEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareGTEQS,
-                                         ByteCode::Instruction::kCompareGTEQU,
-                                         ByteCode::Instruction::kCompareGTEQF,
-                                         result);
-            break;
-        case Token::Kind::LT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareLTS,
-                                         ByteCode::Instruction::kCompareLTU,
-                                         ByteCode::Instruction::kCompareLTF,
-                                         result);
-            break;
-        case Token::Kind::LTEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareLTEQS,
-                                         ByteCode::Instruction::kCompareLTEQU,
-                                         ByteCode::Instruction::kCompareLTEQF,
-                                         result);
-            break;
-        case Token::Kind::MINUS:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kSubtractI,
-                                               ByteCode::Instruction::kSubtractI,
-                                               ByteCode::Instruction::kSubtractF,
-                                               result);
-            break;
-        case Token::Kind::NEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareNEQI,
-                                         ByteCode::Instruction::kCompareNEQI,
-                                         ByteCode::Instruction::kCompareNEQF,
-                                         result);
-            // Collapse to a single bool
-            for (int i = 1; i < count; ++i) {
-                this->write(ByteCode::Instruction::kOr);
-                this->write(result);
-                this->write(result);
-                this->write(result + i);
-            }
-            break;
-        case Token::Kind::PERCENT:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kRemainderS,
-                                               ByteCode::Instruction::kRemainderU,
-                                               ByteCode::Instruction::kRemainderF,
-                                               result);
-            break;
-        case Token::Kind::PLUS:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kAddI,
-                                               ByteCode::Instruction::kAddI,
-                                               ByteCode::Instruction::kAddF,
-                                               result);
-            break;
-        case Token::Kind::SLASH:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kDivideS,
-                                               ByteCode::Instruction::kDivideU,
-                                               ByteCode::Instruction::kDivideF,
-                                               result);
-            break;
-        case Token::Kind::STAR:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kMultiplyI,
-                                               ByteCode::Instruction::kMultiplyI,
-                                               ByteCode::Instruction::kMultiplyF,
-                                               result);
-            break;
-        case Token::Kind::LOGICALXOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kBool);
-            this->write(ByteCode::Instruction::kXor);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
+    // Special case for M*V, V*M, M*M (but not V*V!)
+    if (op == Token::Kind::STAR && lVecOrMtx && rVecOrMtx &&
+        !(lType.kind() == Type::kVector_Kind && rType.kind() == Type::kVector_Kind)) {
+        this->write(ByteCodeInstruction::kMatrixMultiply,
+                    SlotCount(b.fType) - (SlotCount(lType) + SlotCount(rType)));
+        int rCols = rType.columns(),
+            rRows = rType.rows(),
+            lCols = lType.columns(),
+            lRows = lType.rows();
+        // M*V treats the vector as a column
+        if (rType.kind() == Type::kVector_Kind) {
+            std::swap(rCols, rRows);
         }
-        case Token::Kind::BITWISEAND: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kAnd);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
+        SkASSERT(lCols == rRows);
+        SkASSERT(SlotCount(b.fType) == lRows * rCols);
+        this->write8(lCols);
+        this->write8(lRows);
+        this->write8(rCols);
+    } else {
+        switch (op) {
+            case Token::Kind::EQEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareIEQ,
+                                            ByteCodeInstruction::kCompareIEQ,
+                                            ByteCodeInstruction::kCompareFEQ,
+                                            count);
+                // Collapse to a single bool
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kAndB);
+                }
+                break;
+            case Token::Kind::GT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGT,
+                                            ByteCodeInstruction::kCompareUGT,
+                                            ByteCodeInstruction::kCompareFGT,
+                                            count);
+                break;
+            case Token::Kind::GTEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGTEQ,
+                                            ByteCodeInstruction::kCompareUGTEQ,
+                                            ByteCodeInstruction::kCompareFGTEQ,
+                                            count);
+                break;
+            case Token::Kind::LT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLT,
+                                            ByteCodeInstruction::kCompareULT,
+                                            ByteCodeInstruction::kCompareFLT,
+                                            count);
+                break;
+            case Token::Kind::LTEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLTEQ,
+                                            ByteCodeInstruction::kCompareULTEQ,
+                                            ByteCodeInstruction::kCompareFLTEQ,
+                                            count);
+                break;
+            case Token::Kind::MINUS:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            count);
+                break;
+            case Token::Kind::NEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareINEQ,
+                                            ByteCodeInstruction::kCompareINEQ,
+                                            ByteCodeInstruction::kCompareFNEQ,
+                                            count);
+                // Collapse to a single bool
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kOrB);
+                }
+                break;
+            case Token::Kind::PERCENT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kRemainderS,
+                                            ByteCodeInstruction::kRemainderU,
+                                            ByteCodeInstruction::kRemainderF,
+                                            count);
+                break;
+            case Token::Kind::PLUS:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            count);
+                break;
+            case Token::Kind::SLASH:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kDivideS,
+                                            ByteCodeInstruction::kDivideU,
+                                            ByteCodeInstruction::kDivideF,
+                                            count);
+                break;
+            case Token::Kind::STAR:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kMultiplyI,
+                                            ByteCodeInstruction::kMultiplyI,
+                                            ByteCodeInstruction::kMultiplyF,
+                                            count);
+                break;
+
+            case Token::Kind::LOGICALXOR:
+                SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
+            case Token::Kind::BITWISEAND:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kAndB);
+                break;
+            case Token::Kind::BITWISEOR:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kOrB);
+                break;
+            case Token::Kind::BITWISEXOR:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
+            default:
+                fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
+                                                              Compiler::OperatorName(op)));
+                break;
         }
-        case Token::Kind::BITWISEOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kOr);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
-        }
-        case Token::Kind::BITWISEXOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kXor);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
-        }
-        default:
-            fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
-                                                          Compiler::OperatorName(op)));
-            break;
     }
     if (lvalue) {
-        lvalue->store(result);
+        lvalue->store(discard);
+        discard = false;
     }
+    return discard;
 }
 
-void ByteCodeGenerator::writeConstructor(const Constructor& c, ByteCode::Register result) {
-    if (c.fType.rows() > 1) {
-        if (c.fArguments.size() == 1) {
-            if (SlotCount(c.fArguments[0]->fType) == 1) {
-                ByteCode::Register v = this->next(1);
-                this->writeExpression(*c.fArguments[0], v);
-                this->write(ByteCode::Instruction::kScalarToMatrix);
-                this->write(result);
-                this->write(v);
-                this->write((uint8_t) c.fType.columns());
-                this->write((uint8_t) c.fType.rows());
-                return;
-            } else if (c.fArguments[0]->fType.rows() > 1) {
-                ByteCode::Register v = this->next(SlotCount(c.fArguments[0]->fType));
-                this->writeExpression(*c.fArguments[0], v);
-                this->write(ByteCode::Instruction::kMatrixToMatrix);
-                this->write(result);
-                this->write(v);
-                this->write((uint8_t) c.fArguments[0]->fType.columns());
-                this->write((uint8_t) c.fArguments[0]->fType.rows());
-                this->write((uint8_t) c.fType.columns());
-                this->write((uint8_t) c.fType.rows());
-                return;
+void ByteCodeGenerator::writeBoolLiteral(const BoolLiteral& b) {
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(b.fValue ? ~0 : 0);
+}
+
+void ByteCodeGenerator::writeConstructor(const Constructor& c) {
+    for (const auto& arg : c.fArguments) {
+        this->writeExpression(*arg);
+    }
+    if (c.fArguments.size() == 1) {
+        const Type& inType = c.fArguments[0]->fType;
+        const Type& outType = c.fType;
+        TypeCategory inCategory = type_category(inType);
+        TypeCategory outCategory = type_category(outType);
+        int inCount = SlotCount(inType);
+        int outCount = SlotCount(outType);
+        if (inCategory != outCategory) {
+            SkASSERT(inCount == outCount);
+            if (inCategory == TypeCategory::kFloat) {
+                SkASSERT(outCategory == TypeCategory::kSigned ||
+                         outCategory == TypeCategory::kUnsigned);
+                this->write(vector_instruction(ByteCodeInstruction::kConvertFtoI, outCount));
+            } else if (outCategory == TypeCategory::kFloat) {
+                if (inCategory == TypeCategory::kSigned) {
+                    this->write(vector_instruction(ByteCodeInstruction::kConvertStoF, outCount));
+                } else {
+                    SkASSERT(inCategory == TypeCategory::kUnsigned);
+                    this->write(vector_instruction(ByteCodeInstruction::kConvertUtoF, outCount));
+                }
+            } else {
+                SkASSERT(false);
             }
         }
-        int offset = 0;
-        for (const auto& arg : c.fArguments) {
-            this->writeExpression(*arg, ByteCode::Register{(uint16_t) (result.fIndex + offset)});
-            offset += SlotCount(arg->fType);
-        }
-        return;
-    }
-    if (c.fArguments.size() == 1 && c.fArguments[0]->fType.columns() == 1 &&
-        c.fType.columns() > 1) {
-        SkASSERT(SlotCount(c.fArguments[0]->fType) == 1);
-        ByteCode::Register v = result;
-        this->writeExpression(*c.fArguments[0], v);
-        this->write(ByteCode::Instruction::kSplat);
-        this->write((uint8_t) (c.fType.columns() - 1));
-        this->write(v + 1);
-        this->write(v);
-        return;
-    }
-    ByteCode::Instruction inst;
-    switch (type_category(c.fArguments[0]->fType)) {
-        case TypeCategory::kSigned:
-            if (type_category(c.fType) == TypeCategory::kFloat) {
-                inst = ByteCode::Instruction::kSignedToFloat;
+        if (inType.kind() == Type::kMatrix_Kind && outType.kind() == Type::kMatrix_Kind) {
+            this->write(ByteCodeInstruction::kMatrixToMatrix,
+                        SlotCount(outType) - SlotCount(inType));
+            this->write8(inType.columns());
+            this->write8(inType.rows());
+            this->write8(outType.columns());
+            this->write8(outType.rows());
+        } else if (inCount != outCount) {
+            SkASSERT(inCount == 1);
+            if (outType.kind() == Type::kMatrix_Kind) {
+                this->write(ByteCodeInstruction::kScalarToMatrix, SlotCount(outType) - 1);
+                this->write8(outType.columns());
+                this->write8(outType.rows());
             } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        case TypeCategory::kUnsigned:
-            if (type_category(c.fType) == TypeCategory::kFloat) {
-                inst = ByteCode::Instruction::kUnsignedToFloat;
-            } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        case TypeCategory::kFloat:
-            if (type_category(c.fType) == TypeCategory::kSigned) {
-                inst = ByteCode::Instruction::kFloatToSigned;
-            } else if (type_category(c.fType) == TypeCategory::kUnsigned) {
-                inst = ByteCode::Instruction::kFloatToUnsigned;
-            } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        default:
-            SkASSERT(false);
-            return;
-    }
-    ByteCode::Register values;
-    if (inst == ByteCode::Instruction::kNop) {
-        values = result;
-    } else {
-        values = this->next(SlotCount(c.fType));
-    }
-    ByteCode::Register v = values;
-    for (size_t i = 0; i < c.fArguments.size(); ++i) {
-        this->writeExpression(*c.fArguments[i], v);
-        v.fIndex += SlotCount(c.fArguments[i]->fType);
-    }
-    if (inst != ByteCode::Instruction::kNop) {
-        v = values;
-        ByteCode::Register target = result;
-        for (size_t i = 0; i < c.fArguments.size(); ++i) {
-            int count = SlotCount(c.fArguments[i]->fType);
-            for (int j = 0; j < count; ++j) {
-                this->write(inst);
-                this->write(target);
-                ++target.fIndex;
-                this->write(v + j);
+                SkASSERT(outType.kind() == Type::kVector_Kind);
+                for (; inCount != outCount; ++inCount) {
+                    this->write(ByteCodeInstruction::kDup);
+                    this->write8(1);
+                }
             }
         }
     }
 }
 
-void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f,
-                                                  ByteCode::Register result) {
+void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f) {
     int argumentCount = 0;
     for (const auto& arg : f.fArguments) {
+        this->writeExpression(*arg);
         argumentCount += SlotCount(arg->fType);
     }
-    ByteCode::Register args = this->next(argumentCount);
-    argumentCount = 0;
-    for (const auto& arg : f.fArguments) {
-        this->writeExpression(*arg, args + argumentCount);
-        argumentCount += SlotCount(arg->fType);
-    }
-    this->write(ByteCode::Instruction::kCallExternal);
-    this->write(result);
+    this->write(ByteCodeInstruction::kCallExternal, SlotCount(f.fType) - argumentCount);
+    SkASSERT(argumentCount <= 255);
+    this->write8(argumentCount);
+    this->write8(SlotCount(f.fType));
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(f.fFunction);
     SkASSERT(index <= 255);
-    this->write((uint8_t) index);
-    SkASSERT(SlotCount(f.fType) <= 255);
-    this->write((uint8_t) SlotCount(f.fType));
-    this->write(args);
-    SkASSERT(argumentCount <= 255);
-    this->write((uint8_t) argumentCount);
+    this->write8(index);
 }
 
-void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e,
-                                           ByteCode::Register result) {
-    this->write(ByteCode::Instruction::kReadExternal);
-    this->write(result);
-    this->write((uint8_t) SlotCount(e.fValue->type()));
+void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e) {
+    int count = SlotCount(e.fValue->type());
+    this->write(vector_instruction(ByteCodeInstruction::kReadExternal, count));
+    this->write8(count);
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(e.fValue);
     SkASSERT(index <= 255);
-    this->write((uint8_t) index);
+    this->write8(index);
 }
 
-void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic,
-                                           ByteCode::Register result) {
-    if (intrinsic.fIsSpecial) {
-        switch (intrinsic.fValue.fSpecial) {
+void ByteCodeGenerator::writeVariableExpression(const Expression& expr) {
+    Location location = this->getLocation(expr);
+    int count = SlotCount(expr.fType);
+    if (location.isOnStack() || count > 4) {
+        if (!location.isOnStack()) {
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(location.fSlot);
+        }
+        this->write(location.selectLoad(ByteCodeInstruction::kLoadExtended,
+                                        ByteCodeInstruction::kLoadExtendedGlobal,
+                                        ByteCodeInstruction::kLoadExtendedUniform),
+                    count);
+        this->write8(count);
+    } else {
+        this->write(vector_instruction(location.selectLoad(ByteCodeInstruction::kLoad,
+                                                           ByteCodeInstruction::kLoadGlobal,
+                                                           ByteCodeInstruction::kLoadUniform),
+                                       count));
+        this->write8(count);
+        this->write8(location.fSlot);
+    }
+}
+
+static inline uint32_t float_to_bits(float x) {
+    uint32_t u;
+    memcpy(&u, &x, sizeof(uint32_t));
+    return u;
+}
+
+void ByteCodeGenerator::writeFloatLiteral(const FloatLiteral& f) {
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(float_to_bits(f.fValue));
+}
+
+void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c) {
+    auto found = fIntrinsics.find(c.fFunction.fName);
+    if (found == fIntrinsics.end()) {
+        fErrors.error(c.fOffset, String::printf("Unsupported intrinsic: '%s'",
+                                                String(c.fFunction.fName).c_str()));
+        return;
+    }
+    int count = SlotCount(c.fArguments[0]->fType);
+    if (found->second.fIsSpecial) {
+        SpecialIntrinsic special = found->second.fValue.fSpecial;
+        switch (special) {
             case SpecialIntrinsic::kDot: {
                 SkASSERT(c.fArguments.size() == 2);
-                int count = SlotCount(c.fArguments[0]->fType);
-                ByteCode::Register left = this->next(count);
-                this->writeExpression(*c.fArguments[0], left);
-                ByteCode::Register right = this->next(count);
-                this->writeExpression(*c.fArguments[1], right);
-                ByteCode::Register product = this->next(count);
-                this->writeTypedInstruction(c.fType,
-                                            ByteCode::Instruction::kMultiplyIN,
-                                            ByteCode::Instruction::kMultiplyIN,
-                                            ByteCode::Instruction::kMultiplyFN);
-                this->write((uint8_t) count);
-                this->write(product);
-                this->write(left);
-                this->write(right);
-                ByteCode::Register total = product;
-                for (int i = 1; i < count; ++i) {
-                    this->writeTypedInstruction(c.fType,
-                                                ByteCode::Instruction::kAddI,
-                                                ByteCode::Instruction::kAddI,
-                                                ByteCode::Instruction::kAddF);
-                    ByteCode::Register sum = i == count - 1 ? result : this->next(1);
-                    this->write(sum);
-                    this->write(total);
-                    this->write(product + i);
-                    total = sum;
+                SkASSERT(count == SlotCount(c.fArguments[1]->fType));
+                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
+                this->write8(count);
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kAddF);
+                    this->write8(1);
                 }
                 break;
             }
-            case SpecialIntrinsic::kInverse: {
-                SkASSERT(c.fArguments.size() == 1);
-                int count = SlotCount(c.fArguments[0]->fType);
-                ByteCode::Register arg = this->next(count);
-                this->writeExpression(*c.fArguments[0], arg);
-                switch (SlotCount(c.fArguments[0]->fType)) {
-                    case 4:  this->write(ByteCode::Instruction::kInverse2x2); break;
-                    case 9:  this->write(ByteCode::Instruction::kInverse3x3); break;
-                    case 16: this->write(ByteCode::Instruction::kInverse4x4); break;
-                    default: SkASSERT(false);
-                }
-                this->write(result);
-                this->write(arg);
-                break;
-            }
-            case SpecialIntrinsic::kPrint: {
-                SkASSERT(c.fArguments.size() == 1);
-                SkASSERT(SlotCount(c.fArguments[0]->fType) == 1);
-                ByteCode::Register arg = this->next(1);
-                this->writeExpression(*c.fArguments[0], arg);
-                this->write(ByteCode::Instruction::kPrint);
-                this->write(arg);
-                break;
-            }
+            default:
+                SkASSERT(false);
         }
     } else {
-        int count = SlotCount(c.fType);
-        std::vector<ByteCode::Register> argRegs;
-        for (const auto& expr : c.fArguments) {
-            SkASSERT(SlotCount(expr->fType) == count);
-            ByteCode::Register reg = this->next(count);
-            this->writeExpression(*expr, reg);
-            argRegs.push_back(reg);
-        }
-        for (int i = 0; i < count; ++i) {
-            this->write(intrinsic.fValue.fInstruction);
-            if (c.fType.fName != "void") {
-                this->write(result + i);
+        switch (found->second.fValue.fInstruction) {
+            case ByteCodeInstruction::kCos:
+            case ByteCodeInstruction::kSin:
+            case ByteCodeInstruction::kTan:
+                SkASSERT(c.fArguments.size() > 0);
+                this->write(vector_instruction(found->second.fValue.fInstruction, count));
+                this->write8(count);
+                break;
+            case ByteCodeInstruction::kSqrt:
+                SkASSERT(c.fArguments.size() > 0);
+                this->write(vector_instruction(found->second.fValue.fInstruction, count));
+                break;
+            case ByteCodeInstruction::kInverse2x2: {
+                SkASSERT(c.fArguments.size() > 0);
+                auto op = ByteCodeInstruction::kInverse2x2;
+                switch (count) {
+                    case 4: break;  // float2x2
+                    case 9:  op = ByteCodeInstruction::kInverse3x3; break;
+                    case 16: op = ByteCodeInstruction::kInverse4x4; break;
+                    default: SkASSERT(false);
+                }
+                this->write(op);
+                break;
             }
-            for (ByteCode::Register arg : argRegs) {
-                this->write(arg + i);
-            }
+            default:
+                SkASSERT(false);
         }
     }
 }
 
-void ByteCodeGenerator::writeFunctionCall(const FunctionCall& c, ByteCode::Register result) {
-    auto found = fIntrinsics.find(c.fFunction.fName);
-    if (found != fIntrinsics.end()) {
-        return this->writeIntrinsicCall(c, found->second, result);
-    }
-    int argCount = c.fArguments.size();
-    std::vector<std::unique_ptr<LValue>> lvalues;
-    int parameterSlotCount = 0;
-    for (const auto& p : c.fFunction.fParameters) {
-        parameterSlotCount += SlotCount(p->fType);
-    }
-    ByteCode::Register argStart = this->next(parameterSlotCount);
-    ByteCode::Register nextArg = argStart;
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = c.fFunction.fParameters[i];
-        const auto& arg = c.fArguments[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            lvalues.emplace_back(this->getLValue(*arg));
-            lvalues.back()->load(nextArg);
-        } else {
-            this->writeExpression(*arg, nextArg);
-        }
-        nextArg.fIndex += SlotCount(arg->fType);
-    }
+void ByteCodeGenerator::writeFunctionCall(const FunctionCall& f) {
     // Find the index of the function we're calling. We explicitly do not allow calls to functions
     // before they're defined. This is an easy-to-understand rule that prevents recursion.
-    size_t idx;
-    for (idx = 0; idx < fFunctions.size(); ++idx) {
-        if (c.fFunction.matches(fFunctions[idx]->fDeclaration)) {
+    int idx = -1;
+    for (size_t i = 0; i < fFunctions.size(); ++i) {
+        if (f.fFunction.matches(fFunctions[i]->fDeclaration)) {
+            idx = i;
             break;
         }
     }
-    if (idx > 255) {
-        fErrors.error(c.fOffset, "Function count limit exceeded");
-        return;
-    } else if (idx >= fOutput->fFunctions.size()) {
-        fErrors.error(c.fOffset, "Call to undefined function");
-        return;
-    }
-
-    this->write(ByteCode::Instruction::kCall);
-    this->write(result);
-    this->write((uint8_t) idx);
-    this->write(argStart);
-    nextArg = argStart;
-    auto lvalue = lvalues.begin();
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = c.fFunction.fParameters[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            (*(lvalue++))->store(nextArg);
+    if (idx == -1) {
+        for (const auto& arg : f.fArguments) {
+            this->writeExpression(*arg);
         }
-        nextArg.fIndex += SlotCount(param->fType);
+        this->writeIntrinsicCall(f);
+        return;
     }
+
+    // idx indexes fOutput->fFunctions below, so bound it by the functions emitted so far.
+    if (idx > 255) {
+        fErrors.error(f.fOffset, "Function count limit exceeded");
+        return;
+    } else if (idx >= (int) fOutput->fFunctions.size()) {
+        fErrors.error(f.fOffset, "Call to undefined function");
+        return;
+    }
+
+    // We may need to deal with out parameters, so the sequence is tricky
+    if (int returnCount = SlotCount(f.fType)) {
+        this->write(ByteCodeInstruction::kReserve, returnCount);
+        this->write8(returnCount);
+    }
+
+    int argCount = f.fArguments.size();
+    std::vector<std::unique_ptr<LValue>> lvalues;
+    for (int i = 0; i < argCount; ++i) {
+        const auto& param = f.fFunction.fParameters[i];
+        const auto& arg = f.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            lvalues.emplace_back(this->getLValue(*arg));
+            lvalues.back()->load();
+        } else {
+            this->writeExpression(*arg);
+        }
+    }
+
+    // The space used by the call is based on the callee, but it also unwinds all of that before
+    // we continue execution. We adjust our max stack depths below.
+    this->write(ByteCodeInstruction::kCall);
+    this->write8(idx);
+
+    const ByteCodeFunction* callee = fOutput->fFunctions[idx].get();
+    fMaxLoopCount      = std::max(fMaxLoopCount,      fLoopCount      + callee->fLoopCount);
+    fMaxConditionCount = std::max(fMaxConditionCount, fConditionCount + callee->fConditionCount);
+    fMaxStackCount     = std::max(fMaxStackCount,     fStackCount     + callee->fLocalCount
+                                                                      + callee->fStackCount);
+
+    // After the called function returns, the stack will still contain our arguments. We have to
+    // pop them (storing any out parameters back to their lvalues as we go). We glob together slot
+    // counts for all parameters that aren't out-params, so we can pop them in one big chunk.
+    int popCount = 0;
+    auto pop = [&]() {
+        if (popCount > 4) {
+            this->write(ByteCodeInstruction::kPopN, popCount);
+            this->write8(popCount);
+        } else if (popCount > 0) {
+            this->write(vector_instruction(ByteCodeInstruction::kPop, popCount));
+        }
+        popCount = 0;
+    };
+
+    for (int i = argCount - 1; i >= 0; --i) {
+        const auto& param = f.fFunction.fParameters[i];
+        const auto& arg = f.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            pop();
+            lvalues.back()->store(true);
+            lvalues.pop_back();
+        } else {
+            popCount += SlotCount(arg->fType);
+        }
+    }
+    pop();
 }
 
-void ByteCodeGenerator::incOrDec(Token::Kind op, Expression& operand, bool prefix,
-                                 ByteCode::Register result) {
-    SkASSERT(op == Token::Kind::PLUSPLUS || op == Token::Kind::MINUSMINUS);
-    std::unique_ptr<LValue> lvalue = this->getLValue(operand);
-    SkASSERT(SlotCount(operand.fType) == 1);
-    ByteCode::Register value;
-    if (prefix) {
-        value = this->next(1);
-    } else {
-        value = result;
-    }
-    lvalue->load(value);
-    ByteCode::Register one = this->next(1);
-    this->write(ByteCode::Instruction::kImmediate);
-    this->write(one);
-    if (type_category(operand.fType) == TypeCategory::kFloat) {
-        this->write(ByteCode::Immediate(1.0f));
-    } else {
-        this->write(ByteCode::Immediate((int32_t) 1));
-    }
-    if (op == Token::Kind::PLUSPLUS) {
-        this->writeTypedInstruction(operand.fType,
-                                    ByteCode::Instruction::kAddI,
-                                    ByteCode::Instruction::kAddI,
-                                    ByteCode::Instruction::kAddF);
-    } else {
-        this->writeTypedInstruction(operand.fType,
-                                    ByteCode::Instruction::kSubtractI,
-                                    ByteCode::Instruction::kSubtractI,
-                                    ByteCode::Instruction::kSubtractF);
-    }
-    if (prefix) {
-        this->write(result);
-        this->write(value);
-        this->write(one);
-        lvalue->store(result);
-    } else {
-        ByteCode::Register temp = this->next(1);
-        this->write(temp);
-        this->write(value);
-        this->write(one);
-        lvalue->store(temp);
-    }
+void ByteCodeGenerator::writeIntLiteral(const IntLiteral& i) {
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(i.fValue);
 }
 
-void ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p,
-                                               ByteCode::Register result) {
-    this->incOrDec(p.fOperator, *p.fOperand, false, result);
+void ByteCodeGenerator::writeNullLiteral(const NullLiteral& n) {
+    // not yet implemented
+    abort();
 }
 
-void ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p,
-                                              ByteCode::Register result) {
+bool ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p, bool discard) {
     switch (p.fOperator) {
-        case Token::Kind::PLUSPLUS:
+        case Token::Kind::PLUSPLUS: // fall through
         case Token::Kind::MINUSMINUS: {
-            return this->incOrDec(p.fOperator, *p.fOperand, true, result);
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);
+            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
+            lvalue->load();
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
+            if (p.fOperator == Token::Kind::PLUSPLUS) {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            1);
+            } else {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            1);
+            }
+            lvalue->store(discard);
+            discard = false;
+            break;
         }
         case Token::Kind::MINUS: {
-            ByteCode::Register src = this->next(SlotCount(p.fType));
-            this->writeExpression(*p.fOperand, src);
-            for (int i = 0; i < SlotCount(p.fType); ++i) {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCode::Instruction::kNegateS,
-                                            ByteCode::Instruction::kNegateS,
-                                            ByteCode::Instruction::kNegateF);
-                this->write(result + i);
-                this->write(src + i);
-            }
+            this->writeExpression(*p.fOperand);
+            this->writeTypedInstruction(p.fType,
+                                        ByteCodeInstruction::kNegateI,
+                                        ByteCodeInstruction::kNegateI,
+                                        ByteCodeInstruction::kNegateF,
+                                        SlotCount(p.fOperand->fType),
+                                        false);
             break;
         }
         case Token::Kind::LOGICALNOT:
         case Token::Kind::BITWISENOT: {
-            ByteCode::Register src = this->next(SlotCount(p.fType));
-            this->writeExpression(*p.fOperand, src);
-            for (int i = 0; i < SlotCount(p.fType); ++i) {
-                this->write(ByteCode::Instruction::kNot);
-                this->write(result + i);
-                this->write(src + i);
-            }
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);
+            SkDEBUGCODE(TypeCategory tc = type_category(p.fOperand->fType));
+            SkASSERT((p.fOperator == Token::Kind::LOGICALNOT && tc == TypeCategory::kBool) ||
+                     (p.fOperator == Token::Kind::BITWISENOT && (tc == TypeCategory::kSigned ||
+                                                                 tc == TypeCategory::kUnsigned)));
+            this->writeExpression(*p.fOperand);
+            this->write(ByteCodeInstruction::kNotB);
             break;
         }
         default:
             SkASSERT(false);
     }
+    return discard;
 }
 
-void ByteCodeGenerator::writeSwizzle(const Swizzle& s, ByteCode::Register result) {
+bool ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p, bool discard) {
+    switch (p.fOperator) {
+        case Token::Kind::PLUSPLUS: // fall through
+        case Token::Kind::MINUSMINUS: {
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);
+            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
+            lvalue->load();
+            // If we're not supposed to discard the result, then make a copy *before* the +/-
+            if (!discard) {
+                this->write(ByteCodeInstruction::kDup);
+                this->write8(1);
+            }
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
+            if (p.fOperator == Token::Kind::PLUSPLUS) {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            1);
+            } else {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            1);
+            }
+            // Always consume the result as part of the store
+            lvalue->store(true);
+            discard = false;
+            break;
+        }
+        default:
+            SkASSERT(false);
+    }
+    return discard;
+}
+
+void ByteCodeGenerator::writeSwizzle(const Swizzle& s) {
     if (swizzle_is_simple(s)) {
-        this->writeVariableExpression(s, result);
+        this->writeVariableExpression(s);
         return;
     }
-    ByteCode::Register base = this->writeExpression(*s.fBase);
-    for (int i = 0; i < (int) s.fComponents.size(); ++i) {
-        this->write(ByteCode::Instruction::kCopy);
-        this->write(result + i);
-        this->write(base + s.fComponents[i]);
+
+    switch (s.fBase->fKind) {
+        case Expression::kVariableReference_Kind: {
+            Location location = this->getLocation(*s.fBase);
+            this->write(location.selectLoad(ByteCodeInstruction::kLoadSwizzle,
+                                            ByteCodeInstruction::kLoadSwizzleGlobal,
+                                            ByteCodeInstruction::kLoadSwizzleUniform),
+                        s.fComponents.size());
+            this->write8(location.fSlot);
+            this->write8(s.fComponents.size());
+            for (int c : s.fComponents) {
+                this->write8(c);
+            }
+            break;
+        }
+        default:
+            this->writeExpression(*s.fBase);
+            this->write(ByteCodeInstruction::kSwizzle,
+                        s.fComponents.size() - s.fBase->fType.columns());
+            this->write8(s.fBase->fType.columns());
+            this->write8(s.fComponents.size());
+            for (int c : s.fComponents) {
+                this->write8(c);
+            }
     }
 }
 
-void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t,
-                                               ByteCode::Register result) {
+void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t) {
     int count = SlotCount(t.fType);
     SkASSERT(count == SlotCount(t.fIfTrue->fType));
     SkASSERT(count == SlotCount(t.fIfFalse->fType));
 
-    ByteCode::Register test = this->writeExpression(*t.fTest);
-    this->write(ByteCode::Instruction::kMaskPush);
-    ++fConditionCount;
-    this->write(test);
-    ByteCode::Register ifTrue = this->writeExpression(*t.fIfTrue);
-    this->write(ByteCode::Instruction::kMaskNegate);
-    ByteCode::Register ifFalse = this->writeExpression(*t.fIfFalse);
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kMaskPop);
-    for (int i = 0; i < count; ++i) {
-        this->write(ByteCode::Instruction::kSelect);
-        this->write(result + i);
-        this->write(test);
-        this->write(ifTrue + i);
-        this->write(ifFalse + i);
-    }
+    this->writeExpression(*t.fTest);
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->writeExpression(*t.fIfTrue);
+    this->write(ByteCodeInstruction::kMaskNegate);
+    this->writeExpression(*t.fIfFalse);
+    this->write(ByteCodeInstruction::kMaskBlend, count);
+    this->write8(count);
 }
 
-void ByteCodeGenerator::writeVariableExpression(const Expression& expr,
-                                                ByteCode::Register result) {
-    ByteCodeGenerator::Location location = this->getLocation(expr);
-    int count = SlotCount(expr.fType);
-    ByteCode::Instruction load = this->getLoadInstruction(location, this->getStorage(expr));
-    this->write(load, count);
-    this->write(result);
-    this->write(location);
-}
-
-void ByteCodeGenerator::writeExpression(const Expression& expr, ByteCode::Register result) {
-    switch (expr.fKind) {
-        case Expression::kBoolLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((int32_t) (((BoolLiteral&) expr).fValue ? -1 : 0)));
+void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
+    switch (e.fKind) {
+        case Expression::kBinary_Kind:
+            discard = this->writeBinaryExpression((BinaryExpression&) e, discard);
             break;
-        }
-        case Expression::kBinary_Kind: {
-            this->writeBinaryExpression((BinaryExpression&) expr, result);
+        case Expression::kBoolLiteral_Kind:
+            this->writeBoolLiteral((BoolLiteral&) e);
             break;
-        }
-        case Expression::kConstructor_Kind: {
-            this->writeConstructor((Constructor&) expr, result);
+        case Expression::kConstructor_Kind:
+            this->writeConstructor((Constructor&) e);
             break;
-        }
         case Expression::kExternalFunctionCall_Kind:
-            this->writeExternalFunctionCall((ExternalFunctionCall&) expr, result);
+            this->writeExternalFunctionCall((ExternalFunctionCall&) e);
             break;
         case Expression::kExternalValue_Kind:
-            this->writeExternalValue((ExternalValueReference&) expr, result);
-            break;
-        case Expression::kFloatLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((float) ((FloatLiteral&) expr).fValue));
-            break;
-        }
-        case Expression::kFunctionCall_Kind: {
-            this->writeFunctionCall((FunctionCall&) expr, result);
-            break;
-        }
-        case Expression::kIntLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((int32_t) ((IntLiteral&) expr).fValue));
-            break;
-        }
-        case Expression::kPostfix_Kind:
-            this->writePostfixExpression((PostfixExpression&) expr, result);
-            break;
-        case Expression::kPrefix_Kind:
-            this->writePrefixExpression((PrefixExpression&) expr, result);
-            break;
-        case Expression::kSwizzle_Kind:
-            this->writeSwizzle((Swizzle&) expr, result);
-            break;
-        case Expression::kTernary_Kind:
-            this->writeTernaryExpression((TernaryExpression&) expr, result);
+            this->writeExternalValue((ExternalValueReference&) e);
             break;
         case Expression::kFieldAccess_Kind:
         case Expression::kIndex_Kind:
         case Expression::kVariableReference_Kind:
-            this->writeVariableExpression(expr, result);
+            this->writeVariableExpression(e);
+            break;
+        case Expression::kFloatLiteral_Kind:
+            this->writeFloatLiteral((FloatLiteral&) e);
+            break;
+        case Expression::kFunctionCall_Kind:
+            this->writeFunctionCall((FunctionCall&) e);
+            break;
+        case Expression::kIntLiteral_Kind:
+            this->writeIntLiteral((IntLiteral&) e);
+            break;
+        case Expression::kNullLiteral_Kind:
+            this->writeNullLiteral((NullLiteral&) e);
+            break;
+        case Expression::kPrefix_Kind:
+            discard = this->writePrefixExpression((PrefixExpression&) e, discard);
+            break;
+        case Expression::kPostfix_Kind:
+            discard = this->writePostfixExpression((PostfixExpression&) e, discard);
+            break;
+        case Expression::kSwizzle_Kind:
+            this->writeSwizzle((Swizzle&) e);
+            break;
+        case Expression::kTernary_Kind:
+            this->writeTernaryExpression((TernaryExpression&) e);
             break;
         default:
 #ifdef SK_DEBUG
-            ABORT("unsupported lvalue %s\n", expr.description().c_str());
+            printf("unsupported expression %s\n", e.description().c_str());
 #endif
-            break;
+            SkASSERT(false);
+    }
+    if (discard) {
+        int count = SlotCount(e.fType);
+        if (count > 4) {
+            this->write(ByteCodeInstruction::kPopN, count);
+            this->write8(count);
+        } else if (count != 0) {
+            this->write(vector_instruction(ByteCodeInstruction::kPop, count));
+        }
+        discard = false;
     }
 }
 
-ByteCode::Register ByteCodeGenerator::writeExpression(const Expression& expr) {
-    ByteCode::Register result = this->next(SlotCount(expr.fType));
-    this->writeExpression(expr, result);
-    return result;
+class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
+        : INHERITED(*generator)
+        , fCount(ByteCodeGenerator::SlotCount(value.type()))
+        , fIndex(index) {}
+    // Emits the kReadExternal variant for fCount, then fCount and fIndex via write8().
+    void load() override {
+        fGenerator.write(vector_instruction(ByteCodeInstruction::kReadExternal, fCount));
+        fGenerator.write8(fCount);
+        fGenerator.write8(fIndex);
+    }
+    // Emits the kWriteExternal variant for fCount; unless `discard`, a kDup is emitted first.
+    void store(bool discard) override {
+        if (!discard) {
+            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, fCount));
+            fGenerator.write8(fCount);
+        }
+        fGenerator.write(vector_instruction(ByteCodeInstruction::kWriteExternal, fCount));
+        fGenerator.write8(fCount);
+        fGenerator.write8(fIndex);
+    }
+
+private:
+    typedef LValue INHERITED;
+    // Slot count of the external value's type, computed once in the constructor.
+    int fCount;
+    // Caller-supplied index, written as the last operand of both instructions.
+    int fIndex;
+};
+
+class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle& swizzle)
+        : INHERITED(*generator)
+        , fSwizzle(swizzle) {}
+    // Reads the swizzled value by delegating to ByteCodeGenerator::writeSwizzle().
+    void load() override {
+        fGenerator.writeSwizzle(fSwizzle);
+    }
+    // Emits the swizzle store for fSwizzle's base; duplicates the value first unless `discard`.
+    void store(bool discard) override {
+        int count = fSwizzle.fComponents.size();
+        if (!discard) {
+            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+            fGenerator.write8(count);
+        }
+        ByteCodeGenerator::Location location = fGenerator.getLocation(*fSwizzle.fBase);
+        if (location.isOnStack()) {
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzleIndirect,
+                                                  ByteCodeInstruction::kStoreSwizzleIndirectGlobal),
+                             count);
+        } else {
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzle,
+                                                  ByteCodeInstruction::kStoreSwizzleGlobal),
+                             count);
+            fGenerator.write8(location.fSlot);
+        }
+        fGenerator.write8(count);
+        for (int c : fSwizzle.fComponents) {
+            fGenerator.write8(c);
+        }
+    }
+
+private:
+    const Swizzle& fSwizzle;
+
+    typedef LValue INHERITED;
+};
+
+class ByteCodeExpressionLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeExpressionLValue(ByteCodeGenerator* generator, const Expression& expr)
+        : INHERITED(*generator)
+        , fExpression(expr) {}
+    // Loads the value by delegating to ByteCodeGenerator::writeVariableExpression().
+    void load() override {
+        fGenerator.writeVariableExpression(fExpression);
+    }
+    // Stores to fExpression's location; uses the extended store if it is on-stack or count > 4.
+    void store(bool discard) override {
+        int count = ByteCodeGenerator::SlotCount(fExpression.fType);
+        if (!discard) {
+            if (count > 4) {
+                fGenerator.write(ByteCodeInstruction::kDupN, count);
+                fGenerator.write8(count);
+            } else {
+                fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+                fGenerator.write8(count);
+            }
+        }
+        ByteCodeGenerator::Location location = fGenerator.getLocation(fExpression);
+        if (location.isOnStack() || count > 4) {
+            if (!location.isOnStack()) {
+                fGenerator.write(ByteCodeInstruction::kPushImmediate);
+                fGenerator.write32(location.fSlot);
+            }
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreExtended,
+                                                  ByteCodeInstruction::kStoreExtendedGlobal),
+                             count);
+            fGenerator.write8(count);
+        } else {
+            fGenerator.write(
+                    vector_instruction(location.selectStore(ByteCodeInstruction::kStore,
+                                                            ByteCodeInstruction::kStoreGlobal),
+                                       count));
+            fGenerator.write8(location.fSlot);
+        }
+    }
+
+private:
+    typedef LValue INHERITED;
+
+    const Expression& fExpression;
+};
+
+std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& e) {
+    switch (e.fKind) {
+        case Expression::kExternalValue_Kind: {
+            ExternalValue* value = ((ExternalValueReference&) e).fValue;
+            int index = fOutput->fExternalValues.size();
+            fOutput->fExternalValues.push_back(value);
+            SkASSERT(index <= 255);
+            return std::unique_ptr<LValue>(new ByteCodeExternalValueLValue(this, *value, index));
+        }
+        case Expression::kFieldAccess_Kind:
+        case Expression::kIndex_Kind:
+        case Expression::kVariableReference_Kind:
+            return std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e));
+        case Expression::kSwizzle_Kind: {
+            const Swizzle& s = (const Swizzle&) e;
+            return swizzle_is_simple(s)
+                    ? std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e))
+                    : std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, s));
+        }
+        case Expression::kTernary_Kind:
+        default:
+#ifdef SK_DEBUG
+            ABORT("unsupported lvalue %s\n", e.description().c_str());
+#endif
+            return nullptr;
+    }
 }
 
 void ByteCodeGenerator::writeBlock(const Block& b) {
@@ -1253,119 +1484,146 @@
     }
 }
 
+void ByteCodeGenerator::setBreakTargets() {
+    std::vector<DeferredLocation>& breaks = fBreakTargets.top();
+    for (DeferredLocation& b : breaks) {
+        b.set();
+    }
+    fBreakTargets.pop();
+}
+
+void ByteCodeGenerator::setContinueTargets() {
+    std::vector<DeferredLocation>& continues = fContinueTargets.top();
+    for (DeferredLocation& c : continues) {
+        c.set();
+    }
+    fContinueTargets.pop();
+}
+
+void ByteCodeGenerator::writeBreakStatement(const BreakStatement& b) {
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopBreak);
+}
+
+void ByteCodeGenerator::writeContinueStatement(const ContinueStatement& c) {
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopContinue);
+}
+
 void ByteCodeGenerator::writeDoStatement(const DoStatement& d) {
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    SkASSERT(fCode->size() < ByteCode::kPointerMax);
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t start = fCode->size();
     this->writeStatement(*d.fStatement);
-    ByteCode::Register test = this->writeExpression(*d.fTest);
-    this->write(ByteCode::Instruction::kLoopNext);
-    this->write(ByteCode::Instruction::kLoopMask);
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kLoopNext);
+    this->writeExpression(*d.fTest);
+    this->write(ByteCodeInstruction::kLoopMask);
+    // TODO: Could shorten this with kBranchIfAnyTrue
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
+    fContinueTargets.emplace();
+    fBreakTargets.emplace();
     if (f.fInitializer) {
         this->writeStatement(*f.fInitializer);
     }
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t start = fCode->size();
     if (f.fTest) {
-        ByteCode::Register test = this->writeExpression(*f.fTest);
-        this->write(ByteCode::Instruction::kLoopMask);
-        this->write(test);
+        this->writeExpression(*f.fTest);
+        this->write(ByteCodeInstruction::kLoopMask);
     }
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*f.fStatement);
-    this->write(ByteCode::Instruction::kLoopNext);
+    this->write(ByteCodeInstruction::kLoopNext);
     if (f.fNext) {
-        this->writeExpression(*f.fNext);
+        this->writeExpression(*f.fNext, true);
     }
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeIfStatement(const IfStatement& i) {
-    ByteCode::Register test = this->writeExpression(*i.fTest);
-    this->write(ByteCode::Instruction::kMaskPush);
-    ++fConditionCount;
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->writeExpression(*i.fTest);
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation falseLocation(this);
     this->writeStatement(*i.fIfTrue);
     falseLocation.set();
     if (i.fIfFalse) {
-        this->write(ByteCode::Instruction::kMaskNegate);
-        this->write(ByteCode::Instruction::kBranchIfAllFalse);
+        this->write(ByteCodeInstruction::kMaskNegate);
+        this->write(ByteCodeInstruction::kBranchIfAllFalse);
         DeferredLocation endLocation(this);
         this->writeStatement(*i.fIfFalse);
         endLocation.set();
     }
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kMaskPop);
+    this->write(ByteCodeInstruction::kMaskPop);
 }
 
-void ByteCodeGenerator::writeReturn(const ReturnStatement& r) {
-    if (fConditionCount) {
+void ByteCodeGenerator::writeReturnStatement(const ReturnStatement& r) {
+    if (fLoopCount || fConditionCount) {
         fErrors.error(r.fOffset, "return not allowed inside conditional or loop");
         return;
     }
-    if (r.fExpression) {
-        ByteCode::Register value = this->writeExpression(*r.fExpression);
-        this->write(ByteCode::Instruction::kReturnValue);
-        this->write(value);
-    }
-    else {
-        this->write(ByteCode::Instruction::kReturn);
-    }
+    // A bare `return;` has a null fExpression: push nothing and return zero slots.
+    int count = r.fExpression ? SlotCount(r.fExpression->fType) : 0;
+    if (r.fExpression) { this->writeExpression(*r.fExpression); }
+    // Technically, the kReturn also pops fOutput->fLocalCount values from the stack, too, but we
+    // haven't counted pushing those (they're outside the scope of our stack tracking). Instead,
+    // we account for those in writeFunction().
+
+    // This is all fine because we don't allow conditional returns, so we only return once anyway.
+    this->write(ByteCodeInstruction::kReturn, -count);
+    this->write8(count);
+}
+
+void ByteCodeGenerator::writeSwitchStatement(const SwitchStatement& r) {
+    // not yet implemented
+    abort();
 }
 
 void ByteCodeGenerator::writeVarDeclarations(const VarDeclarations& v) {
     for (const auto& declStatement : v.fVars) {
         const VarDeclaration& decl = (VarDeclaration&) *declStatement;
-        // we need to grab the location even if we don't use it, to ensure it
-        // has been allocated
-        ByteCodeGenerator::Location location = this->getLocation(*decl.fVar);
+        // we need to grab the location even if we don't use it, to ensure it has been allocated
+        Location location = this->getLocation(*decl.fVar);
         if (decl.fValue) {
-            ByteCode::Register src = this->writeExpression(*decl.fValue);
-            uint8_t count = (uint8_t) SlotCount(decl.fVar->fType);
-            this->write(ByteCode::Instruction::kStoreStackDirect, count);
-            this->write(location);
-            this->write(src);
+            this->writeExpression(*decl.fValue);
+            int count = SlotCount(decl.fValue->fType);
+            if (count > 4) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(location.fSlot);
+                this->write(ByteCodeInstruction::kStoreExtended, count);
+                this->write8(count);
+            } else {
+                this->write(vector_instruction(ByteCodeInstruction::kStore, count));
+                this->write8(location.fSlot);
+            }
         }
     }
 }
 
 void ByteCodeGenerator::writeWhileStatement(const WhileStatement& w) {
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    SkASSERT(fCode->size() < ByteCode::kPointerMax);
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
-    ByteCode::Register test = this->writeExpression(*w.fTest);
-    this->write(ByteCode::Instruction::kLoopMask);
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t cond = fCode->size();
+    this->writeExpression(*w.fTest);
+    this->write(ByteCodeInstruction::kLoopMask);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*w.fStatement);
-    this->write(ByteCode::Instruction::kLoopNext);
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kLoopNext);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(cond);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeStatement(const Statement& s) {
@@ -1374,16 +1632,19 @@
             this->writeBlock((Block&) s);
             break;
         case Statement::kBreak_Kind:
-            this->write(ByteCode::Instruction::kBreak);
+            this->writeBreakStatement((BreakStatement&) s);
             break;
         case Statement::kContinue_Kind:
-            this->write(ByteCode::Instruction::kContinue);
+            this->writeContinueStatement((ContinueStatement&) s);
             break;
+        case Statement::kDiscard_Kind:
+            // not yet implemented
+            abort();
         case Statement::kDo_Kind:
             this->writeDoStatement((DoStatement&) s);
             break;
         case Statement::kExpression_Kind:
-            this->writeExpression(*((ExpressionStatement&) s).fExpression);
+            this->writeExpression(*((ExpressionStatement&) s).fExpression, true);
             break;
         case Statement::kFor_Kind:
             this->writeForStatement((ForStatement&) s);
@@ -1394,7 +1655,10 @@
         case Statement::kNop_Kind:
             break;
         case Statement::kReturn_Kind:
-            this->writeReturn((ReturnStatement&) s);
+            this->writeReturnStatement((ReturnStatement&) s);
+            break;
+        case Statement::kSwitch_Kind:
+            this->writeSwitchStatement((SwitchStatement&) s);
             break;
         case Statement::kVarDeclarations_Kind:
             this->writeVarDeclarations(*((VarDeclarationsStatement&) s).fDeclaration);
@@ -1403,80 +1667,18 @@
             this->writeWhileStatement((WhileStatement&) s);
             break;
         default:
-            ABORT("unsupported statement\n");
+            SkASSERT(false);
     }
 }
 
-void ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
-    fFunction = &f;
-    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
-    result->fReturnSlotCount = SlotCount(f.fDeclaration.fReturnType);
+ByteCodeFunction::ByteCodeFunction(const FunctionDeclaration* declaration)
+        : fName(declaration->fName) {
     fParameterCount = 0;
-    fConditionCount = 0;
-    for (const auto& p : f.fDeclaration.fParameters) {
-        int count = SlotCount(p->fType);
-        bool isOut = ((p->fModifiers.fFlags & Modifiers::kOut_Flag) != 0);
-        result->fParameters.push_back(ByteCodeFunction::Parameter{count, isOut});
-        fParameterCount += count;
-    }
-    result->fParameterSlotCount = fParameterCount;
-    fCode = &result->fCode;
-    this->writeStatement(*f.fBody);
-    result->fStackSlotCount = fLocals.size();
-    if (f.fDeclaration.fReturnType.fName == "void") {
-        this->write(ByteCode::Instruction::kReturn);
-    } else {
-        this->write(ByteCode::Instruction::kAbort);
-    }
-    fOutput->fFunctions.push_back(std::move(result));
-    SkASSERT(fConditionCount == 0);
-}
-
-void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
-    if (type.kind() == Type::kOther_Kind) {
-        return;
-    } else if (type.kind() == Type::kStruct_Kind) {
-        for (const auto& f : type.fields()) {
-            this->gatherUniforms(*f.fType, name + "." + f.fName);
-        }
-    } else if (type.kind() == Type::kArray_Kind) {
-        for (int i = 0; i < type.columns(); ++i) {
-            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
-        }
-    } else {
-        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
-                                       fOutput->fUniformSlotCount });
-        fOutput->fUniformSlotCount += type.columns() * type.rows();
+    for (const auto& p : declaration->fParameters) {
+        int slots = ByteCodeGenerator::SlotCount(p->fType);
+        fParameters.push_back({ slots, (bool)(p->fModifiers.fFlags & Modifiers::kOut_Flag) });
+        fParameterCount += slots;
     }
 }
 
-bool ByteCodeGenerator::generateCode() {
-    fOutput->fGlobalSlotCount = 0;
-    fOutput->fUniformSlotCount = 0;
-    for (const auto& pe : fProgram) {
-        if (pe.fKind == ProgramElement::kVar_Kind) {
-            VarDeclarations& decl = (VarDeclarations&) pe;
-            for (const auto& v : decl.fVars) {
-                const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                    continue;
-                }
-                if (is_uniform(*declVar)) {
-                    this->gatherUniforms(declVar->fType, declVar->fName);
-                } else {
-                    fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
-                }
-            }
-        }
-    }
-    for (const auto& pe : fProgram) {
-        if (pe.fKind == ProgramElement::kFunction_Kind) {
-            FunctionDefinition& f = (FunctionDefinition&) pe;
-            fFunctions.push_back(&f);
-            this->writeFunction(f);
-        }
-    }
-    return fErrors.errorCount() == 0;
 }
-
-} // namespace