improved SkSL lexer performance

Bug: skia:
Change-Id: Ibbb427e511118a0a1819094a59c17cc0f966008c
Reviewed-on: https://skia-review.googlesource.com/145336
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Ethan Nicholas <ethannicholas@google.com>
diff --git a/src/sksl/SkSLLexer.cpp b/src/sksl/SkSLLexer.cpp
index 5bb970a..b514462 100644
--- a/src/sksl/SkSLLexer.cpp
+++ b/src/sksl/SkSLLexer.cpp
@@ -11,7 +11,7 @@
 
 namespace SkSL {
 
-static int16_t mappings[127] = {
+static int8_t mappings[127] = {
         0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  3,  3,  1,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  1,  4,  3,  5,  6,  7,  8,  3,  9,  10, 11, 12,
         13, 14, 15, 16, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26,
@@ -940,31 +940,30 @@
 };
 
 Token Lexer::next() {
-    int startOffset = fOffset;
+    // note that we cheat here: normally a lexer needs to worry about the case
+    // where a token has a prefix which is not itself a valid token - for instance,
+    // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid
+    // tokens. Our grammar doesn't have this property, so we can simplify the logic
+    // a bit.
+    int32_t startOffset = fOffset;
     if (startOffset == fLength) {
         return Token(Token::END_OF_FILE, startOffset, 0);
     }
-    int offset = startOffset;
-    int state = 1;
-    Token::Kind lastAccept = Token::Kind::INVALID;
-    int lastAcceptEnd = startOffset + 1;
-    while (offset < fLength) {
-        if ((uint8_t)fText[offset] >= 127) {
+    int16_t state = 1;
+    while (fOffset < fLength) {
+        if ((uint8_t)fText[fOffset] >= 127) {
+            ++fOffset;
             break;
         }
-        state = transitions[mappings[(int)fText[offset]]][state];
-        ++offset;
-        if (!state) {
+        int16_t newState = transitions[mappings[(int)fText[fOffset]]][state];
+        if (!newState) {
             break;
         }
-        // We seem to be getting away without doing this check.
-        /*if (accepts[state] != -1)*/ {
-            lastAccept = (Token::Kind)accepts[state];
-            lastAcceptEnd = offset;
-        }
+        state = newState;
+        ++fOffset;
     }
-    fOffset = lastAcceptEnd;
-    return Token(lastAccept, startOffset, lastAcceptEnd - startOffset);
+    Token::Kind kind = (Token::Kind)accepts[state];
+    return Token(kind, startOffset, fOffset - startOffset);
 }
 
-}  // namespace SkSL
+}  // namespace
diff --git a/src/sksl/SkSLLexer.h b/src/sksl/SkSLLexer.h
index 2700287..1d090f2 100644
--- a/src/sksl/SkSLLexer.h
+++ b/src/sksl/SkSLLexer.h
@@ -215,7 +215,8 @@
 
     Token() : fKind(Kind::INVALID), fOffset(-1), fLength(-1) {}
 
-    Token(Kind kind, int offset, int length) : fKind(kind), fOffset(offset), fLength(length) {}
+    Token(Kind kind, int32_t offset, int32_t length)
+            : fKind(kind), fOffset(offset), fLength(length) {}
 
     Kind fKind;
     int fOffset;
@@ -224,7 +225,7 @@
 
 class Lexer {
 public:
-    void start(const char* text, size_t length) {
+    void start(const char* text, int32_t length) {
         fText = text;
         fLength = length;
         fOffset = 0;
@@ -234,9 +235,9 @@
 
 private:
     const char* fText;
-    int fLength;
-    int fOffset;
+    int32_t fLength;
+    int32_t fOffset;
 };
 
-}  // namespace SkSL
+}  // namespace
 #endif
diff --git a/src/sksl/lex/Main.cpp b/src/sksl/lex/Main.cpp
index 9a30b8b..04f266b 100644
--- a/src/sksl/lex/Main.cpp
+++ b/src/sksl/lex/Main.cpp
@@ -54,7 +54,7 @@
     out << "    , fOffset(-1)\n";
     out << "    , fLength(-1) {}\n";
     out << "\n";
-    out << "    " << token << "(Kind kind, int offset, int length)\n";
+    out << "    " << token << "(Kind kind, int32_t offset, int32_t length)\n";
     out << "    : fKind(kind)\n";
     out << "    , fOffset(offset)\n";
     out << "    , fLength(length) {}\n";
@@ -66,7 +66,7 @@
     out << "\n";
     out << "class " << lexer << " {\n";
     out << "public:\n";
-    out << "    void start(const char* text, size_t length) {\n";
+    out << "    void start(const char* text, int32_t length) {\n";
     out << "        fText = text;\n";
     out << "        fLength = length;\n";
     out << "        fOffset = 0;\n";
@@ -76,8 +76,8 @@
     out << "\n";
     out << "private:\n";
     out << "    const char* fText;\n";
-    out << "    int fLength;\n";
-    out << "    int fOffset;\n";
+    out << "    int32_t fLength;\n";
+    out << "    int32_t fOffset;\n";
     out << "};\n";
     out << "\n";
     out << "} // namespace\n";
@@ -98,7 +98,7 @@
     for (const auto& row : dfa.fTransitions) {
         states = std::max(states, row.size());
     }
-    out << "static int16_t mappings[" << dfa.fCharMappings.size() << "] = {\n    ";
+    out << "static int8_t mappings[" << dfa.fCharMappings.size() << "] = {\n    ";
     const char* separator = "";
     for (int m : dfa.fCharMappings) {
         out << separator << std::to_string(m);
@@ -131,32 +131,31 @@
     out << " };\n";
     out << "\n";
 
-    out << token << " " << lexer << "::next() {\n";;
-    out << "    int startOffset = fOffset;\n";
+    out << token << " " << lexer << "::next() {\n";
+    out << "    // note that we cheat here: normally a lexer needs to worry about the case\n";
+    out << "    // where a token has a prefix which is not itself a valid token - for instance,\n";
+    out << "    // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid\n";
+    out << "    // tokens. Our grammar doesn't have this property, so we can simplify the logic\n";
+    out << "    // a bit.\n";
+    out << "    int32_t startOffset = fOffset;\n";
     out << "    if (startOffset == fLength) {\n";
     out << "        return " << token << "(" << token << "::END_OF_FILE, startOffset, 0);\n";
     out << "    }\n";
-    out << "    int offset = startOffset;\n";
-    out << "    int state = 1;\n";
-    out << "    " << token << "::Kind lastAccept = " << token << "::Kind::INVALID;\n";
-    out << "    int lastAcceptEnd = startOffset + 1;\n";
-    out << "    while (offset < fLength) {\n";
-    out << "        if ((uint8_t) fText[offset] >= " << dfa.fCharMappings.size() << ") {";
+    out << "    int16_t state = 1;\n";
+    out << "    while (fOffset < fLength) {\n";
+    out << "        if ((uint8_t) fText[fOffset] >= " << dfa.fCharMappings.size() << ") {\n";
+    out << "            ++fOffset;\n";
     out << "            break;";
     out << "        }";
-    out << "        state = transitions[mappings[(int) fText[offset]]][state];\n";
-    out << "        ++offset;\n";
-    out << "        if (!state) {\n";
+    out << "        int16_t newState = transitions[mappings[(int) fText[fOffset]]][state];\n";
+    out << "        if (!newState) {\n";
     out << "            break;\n";
     out << "        }\n";
-    out << "        // We seem to be getting away without doing this check.\n";
-    out << "        /*if (accepts[state] != -1)*/ {\n";
-    out << "            lastAccept = (" << token << "::Kind) accepts[state];\n";
-    out << "            lastAcceptEnd = offset;\n";
-    out << "        }\n";
+    out << "        state = newState;\n";
+    out << "        ++fOffset;\n";
     out << "    }\n";
-    out << "    fOffset = lastAcceptEnd;\n";
-    out << "    return " << token << "(lastAccept, startOffset, lastAcceptEnd - startOffset);\n";
+    out << "    " << token << "::Kind kind = (" << token << "::Kind) accepts[state];\n";
+    out << "    return " << token << "(kind, startOffset, fOffset - startOffset);\n";
     out << "}\n";
     out << "\n";
     out << "} // namespace\n";