Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2017 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "NFAtoDFA.h" |
| 9 | #include "RegexParser.h" |
| 10 | |
| 11 | #include <fstream> |
| 12 | #include <sstream> |
| 13 | #include <string> |
| 14 | |
| 15 | /** |
| 16 | * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex |
| 17 | * file is a text file with one token definition per line. Each line is of the form: |
| 18 | * <TOKEN_NAME> = <pattern> |
| 19 | * where <pattern> is either a regular expression (e.g [0-9]) or a double-quoted literal string. |
| 20 | */ |
| 21 | |
| 22 | static constexpr const char* HEADER = |
| 23 | "/*\n" |
| 24 | " * Copyright 2017 Google Inc.\n" |
| 25 | " *\n" |
| 26 | " * Use of this source code is governed by a BSD-style license that can be\n" |
| 27 | " * found in the LICENSE file.\n" |
| 28 | " */\n" |
| 29 | "/*****************************************************************************************\n" |
| 30 | " ******************** This file was generated by sksllex. Do not edit. *******************\n" |
| 31 | " *****************************************************************************************/\n"; |
| 32 | |
| 33 | void writeH(const DFA& dfa, const char* lexer, const char* token, |
| 34 | const std::vector<std::string>& tokens, const char* hPath) { |
| 35 | std::ofstream out(hPath); |
Ethan Nicholas | d9d33c3 | 2018-06-12 11:05:59 -0400 | [diff] [blame] | 36 | SkASSERT(out.good()); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 37 | out << HEADER; |
| 38 | out << "#ifndef SKSL_" << lexer << "\n"; |
| 39 | out << "#define SKSL_" << lexer << "\n"; |
| 40 | out << "#include <cstddef>\n"; |
| 41 | out << "#include <cstdint>\n"; |
| 42 | out << "namespace SkSL {\n"; |
| 43 | out << "\n"; |
| 44 | out << "struct " << token << " {\n"; |
| 45 | out << " enum Kind {\n"; |
| 46 | for (const std::string& t : tokens) { |
| 47 | out << " #undef " << t << "\n"; |
| 48 | out << " " << t << ",\n"; |
| 49 | } |
| 50 | out << " };\n"; |
| 51 | out << "\n"; |
| 52 | out << " " << token << "()\n"; |
| 53 | out << " : fKind(Kind::INVALID)\n"; |
| 54 | out << " , fOffset(-1)\n"; |
| 55 | out << " , fLength(-1) {}\n"; |
| 56 | out << "\n"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 57 | out << " " << token << "(Kind kind, int32_t offset, int32_t length)\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 58 | out << " : fKind(kind)\n"; |
| 59 | out << " , fOffset(offset)\n"; |
| 60 | out << " , fLength(length) {}\n"; |
| 61 | out << "\n"; |
| 62 | out << " Kind fKind;\n"; |
| 63 | out << " int fOffset;\n"; |
| 64 | out << " int fLength;\n"; |
| 65 | out << "};\n"; |
| 66 | out << "\n"; |
| 67 | out << "class " << lexer << " {\n"; |
| 68 | out << "public:\n"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 69 | out << " void start(const char* text, int32_t length) {\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 70 | out << " fText = text;\n"; |
| 71 | out << " fLength = length;\n"; |
| 72 | out << " fOffset = 0;\n"; |
| 73 | out << " }\n"; |
| 74 | out << "\n"; |
| 75 | out << " " << token << " next();\n"; |
| 76 | out << "\n"; |
| 77 | out << "private:\n"; |
| 78 | out << " const char* fText;\n"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 79 | out << " int32_t fLength;\n"; |
| 80 | out << " int32_t fOffset;\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 81 | out << "};\n"; |
| 82 | out << "\n"; |
| 83 | out << "} // namespace\n"; |
| 84 | out << "#endif\n"; |
| 85 | } |
| 86 | |
| 87 | void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include, |
| 88 | const char* cppPath) { |
| 89 | std::ofstream out(cppPath); |
Ethan Nicholas | d9d33c3 | 2018-06-12 11:05:59 -0400 | [diff] [blame] | 90 | SkASSERT(out.good()); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 91 | out << HEADER; |
| 92 | out << "#include \"" << include << "\"\n"; |
| 93 | out << "\n"; |
| 94 | out << "namespace SkSL {\n"; |
| 95 | out << "\n"; |
| 96 | |
| 97 | size_t states = 0; |
| 98 | for (const auto& row : dfa.fTransitions) { |
| 99 | states = std::max(states, row.size()); |
| 100 | } |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 101 | out << "static int8_t mappings[" << dfa.fCharMappings.size() << "] = {\n "; |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 102 | const char* separator = ""; |
| 103 | for (int m : dfa.fCharMappings) { |
| 104 | out << separator << std::to_string(m); |
| 105 | separator = ", "; |
| 106 | } |
| 107 | out << "\n};\n"; |
| 108 | out << "static int16_t transitions[" << dfa.fTransitions.size() << "][" << states << "] = {\n"; |
| 109 | for (size_t c = 0; c < dfa.fTransitions.size(); ++c) { |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 110 | out << " {"; |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 111 | for (size_t j = 0; j < states; ++j) { |
| 112 | if ((size_t) c < dfa.fTransitions.size() && j < dfa.fTransitions[c].size()) { |
| 113 | out << " " << dfa.fTransitions[c][j] << ","; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 114 | } else { |
| 115 | out << " 0,"; |
| 116 | } |
| 117 | } |
| 118 | out << " },\n"; |
| 119 | } |
| 120 | out << "};\n"; |
| 121 | out << "\n"; |
| 122 | |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 123 | out << "static int8_t accepts[" << states << "] = {"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 124 | for (size_t i = 0; i < states; ++i) { |
| 125 | if (i < dfa.fAccepts.size()) { |
| 126 | out << " " << dfa.fAccepts[i] << ","; |
| 127 | } else { |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 128 | out << " " << INVALID << ","; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 129 | } |
| 130 | } |
| 131 | out << " };\n"; |
| 132 | out << "\n"; |
| 133 | |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 134 | out << token << " " << lexer << "::next() {\n"; |
| 135 | out << " // note that we cheat here: normally a lexer needs to worry about the case\n"; |
| 136 | out << " // where a token has a prefix which is not itself a valid token - for instance, \n"; |
| 137 | out << " // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid\n"; |
| 138 | out << " // tokens. Our grammar doesn't have this property, so we can simplify the logic\n"; |
| 139 | out << " // a bit.\n"; |
| 140 | out << " int32_t startOffset = fOffset;\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 141 | out << " if (startOffset == fLength) {\n"; |
| 142 | out << " return " << token << "(" << token << "::END_OF_FILE, startOffset, 0);\n"; |
| 143 | out << " }\n"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 144 | out << " int16_t state = 1;\n"; |
| 145 | out << " while (fOffset < fLength) {\n"; |
| 146 | out << " if ((uint8_t) fText[fOffset] >= " << dfa.fCharMappings.size() << ") {"; |
| 147 | out << " ++fOffset;\n"; |
Ethan Nicholas | e148bf7 | 2017-10-06 14:22:22 -0400 | [diff] [blame] | 148 | out << " break;"; |
| 149 | out << " }"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 150 | out << " int16_t newState = transitions[mappings[(int) fText[fOffset]]][state];\n"; |
| 151 | out << " if (!newState) {\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 152 | out << " break;\n"; |
| 153 | out << " }\n"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 154 | out << " state = newState;"; |
| 155 | out << " ++fOffset;\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 156 | out << " }\n"; |
Ethan Nicholas | c7735f1 | 2018-08-03 11:48:20 -0400 | [diff] [blame] | 157 | out << " Token::Kind kind = (" << token << "::Kind) accepts[state];\n"; |
| 158 | out << " return " << token << "(kind, startOffset, fOffset - startOffset);\n"; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 159 | out << "}\n"; |
| 160 | out << "\n"; |
| 161 | out << "} // namespace\n"; |
| 162 | } |
| 163 | |
| 164 | void process(const char* inPath, const char* lexer, const char* token, const char* hPath, |
| 165 | const char* cppPath) { |
| 166 | NFA nfa; |
| 167 | std::vector<std::string> tokens; |
| 168 | tokens.push_back("END_OF_FILE"); |
| 169 | std::string line; |
| 170 | std::ifstream in(inPath); |
| 171 | while (std::getline(in, line)) { |
| 172 | std::istringstream split(line); |
| 173 | std::string name, delimiter, pattern; |
| 174 | if (split >> name >> delimiter >> pattern) { |
Ethan Nicholas | d9d33c3 | 2018-06-12 11:05:59 -0400 | [diff] [blame] | 175 | SkASSERT(split.eof()); |
| 176 | SkASSERT(name != ""); |
| 177 | SkASSERT(delimiter == "="); |
| 178 | SkASSERT(pattern != ""); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 179 | tokens.push_back(name); |
| 180 | if (pattern[0] == '"') { |
Ethan Nicholas | d9d33c3 | 2018-06-12 11:05:59 -0400 | [diff] [blame] | 181 | SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"'); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 182 | RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]); |
| 183 | for (size_t i = 2; i < pattern.size() - 1; ++i) { |
| 184 | node = RegexNode(RegexNode::kConcat_Kind, node, |
| 185 | RegexNode(RegexNode::kChar_Kind, pattern[i])); |
| 186 | } |
| 187 | nfa.addRegex(node); |
| 188 | } |
| 189 | else { |
| 190 | nfa.addRegex(RegexParser().parse(pattern)); |
| 191 | } |
| 192 | } |
| 193 | } |
| 194 | NFAtoDFA converter(&nfa); |
| 195 | DFA dfa = converter.convert(); |
| 196 | writeH(dfa, lexer, token, tokens, hPath); |
| 197 | writeCPP(dfa, lexer, token, (std::string("SkSL") + lexer + ".h").c_str(), cppPath); |
| 198 | } |
| 199 | |
| 200 | int main(int argc, const char** argv) { |
| 201 | if (argc != 6) { |
| 202 | printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n"); |
| 203 | exit(1); |
| 204 | } |
| 205 | process(argv[1], argv[2], argv[3], argv[4], argv[5]); |
| 206 | return 0; |
| 207 | } |