/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "NFAtoDFA.h"
#include "RegexParser.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

/**
 * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The
 * .lex file is a text file with one token definition per line. Each line is of the form:
 *     <TOKEN_NAME> = <pattern>
 * where <pattern> is either a regular expression (e.g. [0-9]) or a double-quoted literal string.
 */
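
// For example, a hypothetical .lex file (these token names and patterns are purely
// illustrative, not taken from a real input) might contain:
//
//     IF          = "if"
//     INT_LITERAL = [0-9]+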

static constexpr const char* HEADER =
    "/*\n"
    " * Copyright 2017 Google Inc.\n"
    " *\n"
    " * Use of this source code is governed by a BSD-style license that can be\n"
    " * found in the LICENSE file.\n"
    " */\n"
    "/*****************************************************************************************\n"
    " ******************** This file was generated by sksllex. Do not edit. *******************\n"
    " *****************************************************************************************/\n";

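// Writes the generated header: a <token> struct holding a Kind enum (one enumerator per token
// definition, in .lex file order, beginning with END_OF_FILE) plus the token's offset and length,
// and a <lexer> class that exposes start() and next().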
void writeH(const DFA& dfa, const char* lexer, const char* token,
            const std::vector<std::string>& tokens, const char* hPath) {
    std::ofstream out(hPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#ifndef SKSL_" << lexer << "\n";
    out << "#define SKSL_" << lexer << "\n";
    out << "#include <cstddef>\n";
    out << "#include <cstdint>\n";
    out << "namespace SkSL {\n";
    out << "\n";
    out << "struct " << token << " {\n";
    out << "    enum Kind {\n";
    for (const std::string& t : tokens) {
        out << "        #undef " << t << "\n";
        out << "        " << t << ",\n";
    }
    out << "    };\n";
    out << "\n";
    out << "    " << token << "()\n";
    out << "    : fKind(Kind::INVALID)\n";
    out << "    , fOffset(-1)\n";
    out << "    , fLength(-1) {}\n";
    out << "\n";
    out << "    " << token << "(Kind kind, int offset, int length)\n";
    out << "    : fKind(kind)\n";
    out << "    , fOffset(offset)\n";
    out << "    , fLength(length) {}\n";
    out << "\n";
    out << "    Kind fKind;\n";
    out << "    int fOffset;\n";
    out << "    int fLength;\n";
    out << "};\n";
    out << "\n";
    out << "class " << lexer << " {\n";
    out << "public:\n";
    out << "    void start(const char* text, size_t length) {\n";
    out << "        fText = text;\n";
    out << "        fLength = length;\n";
    out << "        fOffset = 0;\n";
    out << "    }\n";
    out << "\n";
    out << "    " << token << " next();\n";
    out << "\n";
    out << "private:\n";
    out << "    const char* fText;\n";
    out << "    int fLength;\n";
    out << "    int fOffset;\n";
    out << "};\n";
    out << "\n";
    out << "} // namespace\n";
    out << "#endif\n";
}

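// Writes the generated .cpp: three static tables derived from the DFA ('mappings' maps an input
// byte to its character class, 'transitions' gives the next state indexed by
// [character class][current state], and 'accepts' gives the token Kind for each accepting state,
// or INVALID), followed by the implementation of <lexer>::next().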
void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
              const char* cppPath) {
    std::ofstream out(cppPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#include \"" << include << "\"\n";
    out << "\n";
    out << "namespace SkSL {\n";
    out << "\n";

    size_t states = 0;
    for (const auto& row : dfa.fTransitions) {
        states = std::max(states, row.size());
    }
    out << "static int16_t mappings[" << dfa.fCharMappings.size() << "] = {\n    ";
    const char* separator = "";
    for (int m : dfa.fCharMappings) {
        out << separator << std::to_string(m);
        separator = ", ";
    }
    out << "\n};\n";
    out << "static int16_t transitions[" << dfa.fTransitions.size() << "][" << states << "] = {\n";
    for (size_t c = 0; c < dfa.fTransitions.size(); ++c) {
        out << "    {";
        for (size_t j = 0; j < states; ++j) {
            if (j < dfa.fTransitions[c].size()) {
                out << " " << dfa.fTransitions[c][j] << ",";
            } else {
                out << " 0,";
            }
        }
        out << " },\n";
    }
    out << "};\n";
    out << "\n";

    out << "static int8_t accepts[" << states << "] = {";
    for (size_t i = 0; i < states; ++i) {
        if (i < dfa.fAccepts.size()) {
            out << " " << dfa.fAccepts[i] << ",";
        } else {
            out << " " << INVALID << ",";
        }
    }
    out << " };\n";
    out << "\n";

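    // The generated next() performs maximal munch: starting from state 1 it follows a transition
    // for each input byte, remembering the most recent accepting state, and stops at a dead
    // state (0) or end of input, returning the longest match found.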
    out << token << " " << lexer << "::next() {\n";
    out << "    int startOffset = fOffset;\n";
    out << "    if (startOffset == fLength) {\n";
    out << "        return " << token << "(" << token << "::END_OF_FILE, startOffset, 0);\n";
    out << "    }\n";
    out << "    int offset = startOffset;\n";
    out << "    int state = 1;\n";
    out << "    " << token << "::Kind lastAccept = " << token << "::Kind::INVALID;\n";
    out << "    int lastAcceptEnd = startOffset + 1;\n";
    out << "    while (offset < fLength) {\n";
    out << "        if ((uint8_t) fText[offset] >= " << dfa.fCharMappings.size() << ") {\n";
    out << "            break;\n";
    out << "        }\n";
    out << "        state = transitions[mappings[(int) fText[offset]]][state];\n";
    out << "        ++offset;\n";
    out << "        if (!state) {\n";
    out << "            break;\n";
    out << "        }\n";
    out << "        // We seem to be getting away without doing this check.\n";
    out << "        /*if (accepts[state] != -1)*/ {\n";
    out << "            lastAccept = (" << token << "::Kind) accepts[state];\n";
    out << "            lastAcceptEnd = offset;\n";
    out << "        }\n";
    out << "    }\n";
    out << "    fOffset = lastAcceptEnd;\n";
    out << "    return " << token << "(lastAccept, startOffset, lastAcceptEnd - startOffset);\n";
    out << "}\n";
    out << "\n";
    out << "} // namespace\n";
}

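// Reads the token definitions from inPath, registers one regex per definition with an NFA,
// converts the NFA to a DFA, and writes the generated header and .cpp files.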
void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
             const char* cppPath) {
    NFA nfa;
    std::vector<std::string> tokens;
    tokens.push_back("END_OF_FILE");
    std::string line;
    std::ifstream in(inPath);
    while (std::getline(in, line)) {
        std::istringstream split(line);
        std::string name, delimiter, pattern;
        if (split >> name >> delimiter >> pattern) {
            SkASSERT(split.eof());
            SkASSERT(name != "");
            SkASSERT(delimiter == "=");
            SkASSERT(pattern != "");
            tokens.push_back(name);
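            // A double-quoted literal is expanded into a concatenation of single-character
            // nodes; any other pattern is handed to RegexParser.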
            if (pattern[0] == '"') {
                SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
                RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
                for (size_t i = 2; i < pattern.size() - 1; ++i) {
                    node = RegexNode(RegexNode::kConcat_Kind, node,
                                     RegexNode(RegexNode::kChar_Kind, pattern[i]));
                }
                nfa.addRegex(node);
            } else {
                nfa.addRegex(RegexParser().parse(pattern));
            }
        }
    }
    NFAtoDFA converter(&nfa);
    DFA dfa = converter.convert();
    writeH(dfa, lexer, token, tokens, hPath);
    writeCPP(dfa, lexer, token, (std::string("SkSL") + lexer + ".h").c_str(), cppPath);
}

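// Example invocation (the file and class names here are hypothetical):
//
//     sksllex sksl.lex Lexer Token SkSLLexer.h SkSLLexer.cpp
//
// which would generate SkSLLexer.h/SkSLLexer.cpp defining SkSL::Lexer and SkSL::Token.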
int main(int argc, const char** argv) {
    if (argc != 6) {
        printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
        exit(1);
    }
    process(argv[1], argv[2], argv[3], argv[4], argv[5]);
    return 0;
}