/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "NFAtoDFA.h"
#include "RegexParser.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>

/**
 * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The
 * .lex file is a text file with one token definition per line. Each line is of the form:
 *     <TOKEN_NAME> = <pattern>
 * where <pattern> is either a regular expression (e.g. [0-9]) or a double-quoted literal string.
 */
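// Banner written at the top of every generated file.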
static constexpr const char* HEADER =
    "/*\n"
    " * Copyright 2017 Google Inc.\n"
    " *\n"
    " * Use of this source code is governed by a BSD-style license that can be\n"
    " * found in the LICENSE file.\n"
    " */\n"
    "/*****************************************************************************************\n"
    " ******************** This file was generated by sksllex. Do not edit. *******************\n"
    " *****************************************************************************************/\n";

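/**
 * Writes the generated header: a <token> struct holding (kind, offset, length) and a <lexer>
 * class exposing start() and next().
 */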
void writeH(const DFA& dfa, const char* lexer, const char* token,
            const std::vector<std::string>& tokens, const char* hPath) {
    std::ofstream out(hPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#ifndef SKSL_" << lexer << "\n";
    out << "#define SKSL_" << lexer << "\n";
    out << "#include <cstddef>\n";
    out << "#include <cstdint>\n";
    out << "namespace SkSL {\n";
    out << "\n";
    out << "struct " << token << " {\n";
    out << "    enum Kind {\n";
    for (const std::string& t : tokens) {
        out << "        #undef " << t << "\n";
        out << "        " << t << ",\n";
    }
    out << "    };\n";
    out << "\n";
    out << "    " << token << "()\n";
    out << "    : fKind(Kind::INVALID)\n";
    out << "    , fOffset(-1)\n";
    out << "    , fLength(-1) {}\n";
    out << "\n";
    out << "    " << token << "(Kind kind, int32_t offset, int32_t length)\n";
    out << "    : fKind(kind)\n";
    out << "    , fOffset(offset)\n";
    out << "    , fLength(length) {}\n";
    out << "\n";
    out << "    Kind fKind;\n";
    out << "    int32_t fOffset;\n";
    out << "    int32_t fLength;\n";
    out << "};\n";
    out << "\n";
    out << "class " << lexer << " {\n";
    out << "public:\n";
    out << "    void start(const char* text, int32_t length) {\n";
    out << "        fText = text;\n";
    out << "        fLength = length;\n";
    out << "        fOffset = 0;\n";
    out << "    }\n";
    out << "\n";
    out << "    " << token << " next();\n";
    out << "\n";
    out << "private:\n";
    out << "    const char* fText;\n";
    out << "    int32_t fLength;\n";
    out << "    int32_t fOffset;\n";
    out << "};\n";
    out << "\n";
    out << "} // namespace\n";
    out << "#endif\n";
}

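/**
 * Writes the generated .cpp: the DFA's character-mapping, transition, and accept tables, plus
 * the implementation of next(), which walks the transition table one character at a time.
 */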
void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
              const char* cppPath) {
    std::ofstream out(cppPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#include \"" << include << "\"\n";
    out << "\n";
    out << "namespace SkSL {\n";
    out << "\n";

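    // mappings[] collapses raw byte values into character classes, and transitions[][] is
    // indexed [character class][state]. Pad every row of the transition table out to the
    // widest row so it can be emitted as a rectangular array.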
    size_t states = 0;
    for (const auto& row : dfa.fTransitions) {
        states = std::max(states, row.size());
    }
    out << "static int8_t mappings[" << dfa.fCharMappings.size() << "] = {\n    ";
    const char* separator = "";
    for (int m : dfa.fCharMappings) {
        out << separator << std::to_string(m);
        separator = ", ";
    }
    out << "\n};\n";
    out << "static int16_t transitions[" << dfa.fTransitions.size() << "][" << states << "] = {\n";
    for (size_t c = 0; c < dfa.fTransitions.size(); ++c) {
        out << "    {";
        for (size_t j = 0; j < states; ++j) {
            if (j < dfa.fTransitions[c].size()) {
                out << " " << dfa.fTransitions[c][j] << ",";
            } else {
                out << " 0,";
            }
        }
        out << " },\n";
    }
    out << "};\n";
    out << "\n";

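    // accepts[i] holds the token kind accepted in state i; padding entries are filled with
    // INVALID.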
    out << "static int8_t accepts[" << states << "] = {";
    for (size_t i = 0; i < states; ++i) {
        if (i < dfa.fAccepts.size()) {
            out << " " << dfa.fAccepts[i] << ",";
        } else {
            out << " " << INVALID << ",";
        }
    }
    out << " };\n";
    out << "\n";

    out << token << " " << lexer << "::next() {\n";
    out << "    // note that we cheat here: normally a lexer needs to worry about the case\n";
    out << "    // where a token has a prefix which is not itself a valid token - for instance,\n";
    out << "    // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid\n";
    out << "    // tokens. Our grammar doesn't have this property, so we can simplify the logic\n";
    out << "    // a bit.\n";
    out << "    int32_t startOffset = fOffset;\n";
    out << "    if (startOffset == fLength) {\n";
    out << "        return " << token << "(" << token << "::END_OF_FILE, startOffset, 0);\n";
    out << "    }\n";
    out << "    int16_t state = 1;\n";
    out << "    while (fOffset < fLength) {\n";
    out << "        if ((uint8_t) fText[fOffset] >= " << dfa.fCharMappings.size() << ") {\n";
    out << "            ++fOffset;\n";
    out << "            break;\n";
    out << "        }\n";
    out << "        int16_t newState = transitions[mappings[(int) fText[fOffset]]][state];\n";
    out << "        if (!newState) {\n";
    out << "            break;\n";
    out << "        }\n";
    out << "        state = newState;\n";
    out << "        ++fOffset;\n";
    out << "    }\n";
    out << "    " << token << "::Kind kind = (" << token << "::Kind) accepts[state];\n";
    out << "    return " << token << "(kind, startOffset, fOffset - startOffset);\n";
    out << "}\n";
    out << "\n";
    out << "} // namespace\n";
}
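/**
 * Parses the .lex file into an NFA (one regex per token definition), converts it to a DFA, and
 * writes the generated header and implementation files.
 */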
void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
             const char* cppPath) {
    NFA nfa;
    std::vector<std::string> tokens;
    tokens.push_back("END_OF_FILE");
    std::string line;
    std::ifstream in(inPath);
    while (std::getline(in, line)) {
        std::istringstream split(line);
        std::string name, delimiter, pattern;
        if (split >> name >> delimiter >> pattern) {
            SkASSERT(split.eof());
            SkASSERT(name != "");
            SkASSERT(delimiter == "=");
            SkASSERT(pattern != "");
            tokens.push_back(name);
            if (pattern[0] == '"') {
                // Double-quoted literal: build an explicit concatenation of character nodes.
                SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
                RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
                for (size_t i = 2; i < pattern.size() - 1; ++i) {
                    node = RegexNode(RegexNode::kConcat_Kind, node,
                                     RegexNode(RegexNode::kChar_Kind, pattern[i]));
                }
                nfa.addRegex(node);
            } else {
                nfa.addRegex(RegexParser().parse(pattern));
            }
        }
    }
    NFAtoDFA converter(&nfa);
    DFA dfa = converter.convert();
    writeH(dfa, lexer, token, tokens, hPath);
    writeCPP(dfa, lexer, token, (std::string("SkSL") + lexer + ".h").c_str(), cppPath);
}

int main(int argc, const char** argv) {
    if (argc != 6) {
        printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
        exit(1);
    }
    process(argv[1], argv[2], argv[3], argv[4], argv[5]);
    return 0;
}