blob: 28ccb547a2634f015056da2a6cc337151a3a5a6c [file] [log] [blame]
Ethan Nicholasca82a922017-09-07 09:39:50 -04001/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "DFA.h"
9#include "DFAState.h"
10#include "NFA.h"
11#include "NFAState.h"
12
13#include <algorithm>
14#include <climits>
15#include <memory>
16#include <unordered_map>
Ethan Nicholas906126e2017-09-19 14:38:40 -040017#include <set>
Ethan Nicholasca82a922017-09-07 09:39:50 -040018#include <vector>
19
20/**
21 * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
22 * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
23 * possible combination of simultaneous NFA states and give this combination a label. These labelled
24 * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
25 *
26 * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
27 * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
28 * (in terms of the order in which they were added to the NFA).
29 */
30class NFAtoDFA {
31public:
Ethan Nicholas906126e2017-09-19 14:38:40 -040032 static constexpr char START_CHAR = 9;
33 static constexpr char END_CHAR = 126;
34
Ethan Nicholasca82a922017-09-07 09:39:50 -040035 NFAtoDFA(NFA* nfa)
36 : fNFA(*nfa) {}
37
38 /**
39 * Returns a DFA created from the NFA.
40 */
41 DFA convert() {
42 // create state 0, the "reject" state
43 getState(DFAState::Label({}));
44 // create a state representing being in all of the NFA's start states at once
45 std::vector<int> startStates = fNFA.fStartStates;
46 std::sort(startStates.begin(), startStates.end());
47 // this becomes state 1, our start state
48 DFAState* start = getState(DFAState::Label(startStates));
49 this->scanState(start);
50
Ethan Nicholas906126e2017-09-19 14:38:40 -040051 this->computeMappings();
52
Ethan Nicholasca82a922017-09-07 09:39:50 -040053 int stateCount = 0;
54 for (const auto& row : fTransitions) {
55 stateCount = std::max(stateCount, (int) row.size());
56 }
Ethan Nicholas906126e2017-09-19 14:38:40 -040057 return DFA(fCharMappings, fTransitions, fAccepts);
Ethan Nicholasca82a922017-09-07 09:39:50 -040058 }
59
60private:
61 /**
62 * Returns an existing state with the given label, or creates a new one and returns it.
63 */
64 DFAState* getState(DFAState::Label label) {
65 auto found = fStates.find(label);
66 if (found == fStates.end()) {
67 int id = fStates.size();
68 fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
69 return fStates[label].get();
70 }
71 return found->second.get();
72 }
73
74 void add(int nfaState, std::vector<int>* states) {
75 NFAState state = fNFA.fStates[nfaState];
76 if (state.fKind == NFAState::kRemapped_Kind) {
77 for (int next : state.fData) {
78 this->add(next, states);
79 }
80 } else {
81 for (int state : *states) {
82 if (nfaState == state) {
83 return;
84 }
85 }
86 states->push_back(nfaState);
87 }
88 }
89
90 void addTransition(char c, int start, int next) {
91 while (fTransitions.size() <= (size_t) c) {
92 fTransitions.push_back(std::vector<int>());
93 }
94 std::vector<int>& row = fTransitions[c];
95 while (row.size() <= (size_t) start) {
Ethan Nicholas906126e2017-09-19 14:38:40 -040096 row.push_back(INVALID);
Ethan Nicholasca82a922017-09-07 09:39:50 -040097 }
98 row[start] = next;
99 }
100
101 void scanState(DFAState* state) {
102 state->fIsScanned = true;
Ethan Nicholas906126e2017-09-19 14:38:40 -0400103 for (char c = START_CHAR; c <= END_CHAR; ++c) {
Ethan Nicholasca82a922017-09-07 09:39:50 -0400104 std::vector<int> next;
105 int bestAccept = INT_MAX;
106 for (int idx : state->fLabel.fStates) {
107 const NFAState& nfaState = fNFA.fStates[idx];
108 if (nfaState.accept(c)) {
109 for (int nextState : nfaState.fNext) {
110 if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
111 bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
112 }
113 this->add(nextState, &next);
114 }
115 }
116 }
117 std::sort(next.begin(), next.end());
118 DFAState* nextState = this->getState(DFAState::Label(next));
119 this->addTransition(c, state->fId, nextState->fId);
120 if (bestAccept != INT_MAX) {
121 while (fAccepts.size() <= (size_t) nextState->fId) {
Ethan Nicholas906126e2017-09-19 14:38:40 -0400122 fAccepts.push_back(INVALID);
Ethan Nicholasca82a922017-09-07 09:39:50 -0400123 }
124 fAccepts[nextState->fId] = bestAccept;
125 }
126 if (!nextState->fIsScanned) {
127 this->scanState(nextState);
128 }
129 }
130 }
131
Ethan Nicholas906126e2017-09-19 14:38:40 -0400132 // collapse rows with the same transitions to a single row. This is common, as each row
133 // represents a character and often there are many characters for which all transitions are
134 // identical (e.g. [0-9] are treated the same way by all lexer rules)
135 void computeMappings() {
136 // mappings[<input row>] = <output row>
137 std::vector<std::vector<int>*> uniques;
138 // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
139 for (size_t i = 0; i < fTransitions.size(); ++i) {
140 int found = -1;
141 for (size_t j = 0; j < uniques.size(); ++j) {
142 if (*uniques[j] == fTransitions[i]) {
143 found = j;
144 break;
145 }
146 }
147 if (found == -1) {
148 found = (int) uniques.size();
149 uniques.push_back(&fTransitions[i]);
150 }
151 fCharMappings.push_back(found);
152 }
153 std::vector<std::vector<int>> newTransitions;
154 for (std::vector<int>* row : uniques) {
155 newTransitions.push_back(*row);
156 }
157 fTransitions = newTransitions;
158 }
159
Ethan Nicholasca82a922017-09-07 09:39:50 -0400160 const NFA& fNFA;
161 std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
162 std::vector<std::vector<int>> fTransitions;
Ethan Nicholas906126e2017-09-19 14:38:40 -0400163 std::vector<int> fCharMappings;
Ethan Nicholasca82a922017-09-07 09:39:50 -0400164 std::vector<int> fAccepts;
165};