Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2017 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "DFA.h" |
| 9 | #include "DFAState.h" |
| 10 | #include "NFA.h" |
| 11 | #include "NFAState.h" |
| 12 | |
| 13 | #include <algorithm> |
| 14 | #include <climits> |
| 15 | #include <memory> |
| 16 | #include <unordered_map> |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 17 | #include <set> |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 18 | #include <vector> |
| 19 | |
| 20 | /** |
| 21 | * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and |
| 22 | * DFAs differ only in that an NFA allows multiple states at the same time, we can find each |
| 23 | * possible combination of simultaneous NFA states and give this combination a label. These labelled |
| 24 | * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time. |
| 25 | * |
| 26 | * As an NFA can end up in multiple accept states at the same time (for instance, the token "while" |
| 27 | * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex |
| 28 | * (in terms of the order in which they were added to the NFA). |
| 29 | */ |
| 30 | class NFAtoDFA { |
| 31 | public: |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 32 | static constexpr char START_CHAR = 9; |
| 33 | static constexpr char END_CHAR = 126; |
| 34 | |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 35 | NFAtoDFA(NFA* nfa) |
| 36 | : fNFA(*nfa) {} |
| 37 | |
| 38 | /** |
| 39 | * Returns a DFA created from the NFA. |
| 40 | */ |
| 41 | DFA convert() { |
| 42 | // create state 0, the "reject" state |
| 43 | getState(DFAState::Label({})); |
| 44 | // create a state representing being in all of the NFA's start states at once |
| 45 | std::vector<int> startStates = fNFA.fStartStates; |
| 46 | std::sort(startStates.begin(), startStates.end()); |
| 47 | // this becomes state 1, our start state |
| 48 | DFAState* start = getState(DFAState::Label(startStates)); |
| 49 | this->scanState(start); |
| 50 | |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 51 | this->computeMappings(); |
| 52 | |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 53 | int stateCount = 0; |
| 54 | for (const auto& row : fTransitions) { |
| 55 | stateCount = std::max(stateCount, (int) row.size()); |
| 56 | } |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 57 | return DFA(fCharMappings, fTransitions, fAccepts); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 58 | } |
| 59 | |
| 60 | private: |
| 61 | /** |
| 62 | * Returns an existing state with the given label, or creates a new one and returns it. |
| 63 | */ |
| 64 | DFAState* getState(DFAState::Label label) { |
| 65 | auto found = fStates.find(label); |
| 66 | if (found == fStates.end()) { |
| 67 | int id = fStates.size(); |
| 68 | fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label)); |
| 69 | return fStates[label].get(); |
| 70 | } |
| 71 | return found->second.get(); |
| 72 | } |
| 73 | |
| 74 | void add(int nfaState, std::vector<int>* states) { |
| 75 | NFAState state = fNFA.fStates[nfaState]; |
| 76 | if (state.fKind == NFAState::kRemapped_Kind) { |
| 77 | for (int next : state.fData) { |
| 78 | this->add(next, states); |
| 79 | } |
| 80 | } else { |
| 81 | for (int state : *states) { |
| 82 | if (nfaState == state) { |
| 83 | return; |
| 84 | } |
| 85 | } |
| 86 | states->push_back(nfaState); |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | void addTransition(char c, int start, int next) { |
| 91 | while (fTransitions.size() <= (size_t) c) { |
| 92 | fTransitions.push_back(std::vector<int>()); |
| 93 | } |
| 94 | std::vector<int>& row = fTransitions[c]; |
| 95 | while (row.size() <= (size_t) start) { |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 96 | row.push_back(INVALID); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 97 | } |
| 98 | row[start] = next; |
| 99 | } |
| 100 | |
| 101 | void scanState(DFAState* state) { |
| 102 | state->fIsScanned = true; |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 103 | for (char c = START_CHAR; c <= END_CHAR; ++c) { |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 104 | std::vector<int> next; |
| 105 | int bestAccept = INT_MAX; |
| 106 | for (int idx : state->fLabel.fStates) { |
| 107 | const NFAState& nfaState = fNFA.fStates[idx]; |
| 108 | if (nfaState.accept(c)) { |
| 109 | for (int nextState : nfaState.fNext) { |
| 110 | if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) { |
| 111 | bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]); |
| 112 | } |
| 113 | this->add(nextState, &next); |
| 114 | } |
| 115 | } |
| 116 | } |
| 117 | std::sort(next.begin(), next.end()); |
| 118 | DFAState* nextState = this->getState(DFAState::Label(next)); |
| 119 | this->addTransition(c, state->fId, nextState->fId); |
| 120 | if (bestAccept != INT_MAX) { |
| 121 | while (fAccepts.size() <= (size_t) nextState->fId) { |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 122 | fAccepts.push_back(INVALID); |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 123 | } |
| 124 | fAccepts[nextState->fId] = bestAccept; |
| 125 | } |
| 126 | if (!nextState->fIsScanned) { |
| 127 | this->scanState(nextState); |
| 128 | } |
| 129 | } |
| 130 | } |
| 131 | |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 132 | // collapse rows with the same transitions to a single row. This is common, as each row |
| 133 | // represents a character and often there are many characters for which all transitions are |
| 134 | // identical (e.g. [0-9] are treated the same way by all lexer rules) |
| 135 | void computeMappings() { |
| 136 | // mappings[<input row>] = <output row> |
| 137 | std::vector<std::vector<int>*> uniques; |
| 138 | // this could be done more efficiently, but O(n^2) is plenty fast for our purposes |
| 139 | for (size_t i = 0; i < fTransitions.size(); ++i) { |
| 140 | int found = -1; |
| 141 | for (size_t j = 0; j < uniques.size(); ++j) { |
| 142 | if (*uniques[j] == fTransitions[i]) { |
| 143 | found = j; |
| 144 | break; |
| 145 | } |
| 146 | } |
| 147 | if (found == -1) { |
| 148 | found = (int) uniques.size(); |
| 149 | uniques.push_back(&fTransitions[i]); |
| 150 | } |
| 151 | fCharMappings.push_back(found); |
| 152 | } |
| 153 | std::vector<std::vector<int>> newTransitions; |
| 154 | for (std::vector<int>* row : uniques) { |
| 155 | newTransitions.push_back(*row); |
| 156 | } |
| 157 | fTransitions = newTransitions; |
| 158 | } |
| 159 | |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 160 | const NFA& fNFA; |
| 161 | std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates; |
| 162 | std::vector<std::vector<int>> fTransitions; |
Ethan Nicholas | 906126e | 2017-09-19 14:38:40 -0400 | [diff] [blame] | 163 | std::vector<int> fCharMappings; |
Ethan Nicholas | ca82a92 | 2017-09-07 09:39:50 -0400 | [diff] [blame] | 164 | std::vector<int> fAccepts; |
| 165 | }; |