| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "DFA.h" |
| #include "DFAState.h" |
| #include "NFA.h" |
| #include "NFAState.h" |
| |
| #include <algorithm> |
| #include <climits> |
| #include <memory> |
| #include <unordered_map> |
| #include <set> |
| #include <vector> |
| |
| /** |
| * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and |
| * DFAs differ only in that an NFA allows multiple states at the same time, we can find each |
| * possible combination of simultaneous NFA states and give this combination a label. These labelled |
| * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time. |
| * |
| * As an NFA can end up in multiple accept states at the same time (for instance, the token "while" |
| * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex |
| * (in terms of the order in which they were added to the NFA). |
| */ |
| class NFAtoDFA { |
| public: |
| static constexpr char START_CHAR = 9; |
| static constexpr char END_CHAR = 126; |
| |
| NFAtoDFA(NFA* nfa) |
| : fNFA(*nfa) {} |
| |
| /** |
| * Returns a DFA created from the NFA. |
| */ |
| DFA convert() { |
| // create state 0, the "reject" state |
| getState(DFAState::Label({})); |
| // create a state representing being in all of the NFA's start states at once |
| std::vector<int> startStates = fNFA.fStartStates; |
| std::sort(startStates.begin(), startStates.end()); |
| // this becomes state 1, our start state |
| DFAState* start = getState(DFAState::Label(startStates)); |
| this->scanState(start); |
| |
| this->computeMappings(); |
| |
| int stateCount = 0; |
| for (const auto& row : fTransitions) { |
| stateCount = std::max(stateCount, (int) row.size()); |
| } |
| return DFA(fCharMappings, fTransitions, fAccepts); |
| } |
| |
| private: |
| /** |
| * Returns an existing state with the given label, or creates a new one and returns it. |
| */ |
| DFAState* getState(DFAState::Label label) { |
| auto found = fStates.find(label); |
| if (found == fStates.end()) { |
| int id = fStates.size(); |
| fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label)); |
| return fStates[label].get(); |
| } |
| return found->second.get(); |
| } |
| |
| void add(int nfaState, std::vector<int>* states) { |
| NFAState state = fNFA.fStates[nfaState]; |
| if (state.fKind == NFAState::kRemapped_Kind) { |
| for (int next : state.fData) { |
| this->add(next, states); |
| } |
| } else { |
| for (int state : *states) { |
| if (nfaState == state) { |
| return; |
| } |
| } |
| states->push_back(nfaState); |
| } |
| } |
| |
| void addTransition(char c, int start, int next) { |
| while (fTransitions.size() <= (size_t) c) { |
| fTransitions.push_back(std::vector<int>()); |
| } |
| std::vector<int>& row = fTransitions[c]; |
| while (row.size() <= (size_t) start) { |
| row.push_back(INVALID); |
| } |
| row[start] = next; |
| } |
| |
| void scanState(DFAState* state) { |
| state->fIsScanned = true; |
| for (char c = START_CHAR; c <= END_CHAR; ++c) { |
| std::vector<int> next; |
| int bestAccept = INT_MAX; |
| for (int idx : state->fLabel.fStates) { |
| const NFAState& nfaState = fNFA.fStates[idx]; |
| if (nfaState.accept(c)) { |
| for (int nextState : nfaState.fNext) { |
| if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) { |
| bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]); |
| } |
| this->add(nextState, &next); |
| } |
| } |
| } |
| std::sort(next.begin(), next.end()); |
| DFAState* nextState = this->getState(DFAState::Label(next)); |
| this->addTransition(c, state->fId, nextState->fId); |
| if (bestAccept != INT_MAX) { |
| while (fAccepts.size() <= (size_t) nextState->fId) { |
| fAccepts.push_back(INVALID); |
| } |
| fAccepts[nextState->fId] = bestAccept; |
| } |
| if (!nextState->fIsScanned) { |
| this->scanState(nextState); |
| } |
| } |
| } |
| |
| // collapse rows with the same transitions to a single row. This is common, as each row |
| // represents a character and often there are many characters for which all transitions are |
| // identical (e.g. [0-9] are treated the same way by all lexer rules) |
| void computeMappings() { |
| // mappings[<input row>] = <output row> |
| std::vector<std::vector<int>*> uniques; |
| // this could be done more efficiently, but O(n^2) is plenty fast for our purposes |
| for (size_t i = 0; i < fTransitions.size(); ++i) { |
| int found = -1; |
| for (size_t j = 0; j < uniques.size(); ++j) { |
| if (*uniques[j] == fTransitions[i]) { |
| found = j; |
| break; |
| } |
| } |
| if (found == -1) { |
| found = (int) uniques.size(); |
| uniques.push_back(&fTransitions[i]); |
| } |
| fCharMappings.push_back(found); |
| } |
| std::vector<std::vector<int>> newTransitions; |
| for (std::vector<int>* row : uniques) { |
| newTransitions.push_back(*row); |
| } |
| fTransitions = newTransitions; |
| } |
| |
| const NFA& fNFA; |
| std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates; |
| std::vector<std::vector<int>> fTransitions; |
| std::vector<int> fCharMappings; |
| std::vector<int> fAccepts; |
| }; |