| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_ENCODER_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_ENCODER_H_ |
| |
| #include <vector> |
| |
| #include "utils/base/logging.h" |
| #include "utils/sentencepiece/matcher.h" |
| #include "utils/strings/stringpiece.h" |
| |
| namespace libtextclassifier3 { |
| |
| // Encoder to segment/tokenize strings into pieces such that the sum of the |
| // scores of the pieces used is maximized. |
| class Encoder { |
| public: |
| // matcher: the list of valid sentence pieces represented as a matcher, e.g. |
| // a trie. |
| // num_pieces: the number of pieces in the trie. |
| // pieces_scores: the scores of the individual pieces. |
| // start_code: code that is used as encoding of the start of input. |
| // end_code: code that is used as encoding of the end of input. |
| // encoding_offset: value added to the sentence piece ids to make them |
| // not interesecting with start_code and end_code. |
| // unknown_code: code that is used for out-of-dictionary characters. |
| // unknown_score: the penality score associated with the unknown code. |
| Encoder(const SentencePieceMatcher* matcher, const int num_pieces, |
| const float* pieces_scores, int start_code = 0, int end_code = 1, |
| int encoding_offset = 2, int unknown_code = -1, |
| float unknown_score = 0.f) |
| : num_pieces_(num_pieces), |
| scores_(pieces_scores), |
| matcher_(matcher), |
| start_code_(start_code), |
| end_code_(end_code), |
| encoding_offset_(encoding_offset), |
| unknown_code_(unknown_code), |
| unknown_score_(unknown_score) {} |
| |
| // Segment the input so that the total score of the pieces used is maximized. |
| // This is a simplified implementation of the general Viterbi algorithm, |
| // assuming independence between individual pieces. |
| std::vector<int> Encode(StringPiece normalized_text) const; |
| |
| private: |
| // State in the dynamic programming algorithm. |
| struct SegmentationEntry { |
| // Accumulated score. |
| float score; |
| |
| // Position before last piece. |
| int previous_pos; |
| |
| // Last piece used. |
| int piece_id; |
| |
| // Total number of pieces used. |
| int num_pieces; |
| }; |
| |
| const int num_pieces_; |
| const float* scores_; |
| const SentencePieceMatcher* matcher_; |
| const int start_code_; |
| const int end_code_; |
| const int encoding_offset_; |
| const int unknown_code_; |
| const int unknown_score_; |
| }; |
| |
| } // namespace libtextclassifier3 |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_ENCODER_H_ |