| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LIBTEXTCLASSIFIER_LANG_ID_LANG_ID_H_ |
| #define LIBTEXTCLASSIFIER_LANG_ID_LANG_ID_H_ |
| |
| // Clients who want to perform language identification should use this header. |
| // |
| // Note for lang id implementors: keep this header as light as possible. E.g., |
| // any macro defined here (or in a transitively #included file) is a potential |
| // name conflict with our clients. |
| |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "util/base/macros.h" |
| |
| namespace libtextclassifier { |
| namespace nlp_core { |
| namespace lang_id { |
| |
| // Forward declaration of the class doing all underlying work (LangId::pimpl_). |
| class LangIdImpl; |
| |
| // Class for detecting the language of a document. |
| // |
| // NOTE: this class is thread-unsafe. |
| class LangId { |
| public: |
| // Constructs a LangId object, loading an EmbeddingNetworkProto model from the |
| // indicated file. |
| // |
| // Note: we don't crash if we detect a problem at construction time (e.g., |
| // file doesn't exist, or its content is corrupted). Instead, we mark the |
| // newly-constructed object as invalid; clients can invoke FindLanguage() on |
| // an invalid object: nothing crashes, but accuracy will be bad. |
| explicit LangId(const std::string &filename); |
| |
| // Same as above, but takes a file descriptor instead of a file name. |
| explicit LangId(int fd); |
| |
| // Same as above, but takes an already mapped memory region (ptr, length). |
| explicit LangId(const char *ptr, size_t length); |
| |
| virtual ~LangId(); |
| |
| // Sets probability threshold for predictions. If our likeliest prediction is |
| // below this threshold, we report the default language (see |
| // SetDefaultLanguage()). Otherwise, we report the likeliest language. |
| // |
| // By default (if this method is not called) we use the probability threshold |
| // stored in the model, as the task parameter "reliability_thresh". If that |
| // task parameter is not specified, we use 0.5. A client can use this method |
| // to get a different precision / recall trade-off. The higher the threshold, |
| // the higher the precision and lower the recall rate. |
| void SetProbabilityThreshold(float threshold); |
| |
| // Sets default language to report if errors prevent running the real |
| // inference code or if prediction confidence is too small. |
| void SetDefaultLanguage(const std::string &lang); |
| |
| // Returns language code for the most likely language that text is written in. |
| // |
| // Note: if this LangId object is not valid (see is_valid()), this method |
| // returns the default language specified via SetDefaultLanguage() or (if that |
| // method was never invoked) the empty std::string. |
| std::string FindLanguage(const std::string &text) const; |
| |
| // Returns a vector of (language code, probability) pairs, one per reported |
| // language. The result contains at least one element. The sum of |
| // probabilities may be less than 1.0. |
| std::vector<std::pair<std::string, float>> FindLanguages( |
| const std::string &text) const; |
| |
| // Returns true if this object has been correctly initialized and is ready to |
| // perform predictions. For more info, see the doc for the first LangId |
| // constructor above. |
| bool is_valid() const; |
| |
| // Returns version number for the model. |
| int version() const; |
| |
| private: |
| // Returns a vector with the probabilities of languages for the given text. |
| std::vector<float> ScoreLanguages(const std::string &text) const; |
| |
| // Pimpl ("pointer to implementation") pattern, to hide all internals from our |
| // clients. |
| std::unique_ptr<LangIdImpl> pimpl_; |
| |
| TC_DISALLOW_COPY_AND_ASSIGN(LangId); |
| }; |
| |
| } // namespace lang_id |
| } // namespace nlp_core |
| } // namespace libtextclassifier |
| |
| #endif // LIBTEXTCLASSIFIER_LANG_ID_LANG_ID_H_ |