native/utils/wordpiece_tokenizer.h - platform/external/libtextclassifier - Gitiles

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
 #define LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_

 #include <string>
 #include <vector>

 #include "absl/strings/string_view.h"

 namespace libtextclassifier3 {

 // Result object returned by the vocabulary lookup and tokenization entry
 // points in this header: `success` says whether the call worked and
 // `error_msg` holds the failure description (empty on success).
 struct LookupStatus {
   // Returns a successful status; equivalent to default construction.
   static LookupStatus OK() { return LookupStatus(); }

   // Creates a successful status with an empty error message.
   LookupStatus() = default;

   // Creates a failed status carrying `msg` as its error message.
   explicit LookupStatus(const std::string& msg)
       : error_msg(msg), success(false) {}

   std::string error_msg;
   bool success = true;
 };

 // Abstract vocabulary interface consumed by WordpieceTokenize via its
 // `vocab_map` parameter.
 class WordpieceVocab {
  public:
   virtual ~WordpieceVocab() = default;

   // Looks up `key` and reports the outcome through `value`; the returned
   // LookupStatus describes whether the lookup itself succeeded.
   // NOTE(review): presumably `*value` is set to true when `key` is in the
   // vocabulary -- confirm against the concrete implementations.
   virtual LookupStatus Contains(absl::string_view key, bool* value) const = 0;
 };

 // Wordpiece-tokenizes `token` against `vocab_map`, writing results through
 // the `subwords`, `begin_offset`, `end_offset` and `num_word_pieces`
 // out-pointers and returning a LookupStatus.
 // NOTE(review): only the declaration is visible here; the exact meaning of
 // each limit/flag parameter and of the offsets should be confirmed against
 // the implementation file.
 LookupStatus WordpieceTokenize(
     absl::string_view token,
     int max_bytes_per_token,
     int max_chars_per_subtoken,
     const std::string& suffix_indicator,
     bool use_unknown_token,
     const std::string& unknown_token,
     bool split_unknown_characters,
     const WordpieceVocab* vocab_map,
     std::vector<std::string>* subwords,
     std::vector<int>* begin_offset,
     std::vector<int>* end_offset,
     int* num_word_pieces);

 // Backwards-compatible overload of the WordpieceTokenize declared above:
 // it takes no `max_chars_per_subtoken` argument (that limit is left
 // unspecified) and behaves as with split_unknown_characters=false.
 LookupStatus WordpieceTokenize(
     absl::string_view token,
     int max_bytes_per_token,
     const std::string& suffix_indicator,
     bool use_unknown_token,
     const std::string& unknown_token,
     const WordpieceVocab* vocab_map,
     std::vector<std::string>* subwords,
     std::vector<int>* begin_offset,
     std::vector<int>* end_offset,
     int* num_word_pieces);

 }  // namespace libtextclassifier3

 #endif  // LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
	#define LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_

	#include <string>
	#include <vector>

	#include "absl/strings/string_view.h"

	namespace libtextclassifier3 {

	// Result object returned by the vocabulary lookup and tokenization entry
	// points in this header: `success` says whether the call worked and
	// `error_msg` holds the failure description (empty on success).
	struct LookupStatus {
	  // Returns a successful status; equivalent to default construction.
	  static LookupStatus OK() { return LookupStatus(); }

	  // Creates a successful status with an empty error message.
	  LookupStatus() = default;

	  // Creates a failed status carrying `msg` as its error message.
	  explicit LookupStatus(const std::string& msg)
	      : error_msg(msg), success(false) {}

	  std::string error_msg;
	  bool success = true;
	};

	// Abstract vocabulary interface consumed by WordpieceTokenize via its
	// `vocab_map` parameter.
	class WordpieceVocab {
	 public:
	  virtual ~WordpieceVocab() = default;

	  // Looks up `key` and reports the outcome through `value`; the returned
	  // LookupStatus describes whether the lookup itself succeeded.
	  // NOTE(review): presumably `*value` is set to true when `key` is in the
	  // vocabulary -- confirm against the concrete implementations.
	  virtual LookupStatus Contains(absl::string_view key, bool* value) const = 0;
	};

	// Wordpiece-tokenizes `token` against `vocab_map`, writing results through
	// the `subwords`, `begin_offset`, `end_offset` and `num_word_pieces`
	// out-pointers and returning a LookupStatus.
	// NOTE(review): only the declaration is visible here; the exact meaning of
	// each limit/flag parameter and of the offsets should be confirmed against
	// the implementation file.
	LookupStatus WordpieceTokenize(
	    absl::string_view token,
	    int max_bytes_per_token,
	    int max_chars_per_subtoken,
	    const std::string& suffix_indicator,
	    bool use_unknown_token,
	    const std::string& unknown_token,
	    bool split_unknown_characters,
	    const WordpieceVocab* vocab_map,
	    std::vector<std::string>* subwords,
	    std::vector<int>* begin_offset,
	    std::vector<int>* end_offset,
	    int* num_word_pieces);

	// Backwards-compatible overload of the WordpieceTokenize declared above:
	// it takes no `max_chars_per_subtoken` argument (that limit is left
	// unspecified) and behaves as with split_unknown_characters=false.
	LookupStatus WordpieceTokenize(
	    absl::string_view token,
	    int max_bytes_per_token,
	    const std::string& suffix_indicator,
	    bool use_unknown_token,
	    const std::string& unknown_token,
	    const WordpieceVocab* vocab_map,
	    std::vector<std::string>* subwords,
	    std::vector<int>* begin_offset,
	    std::vector<int>* end_offset,
	    int* num_word_pieces);

	} // namespace libtextclassifier3

	#endif // LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_