icing/util/i18n-utils.cc - platform/external/icing - Gitiles

 // Copyright (C) 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "icing/util/i18n-utils.h"

 #include <cctype>
 #include <string_view>

 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/absl_ports/canonical_errors.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/util/logging.h"
 #include "unicode/uchar.h"
 #include "unicode/umachine.h"
 #include "unicode/ustring.h"
 #include "unicode/utf16.h"
 #include "unicode/utf8.h"
 #include "unicode/utypes.h"

 namespace icing {
 namespace lib {
 namespace i18n_utils {

 namespace {

 // All ASCII punctuation that is also in a Unicode Punctuation category
 // (https://www.fileformat.info/info/unicode/category/index.htm). std::ispunct
 // and u_ispunct do not agree on which characters are punctuation, so the
 // ASCII subset that both accept is spelled out explicitly here.
 //
 // This is a constexpr std::string_view over a string literal so that the
 // table is a compile-time constant with a trivial destructor, instead of a
 // std::string global that is constructed and destroyed at runtime.
 constexpr std::string_view ascii_icu_punctuation =
     "!\"#%&'*,./:;?@\\_-([{}])";

 }  // namespace

 libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
     const std::u16string& utf16_string) {
   // Worst case output size: 3 UTF-8 bytes per UTF-16 code unit, plus one for
   // the terminating NUL.
   //
   // The string is sized up front (the equivalent of resize(), not reserve())
   // because values can't be set at positions after length().
   const auto max_utf8_bytes = utf16_string.length() * 3 + 1;
   std::string converted(max_utf8_bytes, '\0');

   UErrorCode icu_status = U_ZERO_ERROR;
   int bytes_written = 0;
   u_strToUTF8(converted.data(), converted.length(), &bytes_written,
               utf16_string.data(), utf16_string.length(), &icu_status);
   // Shrink to the length reported by ICU.
   converted.resize(bytes_written);

   if (!U_FAILURE(icu_status)) {
     return converted;
   }
   return absl_ports::InternalError("Failed to convert UTF16 string to UTF8");
 }

 libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
     std::string_view utf8_string) {
   // The UTF-16 form is never longer (in code units) than the UTF-8 form is in
   // bytes, so the input length is a sufficient capacity.
   //
   // The string is sized up front (the equivalent of resize(), not reserve())
   // because values can't be set at positions after length().
   std::u16string converted(utf8_string.length(), u'\0');

   UErrorCode icu_status = U_ZERO_ERROR;
   int units_written = 0;
   u_strFromUTF8(converted.data(), converted.length(), &units_written,
                 utf8_string.data(), utf8_string.length(), &icu_status);
   // Shrink to the length reported by ICU.
   converted.resize(units_written);

   if (!U_FAILURE(icu_status)) {
     return converted;
   }
   return absl_ports::InternalError(absl_ports::StrCat(
       "Failed to convert UTF8 string '", utf8_string, "' to UTF16"));
 }

 UChar32 GetUChar32At(const char* data, int length, int position) {
   // U8_NEXT_OR_FFFD decodes the code point that starts at data[position] and
   // advances `position`; only the local copy of the offset is modified.
   // NOTE(review): the macro itself does not range-check `position`, so
   // callers are presumably expected to pass a valid offset - confirm.
   UChar32 code_point;
   U8_NEXT_OR_FFFD(data, position, length, code_point);
   return code_point;
 }

 void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
   // No string to work on.
   if (str == nullptr) {
     return;
   }
   // Already short enough. The int is compared against an unsigned length, so
   // a negative truncate_to_length also ends up on this path.
   if (truncate_to_length >= str->length()) {
     return;
   }

   // Back up to a character boundary so no multi-byte sequence is cut in half.
   const int safe_length =
       SafeTruncateUtf8Length(str->c_str(), truncate_to_length);
   str->resize(safe_length);
 }

 int SafeTruncateUtf8Length(const char* str, int desired_length) {
   // Walk backwards from desired_length until the byte at that index starts a
   // character (ASCII or a UTF-8 lead byte); cutting right before such a byte
   // keeps every preceding character intact. Stops at 0 at the latest.
   int safe_length = desired_length;
   while (safe_length > 0 && !IsLeadUtf8Byte(str[safe_length])) {
     --safe_length;
   }
   return safe_length;
 }

 // True when the byte `c` is a single-byte (ASCII) UTF-8 character.
 bool IsAscii(char c) {
   const auto byte = static_cast<uint8_t>(c);
   return U8_IS_SINGLE(byte);
 }

 // True when the code point `c` is encoded as exactly one UTF-8 byte.
 bool IsAscii(UChar32 c) {
   const int utf8_length = U8_LENGTH(c);
   return utf8_length == 1;
 }

 // Number of UTF-8 bytes ICU's U8_LENGTH reports for the code point `c`.
 int GetUtf8Length(UChar32 c) {
   const int utf8_length = U8_LENGTH(c);
   return utf8_length;
 }

 // Number of UTF-16 code units ICU's U16_LENGTH reports for the code point
 // `c`.
 int GetUtf16Length(UChar32 c) {
   const int utf16_length = U16_LENGTH(c);
   return utf16_length;
 }

 // True when `c` begins a character: either an ASCII byte or the lead byte of
 // a multi-byte UTF-8 sequence.
 bool IsLeadUtf8Byte(char c) {
   if (IsAscii(c)) {
     return true;
   }
   return U8_IS_LEAD(static_cast<uint8_t>(c));
 }

 bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
   if (IsAscii(input[position])) {
     if (char_len_out != nullptr) {
       *char_len_out = 1;
     }
     return ascii_icu_punctuation.find(input[position]) != std::string::npos;
   }
   UChar32 c = GetUChar32At(input.data(), input.length(), position);
   if (char_len_out != nullptr) {
     *char_len_out = U8_LENGTH(c);
   }
   return u_ispunct(c);
 }

 bool IsWhitespaceAt(std::string_view input, int position) {
   if (IsAscii(input[position])) {
     return std::isspace(input[position]);
   }
   UChar32 c = GetUChar32At(input.data(), input.length(), position);
   return u_isUWhiteSpace(c);
 }

 bool IsAlphabeticAt(std::string_view input, int position) {
   if (IsAscii(input[position])) {
     return std::isalpha(input[position]);
   }
   UChar32 c = GetUChar32At(input.data(), input.length(), position);
   return u_isUAlphabetic(c);
 }

 // Encodes `uchar` as UTF-8 and appends the bytes to `*utf8_string`. If the
 // encoder reports an error, a warning is logged and `*utf8_string` is left
 // unchanged.
 void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) {
   uint8_t utf8_buffer[4];  // U8_APPEND writes 0 to 4 bytes

   int utf8_index = 0;
   // Use the C++ keyword rather than ICU's FALSE macro: ICU 68 and later no
   // longer define TRUE/FALSE by default.
   UBool has_error = false;

   // utf8_index is advanced to the end of the contents if successful
   U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);

   if (has_error) {
     ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string.";
     return;
   }
   utf8_string->append(reinterpret_cast<const char*>(utf8_buffer), utf8_index);
 }

 }  // namespace i18n_utils
 }  // namespace lib
 }  // namespace icing
	// Copyright (C) 2019 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "icing/util/i18n-utils.h"

	#include <cctype>
	#include <string_view>

	#include "icing/text_classifier/lib3/utils/base/statusor.h"
	#include "icing/absl_ports/canonical_errors.h"
	#include "icing/absl_ports/str_cat.h"
	#include "icing/util/logging.h"
	#include "unicode/uchar.h"
	#include "unicode/umachine.h"
	#include "unicode/ustring.h"
	#include "unicode/utf16.h"
	#include "unicode/utf8.h"
	#include "unicode/utypes.h"

	namespace icing {
	namespace lib {
	namespace i18n_utils {

	namespace {

	// All ASCII punctuation that is also in a Unicode Punctuation category
	// (https://www.fileformat.info/info/unicode/category/index.htm). std::ispunct
	// and u_ispunct do not agree on which characters are punctuation, so the
	// ASCII subset that both accept is spelled out explicitly here.
	//
	// This is a constexpr std::string_view over a string literal so that the
	// table is a compile-time constant with a trivial destructor, instead of a
	// std::string global that is constructed and destroyed at runtime.
	constexpr std::string_view ascii_icu_punctuation =
	    "!\"#%&'*,./:;?@\\_-([{}])";

	}  // namespace

	libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
	    const std::u16string& utf16_string) {
	  // Worst case output size: 3 UTF-8 bytes per UTF-16 code unit, plus one for
	  // the terminating NUL.
	  //
	  // The string is sized up front (the equivalent of resize(), not reserve())
	  // because values can't be set at positions after length().
	  const auto max_utf8_bytes = utf16_string.length() * 3 + 1;
	  std::string converted(max_utf8_bytes, '\0');

	  UErrorCode icu_status = U_ZERO_ERROR;
	  int bytes_written = 0;
	  u_strToUTF8(converted.data(), converted.length(), &bytes_written,
	              utf16_string.data(), utf16_string.length(), &icu_status);
	  // Shrink to the length reported by ICU.
	  converted.resize(bytes_written);

	  if (!U_FAILURE(icu_status)) {
	    return converted;
	  }
	  return absl_ports::InternalError("Failed to convert UTF16 string to UTF8");
	}

	libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
	    std::string_view utf8_string) {
	  // The UTF-16 form is never longer (in code units) than the UTF-8 form is in
	  // bytes, so the input length is a sufficient capacity.
	  //
	  // The string is sized up front (the equivalent of resize(), not reserve())
	  // because values can't be set at positions after length().
	  std::u16string converted(utf8_string.length(), u'\0');

	  UErrorCode icu_status = U_ZERO_ERROR;
	  int units_written = 0;
	  u_strFromUTF8(converted.data(), converted.length(), &units_written,
	                utf8_string.data(), utf8_string.length(), &icu_status);
	  // Shrink to the length reported by ICU.
	  converted.resize(units_written);

	  if (!U_FAILURE(icu_status)) {
	    return converted;
	  }
	  return absl_ports::InternalError(absl_ports::StrCat(
	      "Failed to convert UTF8 string '", utf8_string, "' to UTF16"));
	}

	UChar32 GetUChar32At(const char* data, int length, int position) {
	  // U8_NEXT_OR_FFFD decodes the code point that starts at data[position] and
	  // advances `position`; only the local copy of the offset is modified.
	  // NOTE(review): the macro itself does not range-check `position`, so
	  // callers are presumably expected to pass a valid offset - confirm.
	  UChar32 code_point;
	  U8_NEXT_OR_FFFD(data, position, length, code_point);
	  return code_point;
	}

	// Truncates `*str` to at most `truncate_to_length` bytes without splitting a
	// multi-byte UTF-8 character. Does nothing when `str` is null or the string
	// is already short enough.
	void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
	  // The int is compared against an unsigned length, so a negative
	  // truncate_to_length also takes the early return.
	  if (str == nullptr || truncate_to_length >= str->length()) {
	    return;
	  }

	  // Back up to a character boundary before cutting.
	  str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length));
	}

	int SafeTruncateUtf8Length(const char* str, int desired_length) {
	  // Walk backwards from desired_length until the byte at that index starts a
	  // character (ASCII or a UTF-8 lead byte); cutting right before such a byte
	  // keeps every preceding character intact. Stops at 0 at the latest.
	  int safe_length = desired_length;
	  while (safe_length > 0 && !IsLeadUtf8Byte(str[safe_length])) {
	    --safe_length;
	  }
	  return safe_length;
	}

	// True when the byte `c` is a single-byte (ASCII) UTF-8 character.
	bool IsAscii(char c) {
	  const auto byte = static_cast<uint8_t>(c);
	  return U8_IS_SINGLE(byte);
	}

	// True when the code point `c` is encoded as exactly one UTF-8 byte.
	bool IsAscii(UChar32 c) {
	  const int utf8_length = U8_LENGTH(c);
	  return utf8_length == 1;
	}

	// Number of UTF-8 bytes ICU's U8_LENGTH reports for the code point `c`.
	int GetUtf8Length(UChar32 c) {
	  const int utf8_length = U8_LENGTH(c);
	  return utf8_length;
	}

	// Number of UTF-16 code units ICU's U16_LENGTH reports for the code point
	// `c`.
	int GetUtf16Length(UChar32 c) {
	  const int utf16_length = U16_LENGTH(c);
	  return utf16_length;
	}

	// True when `c` begins a character: either an ASCII byte or the lead byte of
	// a multi-byte UTF-8 sequence.
	bool IsLeadUtf8Byte(char c) {
	  return IsAscii(c) || U8_IS_LEAD(static_cast<uint8_t>(c));
	}

	bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
	if (IsAscii(input[position])) {
	if (char_len_out != nullptr) {
	*char_len_out = 1;
	}
	return ascii_icu_punctuation.find(input[position]) != std::string::npos;
	}
	UChar32 c = GetUChar32At(input.data(), input.length(), position);
	if (char_len_out != nullptr) {
	*char_len_out = U8_LENGTH(c);
	}
	return u_ispunct(c);
	}

	bool IsWhitespaceAt(std::string_view input, int position) {
	if (IsAscii(input[position])) {
	return std::isspace(input[position]);
	}
	UChar32 c = GetUChar32At(input.data(), input.length(), position);
	return u_isUWhiteSpace(c);
	}

	bool IsAlphabeticAt(std::string_view input, int position) {
	if (IsAscii(input[position])) {
	return std::isalpha(input[position]);
	}
	UChar32 c = GetUChar32At(input.data(), input.length(), position);
	return u_isUAlphabetic(c);
	}

	// Encodes `uchar` as UTF-8 and appends the bytes to `*utf8_string`. If the
	// encoder reports an error, a warning is logged and `*utf8_string` is left
	// unchanged.
	void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) {
	  uint8_t utf8_buffer[4];  // U8_APPEND writes 0 to 4 bytes

	  int utf8_index = 0;
	  // Use the C++ keyword rather than ICU's FALSE macro: ICU 68 and later no
	  // longer define TRUE/FALSE by default.
	  UBool has_error = false;

	  // utf8_index is advanced to the end of the contents if successful
	  U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);

	  if (has_error) {
	    ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string.";
	    return;
	  }
	  utf8_string->append(reinterpret_cast<const char*>(utf8_buffer), utf8_index);
	}

	} // namespace i18n_utils
	} // namespace lib
	} // namespace icing