blob: bd41b51b1a99886a72f8d9cf159c01e7c9e76b65 [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/index.h"
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator-or.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
namespace {
libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions(
const Index::Options& options) {
if (options.index_merge_size <= 0) {
return absl_ports::InvalidArgumentError(
"Requested hit buffer size must be greater than 0.");
}
if (options.index_merge_size > LiteIndex::max_hit_buffer_size()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Requested hit buffer size %d is too large.",
options.index_merge_size));
}
return LiteIndex::Options(options.base_dir + "/idx/lite.",
options.index_merge_size);
}
std::string MakeMainIndexFilepath(const std::string& base_dir) {
return base_dir + "/idx/main";
}
IcingDynamicTrie::Options GetMainLexiconOptions() {
// The default values for IcingDynamicTrie::Options is fine for the main
// lexicon.
return IcingDynamicTrie::Options();
}
// Helper function to check if a term is in the given namespaces.
// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
bool IsTermInNamespaces(
const IcingDynamicTrie::PropertyReadersAll& property_reader,
uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
if (namespace_ids.empty()) {
return true;
}
for (NamespaceId namespace_id : namespace_ids) {
if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
value_index)) {
return true;
}
}
return false;
}
enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
std::vector<TermMetadata> MergeTermMetadatas(
std::vector<TermMetadata> lite_term_metadata_list,
std::vector<TermMetadata> main_term_metadata_list, int num_to_return) {
std::vector<TermMetadata> merged_term_metadata_list;
merged_term_metadata_list.reserve(
std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(),
static_cast<size_t>(num_to_return)));
auto lite_term_itr = lite_term_metadata_list.begin();
auto main_term_itr = main_term_metadata_list.begin();
MergeAction merge_action;
while (merged_term_metadata_list.size() < num_to_return &&
(lite_term_itr != lite_term_metadata_list.end() ||
main_term_itr != main_term_metadata_list.end())) {
// Get pointers to the next metadatas in each group, if available
// Determine how to merge.
if (main_term_itr == main_term_metadata_list.end()) {
merge_action = MergeAction::kTakeLiteTerm;
} else if (lite_term_itr == lite_term_metadata_list.end()) {
merge_action = MergeAction::kTakeMainTerm;
} else if (lite_term_itr->content < main_term_itr->content) {
merge_action = MergeAction::kTakeLiteTerm;
} else if (main_term_itr->content < lite_term_itr->content) {
merge_action = MergeAction::kTakeMainTerm;
} else {
// The next metadatas refer to the same term. Combine them.
merge_action = MergeAction::kMergeTerms;
}
switch (merge_action) {
case MergeAction::kTakeLiteTerm:
merged_term_metadata_list.push_back(std::move(*lite_term_itr));
++lite_term_itr;
break;
case MergeAction::kTakeMainTerm:
merged_term_metadata_list.push_back(std::move(*main_term_itr));
++main_term_itr;
break;
case MergeAction::kMergeTerms:
int total_est_hit_count =
lite_term_itr->hit_count + main_term_itr->hit_count;
merged_term_metadata_list.emplace_back(
std::move(lite_term_itr->content), total_est_hit_count);
++lite_term_itr;
++main_term_itr;
break;
}
}
return merged_term_metadata_list;
}
} // namespace
libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
const Options& options, const Filesystem* filesystem,
const IcingFilesystem* icing_filesystem) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
ICING_ASSIGN_OR_RETURN(LiteIndex::Options lite_index_options,
CreateLiteIndexOptions(options));
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<TermIdCodec> term_id_codec,
TermIdCodec::Create(
IcingDynamicTrie::max_value_index(GetMainLexiconOptions()),
IcingDynamicTrie::max_value_index(
lite_index_options.lexicon_options)));
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(lite_index_options, icing_filesystem));
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem,
icing_filesystem));
return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec),
std::move(lite_index),
std::move(main_index)));
}
libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) {
if (lite_index_->last_added_document_id() != kInvalidDocumentId &&
lite_index_->last_added_document_id() > document_id) {
ICING_VLOG(1) << "Clipping to " << document_id
<< ". Throwing out lite index which is at "
<< lite_index_->last_added_document_id();
ICING_RETURN_IF_ERROR(lite_index_->Reset());
}
if (main_index_->last_added_document_id() != kInvalidDocumentId &&
main_index_->last_added_document_id() > document_id) {
ICING_VLOG(1) << "Clipping to " << document_id
<< ". Throwing out lite index which is at "
<< main_index_->last_added_document_id();
ICING_RETURN_IF_ERROR(main_index_->Reset());
}
return libtextclassifier3::Status::OK;
}
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
TermMatchType::Code term_match_type) {
std::unique_ptr<DocHitInfoIterator> lite_itr;
std::unique_ptr<DocHitInfoIterator> main_itr;
switch (term_match_type) {
case TermMatchType::EXACT_ONLY:
lite_itr = std::make_unique<DocHitInfoIteratorTermLiteExact>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
main_itr = std::make_unique<DocHitInfoIteratorTermMainExact>(
main_index_.get(), term, section_id_mask);
break;
case TermMatchType::PREFIX:
lite_itr = std::make_unique<DocHitInfoIteratorTermLitePrefix>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
main_itr = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
main_index_.get(), term, section_id_mask);
break;
default:
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("Invalid TermMatchType: ",
TermMatchType::Code_Name(term_match_type)));
}
return std::make_unique<DocHitInfoIteratorOr>(std::move(lite_itr),
std::move(main_itr));
}
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindLiteTermsByPrefix(const std::string& prefix,
const std::vector<NamespaceId>& namespace_ids,
int num_to_return) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
prefix.c_str());
// A property reader to help check if a term has some property.
IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon());
std::vector<TermMetadata> term_metadata_list;
while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) {
uint32_t term_value_index = term_iterator.GetValueIndex();
// Skips the terms that don't exist in the given namespaces. We won't skip
// any terms if namespace_ids is empty.
if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
term_iterator.Advance();
continue;
}
ICING_ASSIGN_OR_RETURN(
uint32_t term_id,
term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
absl_ports::InternalError("Failed to access terms in lexicon."));
term_metadata_list.emplace_back(term_iterator.GetKey(),
lite_index_->CountHits(term_id));
term_iterator.Advance();
}
if (term_iterator.IsValid()) {
// We exited the loop above because we hit the num_to_return limit.
ICING_LOG(WARNING) << "Ran into limit of " << num_to_return
<< " retrieving suggestions for " << prefix
<< ". Some suggestions may not be returned and others "
"may be misranked.";
}
return term_metadata_list;
}
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindTermsByPrefix(const std::string& prefix,
const std::vector<NamespaceId>& namespace_ids,
int num_to_return) {
std::vector<TermMetadata> term_metadata_list;
if (num_to_return <= 0) {
return term_metadata_list;
}
// Get results from the LiteIndex.
ICING_ASSIGN_OR_RETURN(
std::vector<TermMetadata> lite_term_metadata_list,
FindLiteTermsByPrefix(prefix, namespace_ids, num_to_return));
// Append results from the MainIndex.
ICING_ASSIGN_OR_RETURN(
std::vector<TermMetadata> main_term_metadata_list,
main_index_->FindTermsByPrefix(prefix, namespace_ids, num_to_return));
return MergeTermMetadatas(std::move(lite_term_metadata_list),
std::move(main_term_metadata_list), num_to_return);
}
libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) {
// Step 1: See if this term is already in the lexicon
uint32_t tvi;
auto tvi_or = lite_index_->GetTermId(term);
// Step 2: Update the lexicon, either add the term or update its properties
if (tvi_or.ok()) {
tvi = tvi_or.ValueOrDie();
if (seen_tokens_.find(tvi) != seen_tokens_.end()) {
ICING_VLOG(1) << "Updating term frequency for term " << term;
if (seen_tokens_[tvi] != Hit::kMaxTermFrequency) {
++seen_tokens_[tvi];
}
return libtextclassifier3::Status::OK;
}
ICING_VLOG(1) << "Term " << term
<< " is already present in lexicon. Updating.";
// Already in the lexicon. Just update the properties.
ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_));
} else {
ICING_VLOG(1) << "Term " << term << " is not in lexicon. Inserting.";
// Haven't seen this term before. Add it to the lexicon.
ICING_ASSIGN_OR_RETURN(
tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_));
}
// Token seen for the first time in the current document.
seen_tokens_[tvi] = 1;
return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() {
for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) {
Hit hit(section_id_, document_id_, /*term_frequency=*/itr->second,
term_match_type_ == TermMatchType::PREFIX);
ICING_ASSIGN_OR_RETURN(
uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE));
ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit));
}
return libtextclassifier3::Status::OK;
}
} // namespace lib
} // namespace icing