blob: a60764df399381ccb827f4e40e5b51a9a2d80f12 [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include <cstdint>
#include <memory>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/main/posting-list-accessor.h"
#include "icing/index/main/posting-list-identifier.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
namespace {
std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
std::string mask(kMaxSectionId + 1, '0');
for (SectionId i = kMaxSectionId; i >= 0; --i) {
if (section_id_mask & (1U << i)) {
mask[kMaxSectionId - i] = '1';
}
}
return mask;
}
} // namespace
libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
if (posting_list_accessor_ == nullptr ||
cached_doc_hit_infos_idx_ == (cached_doc_hit_infos_.size() - 2)) {
// If we haven't retrieved any hits before or we've already returned all but
// the last cached hit, then go get some more!
// We hold back the last cached hit because it could have more hits on the
// next posting list in the chain.
libtextclassifier3::Status status = RetrieveMoreHits();
if (!status.ok()) {
if (!absl_ports::IsNotFound(status)) {
// NOT_FOUND is expected to happen (not every term will be in the main
// index!). Other errors are worth logging.
ICING_LOG(ERROR) << "Failed to retrieve more hits "
<< status.error_message();
}
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
} else {
++cached_doc_hit_infos_idx_;
}
if (cached_doc_hit_infos_idx_ == -1 ||
cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
// Nothing more for the iterator to return. Set these members to invalid
// values.
doc_hit_info_ = DocHitInfo();
hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_);
hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() {
DocHitInfo last_doc_hit_info;
if (!cached_doc_hit_infos_.empty()) {
last_doc_hit_info = cached_doc_hit_infos_.back();
}
cached_doc_hit_infos_idx_ = 0;
cached_doc_hit_infos_.clear();
if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
// Carry over the last hit. It might need to be merged with the first hit of
// of the next posting list in the chain.
cached_doc_hit_infos_.push_back(last_doc_hit_info);
}
if (posting_list_accessor_ == nullptr) {
ICING_ASSIGN_OR_RETURN(posting_list_accessor_,
main_index_->GetAccessorForExactTerm(term_));
}
ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
posting_list_accessor_->GetNextHitsBatch());
++num_blocks_inspected_;
cached_doc_hit_infos_.reserve(hits.size() + 1);
for (const Hit& hit : hits) {
// Check sections.
if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
continue;
}
// We want exact hits, skip prefix-only hits.
if (hit.is_prefix_hit()) {
continue;
}
if (cached_doc_hit_infos_.empty() ||
hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
}
cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
}
return libtextclassifier3::Status::OK;
}
std::string DocHitInfoIteratorTermMainExact::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_);
}
libtextclassifier3::Status
DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() {
DocHitInfo last_doc_hit_info;
if (!cached_doc_hit_infos_.empty()) {
last_doc_hit_info = cached_doc_hit_infos_.back();
}
cached_doc_hit_infos_idx_ = 0;
cached_doc_hit_infos_.clear();
if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
// Carry over the last hit. It might need to be merged with the first hit of
// of the next posting list in the chain.
cached_doc_hit_infos_.push_back(last_doc_hit_info);
}
++num_blocks_inspected_;
if (posting_list_accessor_ == nullptr) {
ICING_ASSIGN_OR_RETURN(
MainIndex::GetPrefixAccessorResult result,
main_index_->GetAccessorForPrefixTerm(term_));
posting_list_accessor_ = std::move(result.accessor);
exact_ = result.exact;
}
ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
posting_list_accessor_->GetNextHitsBatch());
cached_doc_hit_infos_.reserve(hits.size());
for (const Hit& hit : hits) {
// Check sections.
if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
continue;
}
// If we only want hits from prefix sections.
if (!exact_ && !hit.is_in_prefix_section()) {
continue;
}
if (cached_doc_hit_infos_.empty() ||
hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
}
cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
}
return libtextclassifier3::Status::OK;
}
std::string DocHitInfoIteratorTermMainPrefix::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_, "*");
}
} // namespace lib
} // namespace icing