blob: e68fbd2485e27e7520c01780587a4104697a8e52 [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/scoring/ranker.h"
#include <algorithm>
#include <vector>
#include "base/logging.h"
#include "utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/scoring/scored-document-hit.h"
#include "icing/store/document-id.h"
#include "icing/util/logging.h"
namespace icing {
namespace lib {
namespace {
// For all the heap manipulations in this file, we use a vector to represent the
// heap. The element at index 0 is the root node. For any node at index i, its
// left child node is at 2 * i + 1, its right child node is at 2 * i + 2.
// Helper function to wrap the heapify algorithm, it heapifies the target
// subtree node in place.
void Heapify(std::vector<ScoredDocumentHit>* scored_document_hits,
int target_subtree_root_index,
const ScoredDocumentHitComparator scored_document_hit_comparator) {
const int heap_size = scored_document_hits->size();
if (target_subtree_root_index >= heap_size) {
return;
}
// Initializes subtree root as the current best node.
int best = target_subtree_root_index;
// If we represent a heap in an array/vector, indices of left and right
// children can be calculated.
const int left = target_subtree_root_index * 2 + 1;
const int right = target_subtree_root_index * 2 + 2;
// If left child is better than current best
if (left < heap_size &&
scored_document_hit_comparator(scored_document_hits->at(left),
scored_document_hits->at(best))) {
best = left;
}
// If right child is better than current best
if (right < heap_size &&
scored_document_hit_comparator(scored_document_hits->at(right),
scored_document_hits->at(best))) {
best = right;
}
// If the best is not the subtree root, swap and continue heapifying the lower
// level subtree
if (best != target_subtree_root_index) {
std::swap(scored_document_hits->at(best),
scored_document_hits->at(target_subtree_root_index));
Heapify(scored_document_hits, best, scored_document_hit_comparator);
}
}
// Helper function to build a heap in place whose root is the best node defined
// by scored_document_hit_comparator. Time complexity is O(n).
void BuildHeap(
std::vector<ScoredDocumentHit>* scored_document_hits,
const ScoredDocumentHitComparator scored_document_hit_comparator) {
const int heap_size = scored_document_hits->size();
// Since we use a vector to represent the heap, [size / 2 - 1] is the index
// of the parent node of the last node.
for (int subtree_root_index = heap_size / 2 - 1; subtree_root_index >= 0;
subtree_root_index--) {
Heapify(scored_document_hits, subtree_root_index,
scored_document_hit_comparator);
}
}
// Helper function to extract the root from the heap. The heap structure will be
// maintained.
//
// Returns:
// The current root element on success
// RESOURCE_EXHAUSTED_ERROR if heap is empty
libtextclassifier3::StatusOr<ScoredDocumentHit> ExtractRoot(
std::vector<ScoredDocumentHit>* scored_document_hits,
ScoredDocumentHitComparator scored_document_hit_comparator) {
if (scored_document_hits->empty()) {
// An invalid ScoredDocumentHit
return absl_ports::ResourceExhaustedError("Heap is empty");
}
// Steps to extract root from heap:
// 1. copy out root
ScoredDocumentHit root = scored_document_hits->at(0);
const size_t last_node_index = scored_document_hits->size() - 1;
// 2. swap root and the last node
std::swap(scored_document_hits->at(0),
scored_document_hits->at(last_node_index));
// 3. remove last node
scored_document_hits->pop_back();
// 4. heapify root
Heapify(scored_document_hits, /*target_subtree_root_index=*/0,
scored_document_hit_comparator);
return root;
}
std::vector<ScoredDocumentHit> HeapifyAndProduceTopN(
std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
bool is_descending) {
// Build a heap in place
const ScoredDocumentHitComparator scored_document_hit_comparator(
is_descending);
BuildHeap(&scored_document_hits, scored_document_hit_comparator);
// Get best nodes from heap one by one
std::vector<ScoredDocumentHit> scored_document_hit_result;
int result_size =
std::min(num_result, static_cast<int>(scored_document_hits.size()));
while (result_size-- > 0) {
libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
ExtractRoot(&scored_document_hits, scored_document_hit_comparator);
if (next_best_document_hit_or.ok()) {
scored_document_hit_result.push_back(
std::move(next_best_document_hit_or).ValueOrDie());
} else {
ICING_VLOG(1) << next_best_document_hit_or.status().error_message();
}
}
return scored_document_hit_result;
}
} // namespace
std::vector<ScoredDocumentHit> GetTopNFromScoredDocumentHits(
std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
bool is_descending) {
return HeapifyAndProduceTopN(std::move(scored_document_hits), num_result,
is_descending);
}
} // namespace lib
} // namespace icing