icing/index/lite/doc-hit-info-iterator-term-lite.h - platform/external/icing - Gitiles

 // Copyright (C) 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_

 #include <array>
 #include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>

 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/iterator/doc-hit-info-iterator.h"
 #include "icing/index/lite/lite-index.h"
 #include "icing/index/term-id-codec.h"
 #include "icing/schema/section.h"

 namespace icing {
 namespace lib {

 class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
  public:
   explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
                                       LiteIndex* lite_index,
                                       const std::string& term,
                                       SectionIdMask section_restrict_mask)
       : term_(term),
         lite_index_(lite_index),
         cached_hits_idx_(-1),
         term_id_codec_(term_id_codec),
         num_advance_calls_(0),
         section_restrict_mask_(section_restrict_mask) {}

   libtextclassifier3::Status Advance() override;

   int32_t GetNumBlocksInspected() const override {
     // TODO(b/137862424): Implement this once the main index is added.
     return 0;
   }
   int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }

   void PopulateMatchedTermsStats(
       std::vector<TermMatchInfo>* matched_terms_stats,
       SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
     if (doc_hit_info_.document_id() == kInvalidDocumentId) {
       // Current hit isn't valid, return.
       return;
     }
     SectionIdMask section_mask =
         doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
     SectionIdMask section_mask_copy = section_mask;
     std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
         Hit::kNoTermFrequency};
     while (section_mask_copy) {
       SectionId section_id = __builtin_ctz(section_mask_copy);
       section_term_frequencies.at(section_id) =
           doc_hit_info_.hit_term_frequency(section_id);
       section_mask_copy &= ~(1u << section_id);
     }
     TermMatchInfo term_stats(term_, section_mask,
                              std::move(section_term_frequencies));

     for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
       if (cur_term_stats.term == term_stats.term) {
         // Same docId and same term, we don't need to add the term and the term
         // frequency should always be the same
         return;
       }
     }
     matched_terms_stats->push_back(std::move(term_stats));
   }

  protected:
   // Add DocHitInfos corresponding to term_ to cached_hits_.
   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;

   const std::string term_;
   LiteIndex* const lite_index_;
   // Stores hits retrieved from the index. This may only be a subset of the hits
   // that are present in the index. Current value pointed to by the Iterator is
   // tracked by cached_hits_idx_.
   std::vector<DocHitInfo> cached_hits_;
   int cached_hits_idx_;
   const TermIdCodec* term_id_codec_;
   int num_advance_calls_;
   // Mask indicating which sections hits should be considered for.
   // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
   const SectionIdMask section_restrict_mask_;
 };

 // Concrete DocHitInfoIteratorTermLite. Judging by the name this is the
 // exact-match flavour; the actual behaviour lives in the out-of-line
 // definitions of ToString() and RetrieveMoreHits().
 class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
  public:
   // Passes every argument straight through to the base-class constructor.
   explicit DocHitInfoIteratorTermLiteExact(
       const TermIdCodec* term_id_codec, LiteIndex* lite_index,
       const std::string& term, SectionIdMask section_id_mask)
       : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
                                    section_id_mask) {}

   // Defined out of line.
   std::string ToString() const override;

  protected:
   // Implements the base-class hook that adds DocHitInfos for the term to
   // cached_hits_. Defined out of line.
   libtextclassifier3::Status RetrieveMoreHits() override;
 };

 // Concrete DocHitInfoIteratorTermLite. Judging by the name this is the
 // prefix-match flavour; the actual behaviour lives in the out-of-line
 // definitions of ToString(), RetrieveMoreHits() and
 // SortAndDedupeDocumentIds().
 class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
  public:
   // Passes every argument straight through to the base-class constructor.
   explicit DocHitInfoIteratorTermLitePrefix(
       const TermIdCodec* term_id_codec, LiteIndex* lite_index,
       const std::string& term, SectionIdMask section_id_mask)
       : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
                                    section_id_mask) {}

   // Defined out of line.
   std::string ToString() const override;

  protected:
   // Implements the base-class hook that adds DocHitInfos for the term to
   // cached_hits_. Defined out of line.
   libtextclassifier3::Status RetrieveMoreHits() override;

  private:
   // After retrieving DocHitInfos from the index there may be several entries
   // for one document - e.g. a DocHitInfo for docid 1 and "foo" plus a
   // DocHitInfo for docid 1 and "fool". Such DocHitInfos should be merged.
   void SortAndDedupeDocumentIds();
 };

 }  // namespace lib
 }  // namespace icing

 #endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
	// Copyright (C) 2019 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
	#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_

	#include <cstdint>
	#include <vector>

	#include "icing/text_classifier/lib3/utils/base/status.h"
	#include "icing/index/hit/doc-hit-info.h"
	#include "icing/index/iterator/doc-hit-info-iterator.h"
	#include "icing/index/lite/lite-index.h"
	#include "icing/index/term-id-codec.h"
	#include "icing/schema/section.h"

	namespace icing {
	namespace lib {

	class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
	public:
	explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
	LiteIndex* lite_index,
	const std::string& term,
	SectionIdMask section_restrict_mask)
	: term_(term),
	lite_index_(lite_index),
	cached_hits_idx_(-1),
	term_id_codec_(term_id_codec),
	num_advance_calls_(0),
	section_restrict_mask_(section_restrict_mask) {}

	libtextclassifier3::Status Advance() override;

	int32_t GetNumBlocksInspected() const override {
	// TODO(b/137862424): Implement this once the main index is added.
	return 0;
	}
	int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }

	void PopulateMatchedTermsStats(
	std::vector<TermMatchInfo>* matched_terms_stats,
	SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
	if (doc_hit_info_.document_id() == kInvalidDocumentId) {
	// Current hit isn't valid, return.
	return;
	}
	SectionIdMask section_mask =
	doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
	SectionIdMask section_mask_copy = section_mask;
	std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
	Hit::kNoTermFrequency};
	while (section_mask_copy) {
	SectionId section_id = __builtin_ctz(section_mask_copy);
	section_term_frequencies.at(section_id) =
	doc_hit_info_.hit_term_frequency(section_id);
	section_mask_copy &= ~(1u << section_id);
	}
	TermMatchInfo term_stats(term_, section_mask,
	std::move(section_term_frequencies));

	for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
	if (cur_term_stats.term == term_stats.term) {
	// Same docId and same term, we don't need to add the term and the term
	// frequency should always be the same
	return;
	}
	}
	matched_terms_stats->push_back(std::move(term_stats));
	}

	protected:
	// Add DocHitInfos corresponding to term_ to cached_hits_.
	virtual libtextclassifier3::Status RetrieveMoreHits() = 0;

	const std::string term_;
	LiteIndex* const lite_index_;
	// Stores hits retrieved from the index. This may only be a subset of the hits
	// that are present in the index. Current value pointed to by the Iterator is
	// tracked by cached_hits_idx_.
	std::vector<DocHitInfo> cached_hits_;
	int cached_hits_idx_;
	const TermIdCodec* term_id_codec_;
	int num_advance_calls_;
	// Mask indicating which sections hits should be considered for.
	// Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
	const SectionIdMask section_restrict_mask_;
	};

	// Concrete DocHitInfoIteratorTermLite. Judging by the name this is the
	// exact-match flavour; the actual behaviour lives in the out-of-line
	// definitions of ToString() and RetrieveMoreHits().
	class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
	 public:
	  // Passes every argument straight through to the base-class constructor.
	  explicit DocHitInfoIteratorTermLiteExact(
	      const TermIdCodec* term_id_codec, LiteIndex* lite_index,
	      const std::string& term, SectionIdMask section_id_mask)
	      : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
	                                   section_id_mask) {}

	  // Defined out of line.
	  std::string ToString() const override;

	 protected:
	  // Implements the base-class hook that adds DocHitInfos for the term to
	  // cached_hits_. Defined out of line.
	  libtextclassifier3::Status RetrieveMoreHits() override;
	};

	// Concrete DocHitInfoIteratorTermLite. Judging by the name this is the
	// prefix-match flavour; the actual behaviour lives in the out-of-line
	// definitions of ToString(), RetrieveMoreHits() and
	// SortAndDedupeDocumentIds().
	class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
	 public:
	  // Passes every argument straight through to the base-class constructor.
	  explicit DocHitInfoIteratorTermLitePrefix(
	      const TermIdCodec* term_id_codec, LiteIndex* lite_index,
	      const std::string& term, SectionIdMask section_id_mask)
	      : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
	                                   section_id_mask) {}

	  // Defined out of line.
	  std::string ToString() const override;

	 protected:
	  // Implements the base-class hook that adds DocHitInfos for the term to
	  // cached_hits_. Defined out of line.
	  libtextclassifier3::Status RetrieveMoreHits() override;

	 private:
	  // After retrieving DocHitInfos from the index there may be several entries
	  // for one document - e.g. a DocHitInfo for docid 1 and "foo" plus a
	  // DocHitInfo for docid 1 and "fool". Such DocHitInfos should be merged.
	  void SortAndDedupeDocumentIds();
	};

	} // namespace lib
	} // namespace icing

	#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_