icing/store/document-store.h - platform/external/icing - Gitiles

 // Copyright (C) 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef ICING_STORE_DOCUMENT_STORE_H_
 #define ICING_STORE_DOCUMENT_STORE_H_

 #include <cstdint>
 #include <memory>
 #include <string>
 #include <string_view>
 #include <vector>

 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/file/file-backed-proto-log.h"
 #include "icing/file/file-backed-vector.h"
 #include "icing/file/filesystem.h"
 #include "icing/proto/document.pb.h"
 #include "icing/proto/document_wrapper.pb.h"
 #include "icing/proto/logging.pb.h"
 #include "icing/proto/optimize.pb.h"
 #include "icing/proto/persist.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/schema/schema-store.h"
 #include "icing/store/corpus-associated-scoring-data.h"
 #include "icing/store/corpus-id.h"
 #include "icing/store/document-associated-score-data.h"
 #include "icing/store/document-filter-data.h"
 #include "icing/store/document-id.h"
 #include "icing/store/key-mapper.h"
 #include "icing/store/namespace-id.h"
 #include "icing/store/usage-store.h"
 #include "icing/tokenization/language-segmenter.h"
 #include "icing/util/clock.h"
 #include "icing/util/crc32.h"
 #include "icing/util/data-loss.h"
 #include "icing/util/document-validator.h"

 namespace icing {
 namespace lib {

 // Provides storage interfaces for documents.
 class DocumentStore {
  public:
   struct Header {
     static constexpr int32_t kMagic = 0x746f7265;

     // Holds the magic as a quick sanity check against file corruption.
     int32_t magic;

     // Checksum of the DocumentStore's sub-component's checksums.
     uint32_t checksum;
   };

   struct OptimizeInfo {
     // The estimated size in bytes of the optimizable docs. We don't track the
     // size of each document, so we estimate by taking the size of the entire
     // DocumentStore and dividing that by the total number of documents we have.
     // So we end up with an average document size.
     int64_t estimated_optimizable_bytes = 0;

     // Number of total documents the DocumentStore tracks.
     int32_t total_docs = 0;

     // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
     int32_t optimizable_docs = 0;
   };

   struct DeleteByGroupResult {
     // Status representing whether or not the operation succeeded. See the
     // comments above the function that returns this result to determine what
     // possible statuses could be returned.
     libtextclassifier3::Status status;

     int num_docs_deleted = 0;
   };

   struct CreateResult {
     // A successfully initialized document store.
     std::unique_ptr<DocumentStore> document_store;

     // The data status after initializing from a previous state. Data loss can
     // happen if the file is corrupted or some previously added data was
     // unpersisted. This may be used to signal that any derived data off of the
     // document store may need to be regenerated.
     DataLoss data_loss;
   };

   // Not copyable
   DocumentStore(const DocumentStore&) = delete;
   DocumentStore& operator=(const DocumentStore&) = delete;

   // Persists and updates checksum of subcomponents.
   ~DocumentStore();

   // Factory method to create, initialize, and return a DocumentStore. The base
   // directory is used to persist document store files. If document store was
   // previously initialized with this directory, it will reload the files saved
   // by the last instance.
   //
   // force_recovery_and_revalidate_documents=true will pre-emptively throw out
   // the derived files and validate each document while recreating them. This
   // can be used to indicate that the schema (and type ids) may have changed and
   // those changes might not have been applied to the document store.
   //
   // If initialize_stats is present, the fields related to DocumentStore will be
   // populated.
   //
   // Does not take any ownership, and all pointers except initialize_stats must
   // refer to valid objects that outlive the one constructed.
   //
   // TODO(cassiewang): Consider returning a status indicating that derived files
   // were regenerated. This may be helpful in logs.
   //
   // Returns:
   //   A DocumentStore::CreateResult on success
   //   FAILED_PRECONDITION on any null pointer input
   //   INTERNAL_ERROR on IO error
   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
       const Filesystem* filesystem, const std::string& base_dir,
       const Clock* clock, const SchemaStore* schema_store,
       bool force_recovery_and_revalidate_documents = false,
       InitializeStatsProto* initialize_stats = nullptr);

   // Returns the maximum DocumentId that the DocumentStore has assigned. If
   // there has not been any DocumentIds assigned, i.e. the DocumentStore is
   // empty, then kInvalidDocumentId is returned. This does not filter out
   // DocumentIds of deleted or expired documents.
   DocumentId last_added_document_id() const {
     if (document_id_mapper_->num_elements() == 0) {
       return kInvalidDocumentId;
     }
     return document_id_mapper_->num_elements() - 1;
   }

   // Returns the number of documents. The result does not filter out DocumentIds
   // of deleted or expired documents.
   int num_documents() const { return document_id_mapper_->num_elements(); }

   // Puts the document into document store.
   //
   // If put_document_stats is present, the fields related to DocumentStore will
   // be populated.
   //
   // Returns:
   //   A newly generated document id on success
   //   FAILED_PRECONDITION if schema hasn't been set yet
   //   NOT_FOUND if the schema_type or a property config of the document doesn't
   //     exist in schema
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<DocumentId> Put(
       const DocumentProto& document, int32_t num_tokens = 0,
       PutDocumentStatsProto* put_document_stats = nullptr);
   libtextclassifier3::StatusOr<DocumentId> Put(
       DocumentProto&& document, int32_t num_tokens = 0,
       PutDocumentStatsProto* put_document_stats = nullptr);

   // Finds and returns the document identified by the given key (namespace +
   // uri). If 'clear_internal_fields' is true, document level data that's
   // generated internally by DocumentStore is cleared.
   //
   // Returns:
   //   The document found on success
   //   NOT_FOUND if the key doesn't exist or document has been deleted
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<DocumentProto> Get(
       std::string_view name_space, std::string_view uri,
       bool clear_internal_fields = true) const;

   // Finds and returns the document identified by the given document id. If
   // 'clear_internal_fields' is true, document level data that's generated
   // internally by DocumentStore is cleared.
   //
   // Returns:
   //   The document found on success
   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
   //                    maximum value
   //   NOT_FOUND if the document doesn't exist or has been deleted
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<DocumentProto> Get(
       DocumentId document_id, bool clear_internal_fields = true) const;

   // Returns all namespaces which have at least 1 active document (not deleted
   // or expired). Order of namespaces is undefined.
   std::vector<std::string> GetAllNamespaces() const;

   // Check if a document exists. Existence means it hasn't been deleted and it
   // hasn't expired yet.
   //
   // Returns:
   //   boolean whether a document exists or not
   bool DoesDocumentExist(DocumentId document_id) const;

   // Deletes the document identified by the given namespace and uri. The
   // document proto will be erased immediately.
   //
   // NOTE:
   //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   NOT_FOUND if no document exists with namespace, uri
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status Delete(std::string_view name_space,
                                     std::string_view uri);

   // Deletes the document identified by the given document_id. The document
   // proto will be erased immediately.
   //
   // NOTE:
   //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   INTERNAL_ERROR on IO error
   //   INVALID_ARGUMENT if document_id is invalid.
   libtextclassifier3::Status Delete(DocumentId document_id);

   // Returns the NamespaceId of the string namespace
   //
   // Returns:
   //   NamespaceId on success
   //   NOT_FOUND if the namespace doesn't exist
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
       std::string_view name_space) const;

   // Returns the CorpusId associated with the given namespace and schema.
   //
   // Returns:
   //   A CorpusId on success
   //   NOT_FOUND if the key doesn't exist
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
       const std::string_view name_space, const std::string_view schema) const;

   // Returns the DocumentAssociatedScoreData of the document specified by the
   // DocumentId.
   //
   // NOTE: This does not check if the document exists and will return the
   // DocumentFilterData of the document even if it has been deleted. Users
   // should check DoesDocumentExist(document_id) if they only want existing
   // documents' DocumentFilterData.
   //
   // Returns:
   //   DocumentAssociatedScoreData on success
   //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
   //                DocumentIds
   //   NOT_FOUND if no score data is found
   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
   GetDocumentAssociatedScoreData(DocumentId document_id) const;

   // Returns the CorpusAssociatedScoreData of the corpus specified by the
   // corpus_id.
   //
   // NOTE: This does not check if the corpus exists and will return the
   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
   // that corpus have been deleted.
   //
   // Returns:
   //   CorpusAssociatedScoreData on success
   //   OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
   //                CorpusIds
   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
   GetCorpusAssociatedScoreData(CorpusId corpus_id) const;

   // Returns the DocumentFilterData of the document specified by the DocumentId.
   //
   // NOTE: This does not check if the document exists and will return the
   // DocumentFilterData of the document even if it has been deleted. Users
   // should check DoesDocumentExist(document_id) if they only want existing
   // documents' DocumentFilterData.
   //
   // Returns:
   //   DocumentFilterData on success
   //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
   //                DocumentIds
   //   NOT_FOUND if no filter data is found
   libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
       DocumentId document_id) const;

   // Gets the usage scores of a document.
   //
   // Returns:
   //   UsageScores on success
   //   INVALID_ARGUMENT if document_id is invalid
   //   INTERNAL_ERROR on I/O errors
   libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores(
       DocumentId document_id) const;

   // Reports usage. The corresponding usage scores of the specified document in
   // the report will be updated.
   //
   // Returns:
   //   OK on success
   //   NOT_FOUND if the [namesapce + uri] key in the report doesn't exist
   //   INTERNAL_ERROR on I/O errors.
   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);

   // Deletes all documents belonging to the given namespace. The documents will
   // be erased immediately.
   //
   // NOTE:
   //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   NOT_FOUND if namespace doesn't exist
   //   INTERNAL_ERROR on IO error
   DeleteByGroupResult DeleteByNamespace(std::string_view name_space);

   // Deletes all documents belonging to the given schema type. The documents
   // will be erased immediately.
   //
   // NOTE:
   //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   NOT_FOUND if schema_type doesn't exist
   //   INTERNAL_ERROR on IO error
   DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);

   // Syncs all the data and metadata changes to disk.
   //
   // Returns:
   //   OK on success
   //   INTERNAL on I/O error
   libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);

   // Calculates the StorageInfo for the Document Store.
   //
   // If an IO error occurs while trying to calculate the value for a field, then
   // that field will be set to -1.
   DocumentStorageInfoProto GetStorageInfo() const;

   // Update any derived data off of the SchemaStore with the new SchemaStore.
   // This may include pointers, SchemaTypeIds, etc.
   //
   // NOTE: This function may delete documents. A document may be invalidated by
   // the new SchemaStore, such as failing validation or having its schema type
   // deleted from the schema.
   //
   // This is best used if the caller is unsure about what's changed in the
   // SchemaStore, and wants to update all information no matter what. If the
   // caller does know what has changed, then it's recommended to call
   // OptimizedUpdateSchemaStore.
   //
   // Returns;
   //   OK on success
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);

   // Performs the same funtionality as UpdateSchemaStore, but this can be more
   // optimized in terms of less disk reads and less work if we know exactly
   // what's changed between the old and new SchemaStore.
   //
   // Returns;
   //   OK on success
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status OptimizedUpdateSchemaStore(
       const SchemaStore* schema_store,
       const SchemaStore::SetSchemaResult& set_schema_result);

   // Reduces internal file sizes by reclaiming space of deleted documents and
   // regenerating derived files.
   //
   // NOTE: The tasks in this method are too expensive to be executed in
   // real-time. The caller should decide how frequently and when to call this
   // method based on device usage.
   //
   // Returns:
   //   OK on success
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status Optimize();

   // Copy data from current base directory into a new directory. Any outdated or
   // deleted data won't be copied. During the process, document ids will be
   // reassigned so any files / classes that are based on old document ids may be
   // outdated.
   //
   // stats will be set if non-null.
   //
   // NOTE: The tasks in this method are too expensive to be executed in
   // real-time. The caller should decide how frequently and when to call this
   // method based on device usage.
   //
   // Returns:
   //   OK on success
   //   INVALID_ARGUMENT if new_directory is same as current base directory
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status OptimizeInto(
       const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
       OptimizeStatsProto* stats = nullptr);

   // Calculates status for a potential Optimize call. Includes how many docs
   // there are vs how many would be optimized away. And also includes an
   // estimated size gains, in bytes, if Optimize were called.
   //
   // Returns:
   //   OptimizeInfo on success
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;

   // Computes the combined checksum of the document store - includes the ground
   // truth and all derived files.
   //
   // Returns:
   //   Combined checksum on success
   //   INTERNAL_ERROR on compute error
   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;

  private:
   // Use DocumentStore::Create() to instantiate.
   DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
                 const Clock* clock, const SchemaStore* schema_store);

   const Filesystem* const filesystem_;
   const std::string base_dir_;
   const Clock& clock_;

   // Handles the ground truth schema and all of the derived data off of the
   // schema
   const SchemaStore* schema_store_;

   // Used to validate incoming documents
   DocumentValidator document_validator_;

   // A log used to store all documents, it serves as a ground truth of doc
   // store. key_mapper_ and document_id_mapper_ can be regenerated from it.
   std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log_;

   // Key (namespace + uri) to DocumentId mapping
   std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_;

   // DocumentId to file offset mapping
   std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;

   // A cache of document associated scores. The ground truth of the scores is
   // DocumentProto stored in document_log_. This cache contains:
   //   - CorpusId
   //   - Document score
   //   - Document creation timestamp in seconds
   //   - Document length in number of tokens
   std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;

   // A cache of data, indexed by DocumentId, used to filter documents. Currently
   // contains:
   //   - NamespaceId
   //   - SchemaTypeId
   //   - Expiration timestamp in seconds
   std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;

   // A cache of corpus associated scores. The ground truth of the scores is
   // DocumentProto stored in document_log_. This cache contains:
   //   - Number of documents belonging to the corpus score
   //   - The sum of the documents' lengths, in number of tokens.
   std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
       corpus_score_cache_;

   // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
   // id when the first document belonging to that namespace is added to the
   // DocumentStore. Namespaces may be removed from the mapper during compaction.
   std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;

   // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
   // unique id. A coprus is assigned an
   // id when the first document belonging to that corpus is added to the
   // DocumentStore. Corpus ids may be removed from the mapper during compaction.
   std::unique_ptr<KeyMapper<CorpusId>> corpus_mapper_;

   // A storage class that caches all usage scores. Usage scores are not
   // considered as ground truth. Usage scores are associated with document ids
   // so they need to be updated when document ids change.
   std::unique_ptr<UsageStore> usage_store_;

   // Used internally to indicate whether the class has been initialized. This is
   // to guard against cases where the object has been created, but Initialize
   // fails in the constructor. If we have successfully exited the constructor,
   // then this field can be ignored. Clients of DocumentStore should not need to
   // worry about this field.
   bool initialized_ = false;

   libtextclassifier3::StatusOr<DataLoss> Initialize(
       bool force_recovery_and_revalidate_documents,
       InitializeStatsProto* initialize_stats);

   // Creates sub-components and verifies the integrity of each sub-component.
   //
   // Returns an error if subcomponents failed to initialize successfully.
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status InitializeDerivedFiles();

   // Re-generates all files derived from the ground truth: the document log.
   //
   // revalidate_documents=true will also cause each document to be revalidated
   // the schema as it is read out of the document log.
   //
   // NOTE: if this function fails, the only thing we can do is to retry it until
   // it succeeds or prevent the initialization of a DocumentStore. The
   // DocumentStore object wouldn't work reliably if this fails.
   //
   // Steps:
   //   1. Delete all derived files.
   //   2. Iterate through document log, put data into new key mapper and
   //   document_id
   //      mapper.
   //   3. Create header and store the updated combined checksum
   libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);

   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
   // file, and re-creates a new instance of the document_key_mapper .
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetDocumentKeyMapper();

   // Resets the unique_ptr to the document_id_mapper, deletes the underlying
   // file, and re-creates a new instance of the document_id_mapper.
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetDocumentIdMapper();

   // Resets the unique_ptr to the score_cache, deletes the underlying file, and
   // re-creates a new instance of the score_cache.
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetDocumentAssociatedScoreCache();

   // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
   // file, and re-creates a new instance of the corpus_score_cache.
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetCorpusAssociatedScoreCache();

   // Resets the unique_ptr to the filter_cache, deletes the underlying file, and
   // re-creates a new instance of the filter_cache.
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetFilterCache();

   // Resets the unique_ptr to the namespace_mapper, deletes the underlying file,
   // and re-creates a new instance of the namespace_mapper.
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetNamespaceMapper();

   // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
   // and re-creates a new instance of the corpus_mapper.
   //
   // Returns OK or any IO errors.
   libtextclassifier3::Status ResetCorpusMapper();

   // Checks if the header exists already. This does not create the header file
   // if it doesn't exist.
   bool HeaderExists();

   // Update, replace and persist the header file. Creates the header file if it
   // doesn't exist.
   //
   // Returns:
   //   OK on success
   //   INTERNAL on I/O error
   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);

   libtextclassifier3::StatusOr<DocumentId> InternalPut(
       DocumentProto& document,
       PutDocumentStatsProto* put_document_stats = nullptr);

   // Helper function to do batch deletes. Documents with the given
   // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
   // to specify the namespace or schema type, pass in kInvalidNamespaceId or
   // kInvalidSchemaTypeId. The document protos with their derived data will be
   // erased / cleared immediately.
   //
   // NOTE: Space is not reclaimed in the derived files until Optimize() is
   // called.
   //
   // Returns:
   //   Number of documents that were actually updated to be deleted
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
                                                 SchemaTypeId schema_type_id);

   // Helper method to find a DocumentId that is associated with the given
   // namespace and uri.
   //
   // NOTE: The DocumentId may refer to a invalid document (deleted
   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
   // refers to a valid Document.
   //
   // Returns:
   //   A DocumentId on success
   //   NOT_FOUND if the key doesn't exist
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
       std::string_view name_space, std::string_view uri) const;

   // Returns the CorpusAssociatedScoreData of the corpus specified by the
   // corpus_id.
   //
   // If the corpus_id has never been seen before, it returns a
   // CorpusAssociatedScoreData with properties set to default values.
   //
   // NOTE: This does not check if the corpus exists and will return the
   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
   // that corpus have been deleted.
   //
   // Returns:
   //   CorpusAssociatedScoreData on success
   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;

   // Helper method to validate the document id and return the file offset of the
   // associated document in document_log_.
   //
   // This can be a more informative call than just DoesDocumentExist because it
   // can return more status errors on whether the Document actually doesn't
   // exist or if there was an internal error while accessing files.
   //
   // Returns:
   //   The file offset on success
   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
   //                    maximum value
   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset(
       DocumentId document_id) const;

   // Updates the entry in the score cache for document_id.
   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
       DocumentId document_id, const DocumentAssociatedScoreData& score_data);

   // Updates the entry in the corpus score cache for corpus_id.
   libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
       CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);

   // Updates the entry in the filter cache for document_id.
   libtextclassifier3::Status UpdateFilterCache(
       DocumentId document_id, const DocumentFilterData& filter_data);

   // Helper method to clear the derived data of a document
   libtextclassifier3::Status ClearDerivedData(DocumentId document_id);

   // Sets usage scores for the given document.
   libtextclassifier3::Status SetUsageScores(
       DocumentId document_id, const UsageStore::UsageScores& usage_scores);

   // Returns:
   //   - on success, a DocumentStorageInfoProto with the fields relating to the
   //     size of Document Store member variables populated.
   //   - INTERNAL on failure to get file size
   DocumentStorageInfoProto GetMemberStorageInfo() const;

   // Returns:
   //   - on success, the storage_info that was passed in but with the number of
   //     alive, deleted and expired documents also set.
   //   - OUT_OF_RANGE, this should never happen. This could only be returned if
   //     the document_id_mapper somehow became larger than the filter cache.
   DocumentStorageInfoProto CalculateDocumentStatusCounts(
       DocumentStorageInfoProto storage_info) const;
 };

 }  // namespace lib
 }  // namespace icing

 #endif  // ICING_STORE_DOCUMENT_STORE_H_