blob: 2e93ef1c9a4bb9deccf41ec4efe9d079085d3fa1 [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: ulas@google.com (Ulas Kirazci)
//
// Trie for word prefix lookups. Features:
//
// - Dynamic additions (but not deletions)
// - Low memory usage
// - Reasonable latency but not QPS
// - Revive from persistence is a disk read
// - Stores a 4-byte value associated with every key
//
// Associated with each value in the trie is a set of property ids. For
// efficiency, property ids should start at 0 and be densely packed. A value
// may have more than one id set. There is an additional deleted property
// for each value, which is set only when all the property ids associated with a
// value have been cleared. In the flash_index, property ids are used to track
// corpus ids.
//
// Not thread-safe.
#ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
#define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
#include <stdint.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "icing/legacy/core/icing-compat.h"
#include "icing/legacy/core/icing-packed-pod.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/legacy/index/icing-storage.h"
#include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
#include "utf.h"
namespace icing {
namespace lib {
class IcingFlashBitmap;
class IcingDynamicTrie : public IIcingStorage {
class Dumper;
class IcingDynamicTrieStorage;
public:
// Adjacent bit fields are usually packed automatically. However, that is
// implementation specific:
// http://en.cppreference.com/w/cpp/language/bit_field
// So we'll set packed to be explicit.
class Node {
public:
// This object is only ever used by an ArrayStorage, which allocates
// sizeof(Node) bytes, zeroes them out and then casts to a Node.
Node() = delete;
uint32_t next_index() const { return next_index_; }
void set_next_index(uint32_t next_index) { next_index_ = next_index; }
bool is_leaf() const { return is_leaf_; }
void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; }
uint8_t log2_num_children() const { return log2_num_children_; }
void set_log2_num_children(uint8_t log2_num_children) {
log2_num_children_ = log2_num_children;
}
private:
uint32_t next_index_ : 27;
uint32_t is_leaf_ : 1;
uint32_t log2_num_children_ : 4;
} __attribute__((packed));
static_assert(sizeof(Node) == 4, "");
static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan");
// Adjacent bit fields are usually packed automatically. However, that is
// implementation specific:
// http://en.cppreference.com/w/cpp/language/bit_field.
// So we'll set packed to be explicit.
union Next {
Next(uint8_t val, uint32_t node_index) {
used.val = val;
used.node_index = node_index;
}
uint8_t val() const { return used.val; }
void set_val(uint8_t val) { used.val = val; }
uint32_t node_index() const { return used.node_index; }
void set_node_index(uint32_t node_index) { used.node_index = node_index; }
uint32_t next_index() const { return freelink.next_index; }
void set_next_index(uint32_t next_index) {
freelink.next_index = next_index;
}
bool operator<(const Next &next2) const {
if (val() == next2.val()) {
return node_index() < next2.node_index();
}
return val() < next2.val();
}
private:
// This object is only ever used by an ArrayStorage, which allocates
// sizeof(Node) bytes, zeroes them out and then casts to a Node.
Next() = default;
struct {
uint32_t val : 8;
uint32_t node_index : 24;
} used;
struct {
uint32_t next_index : 32;
} freelink;
} __attribute__((packed));
static_assert(sizeof(Next) == 4, "");
static_assert(sizeof(Next) % alignof(Next) == 0, "");
static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan");
static const int kMaxNextArraySize = 256;
static const int kNumNextAllocationBuckets = 9; // [log2(1), log2(256)]
static const uint32_t kMaxPropertyId = (1 << 16) - 1;
static const uint32_t kInvalidValueIndex = 0;
static const uint32_t kNoCrc = 0;
struct Stats {
uint32_t num_keys;
// Node stats
uint32_t num_nodes;
uint32_t max_nodes;
// Count of intermediate nodes.
uint32_t num_intermediates;
// Count of leaf nodes.
uint32_t num_leaves;
// Next stats
uint32_t num_nexts;
uint32_t max_nexts;
// Count of next arrays by size.
uint32_t child_counts[kMaxNextArraySize];
// Wasted next array space per allocation bucket (in Nexts, not
// bytes).
uint32_t wasted[kNumNextAllocationBuckets];
// Sum of wasted array.
uint32_t total_wasted;
// Suffix stats
uint32_t suffixes_size;
uint32_t max_suffixes_size;
// Bytes actually used by suffixes.
uint32_t suffixes_used;
// Number of suffixes that are just empty strings.
uint32_t null_suffixes;
// Next free-list stats
uint32_t num_free[kNumNextAllocationBuckets];
// Total Next nodes free (weighted sum of the above).
uint32_t total_free;
// Dirty pages.
uint32_t dirty_pages_nodes;
uint32_t dirty_pages_nexts;
uint32_t dirty_pages_suffixes;
std::string DumpStats(int verbosity) const;
};
// Options when creating the trie. Maximums for the node/next/suffix
// arrays must be specified in advance.
struct Options {
// Absolute maximums.
static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize;
// The default takes 13MB of memory and can take about 1M English
// words.
Options()
: max_nodes(1U << 20),
max_nexts(1U << 20),
max_suffixes_size(5U << 20),
value_size(sizeof(uint32_t)) {}
Options(uint32_t max_nodes_in, uint32_t max_nexts_in,
uint32_t max_suffixes_size_in, uint32_t value_size_in)
: max_nodes(max_nodes_in),
max_nexts(max_nexts_in),
max_suffixes_size(max_suffixes_size_in),
value_size(value_size_in) {}
uint32_t max_nodes;
uint32_t max_nexts;
uint32_t max_suffixes_size;
uint32_t value_size;
// True if options do not exceed absolute maximums.
bool is_valid() const;
};
// These can be supplied during runtime, as opposed to the persisted
// Options above.
struct RuntimeOptions {
enum StoragePolicy {
// Changes are reflected in the underlying file immediately but
// more vulnerable to corruption.
kMapSharedWithCrc,
// Changes only applied during Flush. Smaller window of
// vulnerability to corruption.
kExplicitFlush,
};
RuntimeOptions &set_storage_policy(StoragePolicy sp) {
storage_policy = sp;
return *this;
}
StoragePolicy storage_policy = kExplicitFlush;
};
static uint32_t max_value_index(const Options &options) {
return options.max_suffixes_size;
}
// Light-weight constructor. Real work happens in Create or Init.
IcingDynamicTrie(const std::string &filename_base,
const RuntimeOptions &runtime_options,
const IcingFilesystem *filesystem);
~IcingDynamicTrie() override;
bool is_initialized() const { return is_initialized_; }
// Create, but do not Init, a new trie with options if the file does
// not already exist.
//
// Returns true if successfully created all files or files already
// exist. Does not do a complete sanity check for when files seem to
// exist. Cleans up files if creation fails midstream.
bool CreateIfNotExist(const Options &options);
bool UpgradeTo(int new_version) override { return true; }
bool Init() override;
void Close() override;
bool Remove() override;
uint64_t GetDiskUsage() const override;
// REQUIRED: For all functions below is_initialized() == true.
// Number of keys in trie.
uint32_t size() const;
// Collecting stats.
void CollectStats(Stats *stats) const;
// Gets all of the contents of the trie for debugging purposes. Note: this
// stores the entire set of terms in memory.
// pretty_print - The tree structure of the trie will be written to this.
// keys - All keys in the trie are appended to this vector.
void DumpTrie(std::ostream *pretty_print,
std::vector<std::string> *keys) const;
// Empty out the trie without closing or removing.
void Clear();
// Sync to disk.
bool Sync() override;
// Tell kernel we will access the memory shortly.
void Warm() const;
// Potentially about to get nuked.
void OnSleep() override;
// Compact trie into out for value indices present in old_tvi_to_new_value.
class NewValueMap {
public:
virtual ~NewValueMap();
// Returns the new value we want to assign to the entry at old
// value index. We don't take ownership of the pointer.
virtual const void *GetNewValue(uint32_t old_value_index) const = 0;
};
// Compacts this trie. This drops all deleted keys, drops all keys for which
// old_tvi_to_new_value returns nullptr, updates values to be the values
// returned by old_tvi_to_new_value, rewrites tvis, and saves the results into
// the trie given in 'out'. 'old_to_new_tvi' is be populated with a mapping of
// old value_index to new value_index.
bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const;
// Insert value at key. If key already exists and replace == true,
// replaces old value with value. We take a copy of value.
//
// If value_index is not NULL, returns a pointer to value in
// value_index. This can then be used with SetValueAtIndex
// below. value_index is not valid past a Clear/Read/Write.
//
// Returns false if there is no space left in the trie.
//
// REQUIRES: value a buffer of size value_size()
bool Insert(const char *key, const void *value) {
return Insert(key, value, nullptr, true, nullptr);
}
bool Insert(const char *key, const void *value, uint32_t *value_index,
bool replace) {
return Insert(key, value, value_index, replace, nullptr);
}
bool Insert(const char *key, const void *value, uint32_t *value_index,
bool replace, bool *pnew_key);
// Get a value returned by Insert value_index. This points to the
// value in the trie. The pointer is immutable and always valid
// while the trie is alive.
const void *GetValueAtIndex(uint32_t value_index) const;
// Set a value returned by Insert value_index. We take a copy of
// value.
//
// REQUIRES: value a buffer of size value_size()
void SetValueAtIndex(uint32_t value_index, const void *value);
// Returns true if key is found and sets value. If value_index is
// not NULL, returns value_index (see Insert discussion above).
// If the key is not found, returns false and neither value nor
// value_index is modified.
//
// REQUIRES: value a buffer of size value_size()
bool Find(const char *key, void *value) const {
return Find(key, value, nullptr);
}
bool Find(const char *key, void *value, uint32_t *value_index) const;
// Find the input key and all keys that are a variant of the input
// key according to a variant map. Currently supports
// transliteration. For example "a" is a variant for "à" or "á" so
// an "a" in the input key can match those characters in the trie in
// addition to itself.
//
// If prefix is set, also returns any prefix matches (so value_index
// will be invalid).
//
// REQUIRES: all terms in the lexicon to be valid utf8.
struct OriginalMatch {
uint32_t value_index;
std::string orig;
OriginalMatch() : value_index(kInvalidValueIndex) {}
bool is_full_match() const { return value_index != kInvalidValueIndex; }
};
void GetDebugInfo(int verbosity, std::string *out) const override;
double min_free_fraction() const;
uint32_t value_size() const;
uint32_t max_value_index() const;
// If in kMapSharedWithCrc mode, update crcs and return the master
// crc, else return kNoCrc. This crc includes both the trie files
// and property bitmaps.
uint32_t UpdateCrc();
// Store dynamic properties for each value. When a property is added to
// a value, the deleted flag is cleared for it (if it was previously set).
bool SetProperty(uint32_t value_index, uint32_t property_id);
bool ClearProperty(uint32_t value_index, uint32_t property_id);
// Store deleted property for each value.
// This method is not the only way the deleted property can be set; the trie
// may set this property itself during other operations if it can determine a
// value becomes superfluous.
bool SetDeleted(uint32_t value_index);
// Clears the deleted property for each value.
bool ClearDeleted(uint32_t value_index);
// Clear a specific property id from all values. For each value that has this
// property cleared, also check to see if it was the only property set; if
// so, set the deleted property for the value to indicate it no longer has any
// properties associated with it.
bool ClearPropertyForAllValues(uint32_t property_id);
// Access properties. Usage:
//
// IcingDynamicTrie::PropertyReader reader(trie, 10);
// char value[SIZE];
// uint32_t value_index;
// if (trie.Find("abc", value, &value_index) &&
// reader.HasProperty(value_index)) {
// ...
// }
//
// Readers are valid as long as the underlying trie is open.
class PropertyReaderBase {
public:
// Whether underlying file exists.
bool Exists() const;
// Returns false for all values if underlying file is missing.
bool HasProperty(uint32_t value_index) const;
protected:
PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted,
uint32_t property_id);
// Does not own.
const IcingFlashBitmap *bitmap_;
const IcingDynamicTrie &trie_;
};
// Reader for a given property. It is invalidated when the underlying property
// is deleted, or the trie is closed.
class PropertyReader : public PropertyReaderBase {
public:
PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id)
: PropertyReaderBase(trie, false, property_id) {}
};
// Reader for the deleted property. It is invalidated when the trie is closed.
class PropertyDeletedReader : public PropertyReaderBase {
public:
explicit PropertyDeletedReader(const IcingDynamicTrie &trie)
: PropertyReaderBase(trie, true, 0) {}
};
// Reader for all properties (but not the deleted one). It is invalidated when
// the trie is closed.
class PropertyReadersAll {
public:
explicit PropertyReadersAll(const IcingDynamicTrie &trie);
// Whether underlying file for property_id exists.
bool Exists(uint32_t property_id) const;
// Returns false if underlying file or property doesn't exist.
bool HasProperty(uint32_t property_id, uint32_t value_index) const;
// Returns true if the value at value_index is set for the only the supplied
// property_id, and none of the other properties.
bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const;
// For iterating.
size_t size() const;
private:
const IcingDynamicTrie &trie_;
};
// Iterate through trie in lexicographic order.
//
// Not thread-safe.
//
// Change in underlying trie invalidates iterator.
class Iterator {
public:
Iterator(const IcingDynamicTrie &trie, const char *prefix);
void Reset();
bool Advance();
// If !IsValid(), GetKey() will return NULL and GetValue() will
// return 0.
bool IsValid() const;
const char *GetKey() const;
// This points directly to the underlying data and is valid while
// the trie is alive. We keep ownership of the pointer.
const void *GetValue() const;
uint32_t GetValueIndex() const;
private:
Iterator();
// Copy is ok.
// Helper function that takes the left-most branch down
// intermediate nodes to a leaf.
void LeftBranchToLeaf(uint32_t node_index);
std::string cur_key_;
const char *cur_suffix_;
int cur_suffix_len_;
struct Branch {
uint32_t node_idx;
int child_idx;
explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {}
};
std::vector<Branch> branch_stack_;
bool single_leaf_match_;
const IcingDynamicTrie &trie_;
};
// Represents a non-leaf node or a "virtual" trie node in the suffix
// region.
struct LogicalNode {
const Node *node;
int suffix_offset;
LogicalNode() : node(nullptr), suffix_offset(0) {}
LogicalNode(const Node *node_in, int suffix_offset_in)
: node(node_in), suffix_offset(suffix_offset_in) {}
};
// Iterate over all utf8 chars in the trie anchored at prefix (or
// node). If trie has invalid utf8 chars, behavior is undefined (but
// won't crash).
class Utf8Iterator {
public:
void Reset();
bool Advance();
bool IsValid() const;
private:
struct Branch {
const Node *node;
const Next *child;
const Next *child_end;
bool IsFinished();
};
Utf8Iterator();
// Copy is ok.
void LeftBranchToUtf8End();
void InitBranch(Branch *branch, const Node *start, char key_char);
void GoIntoSuffix(const Node *node);
char cur_[UTFmax + 1]; // NULL-terminated
int cur_len_;
LogicalNode cur_logical_node_;
Branch branch_stack_[UTFmax];
Branch *branch_end_;
const IcingDynamicTrie &trie_;
const Node *start_node_;
};
private:
class CandidateSet;
// For testing only.
friend class IcingDynamicTrieTest_SyncErrorRecovery_Test;
friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test;
void GetHeader(IcingDynamicTrieHeader *hdr) const;
void SetHeader(const IcingDynamicTrieHeader &new_hdr);
static const uint32_t kInvalidNodeIndex;
static const uint32_t kInvalidNextIndex;
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
void CollectStatsRecursive(const Node &node, Stats *stats) const;
// Helpers for Find and Insert.
const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
const Next *LowerBound(const Next *start, const Next *end,
uint8_t key_char) const;
void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
bool prefix) const;
// For value properties. This truncates the data by clearing it, but leaving
// the storage intact.
bool InitPropertyBitmaps();
// Returns a pointer to a bitmap that is successfully opened.
static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap(
const std::string &filename, bool verify,
const IcingFilesystem *filesystem);
// Returns a pointer to a writable bitmap, creating it if necessary. Returned
// pointer should not be freed, it will be maintained by property_bitmaps_.
// Returns null if bitmap failed to load.
IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id);
uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const;
const std::string filename_base_;
bool is_initialized_;
const RuntimeOptions runtime_options_;
std::unique_ptr<IcingDynamicTrieStorage> storage_;
const std::string property_bitmaps_prefix_;
std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_;
const std::string deleted_bitmap_filename_;
std::unique_ptr<IcingFlashBitmap> deleted_bitmap_;
const IcingFilesystem *const filesystem_;
};
} // namespace lib
} // namespace icing
#endif // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_