Replace the bigram list position with the map and filter

Passing the position will not allow us a reasonable lookup
time. Replace this with a map and bloom filter for very fast
lookup.

Bug: 6313806
Change-Id: I3a61c0001cbc987c1c3c7b8df635d4590a370144
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index d120261..220b340 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -158,6 +158,11 @@
     filter[bucket >> 3] |= (1 << (bucket & 0x7));
 }
 
+static inline bool isInFilter(uint8_t *filter, const int position) {
+    const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
+    return filter[bucket >> 3] & (1 << (bucket & 0x7));
+}
+
 void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
         const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
     memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index d5d67c1..71ade48 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -66,7 +66,8 @@
             const int length);
     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
             uint16_t* outWord);
-    static int getProbability(const int bigramListPosition, const int unigramFreq);
+    static int getProbability(const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
+            const int unigramFreq);
 
     // Flags for special processing
     // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -519,9 +520,11 @@
 }
 
 // This should probably return a probability in log space.
-inline int BinaryFormat::getProbability(const int bigramListPosition, const int unigramFreq) {
-    // TODO: use the bigram list position to get the bigram probability. If the bigram
-    // is not found, use the unigram frequency.
+inline int BinaryFormat::getProbability(const std::map<int, int> *bigramMap,
+        const uint8_t *bigramFilter, const int unigramFreq) {
+    // TODO: use the bigram filter for fast rejection, then the bigram map for lookup
+    // to get the bigram probability. If the bigram is not found, use the unigram frequency.
+    // Don't forget that they can be null.
     // TODO: if the unigram frequency is used, compute the actual probability
     return unigramFreq;
 }
diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h
index 8bdd771..bce86d1 100644
--- a/native/jni/src/dictionary.h
+++ b/native/jni/src/dictionary.h
@@ -37,17 +37,13 @@
     int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates,
             int *codes, int codesSize, const int32_t* prevWordChars, const int prevWordLength,
             bool useFullEditDistance, unsigned short *outWords, int *frequencies) {
-        // bigramListPosition is, as an int, the offset of the bigram list in the file.
-        // If none, it's zero.
-        const int bigramListPosition = !prevWordChars ? 0
-                : mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength);
         std::map<int, int> bigramMap;
         uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE];
         mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars,
                 prevWordLength, &bigramMap, bigramFilter);
         return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool,
-                mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition,
-                useFullEditDistance, outWords, frequencies);
+                mCorrection, xcoordinates, ycoordinates, codes, codesSize, &bigramMap,
+                bigramFilter, useFullEditDistance, outWords, frequencies);
     }
 
     int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index 0c759d4..2e5468d 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -98,7 +98,7 @@
 void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
         const int *xcoordinates, const int *ycoordinates, const int *codesBuffer,
         int *xCoordinatesBuffer, int *yCoordinatesBuffer,
-        const int codesBufferSize, const int bigramListPosition,
+        const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
         const bool useFullEditDistance, const int *codesSrc,
         const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
         WordsPriorityQueuePool *queuePool,
@@ -128,7 +128,7 @@
                         replacementCodePoint;
                 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
                         codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
-                        bigramListPosition, useFullEditDistance, codesSrc + i + 1,
+                        bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1,
                         codesRemain - i - 1, currentDepth + 1, codesDest + i, correction,
                         queuePool, digraphs, digraphsSize);
 
@@ -138,7 +138,7 @@
                 memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR);
                 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
                         codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
-                        bigramListPosition, useFullEditDistance, codesSrc + i, codesRemain - i,
+                        bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i,
                         currentDepth + 1, codesDest + i, correction, queuePool, digraphs,
                         digraphsSize);
                 return;
@@ -161,16 +161,18 @@
     }
 
     getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer,
-            startIndex + codesRemain, bigramListPosition, useFullEditDistance, correction,
+            startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction,
             queuePool);
 }
 
-// bigramListPosition is the offset in the file to the list of bigrams for the previous word.
+// bigramMap contains the association <bigram address> -> <bigram frequency>
+// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
+// in bigram_dictionary.cpp
 int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
         WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates,
         const int *ycoordinates, const int *codes, const int codesSize,
-        const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords,
-        int *frequencies) {
+        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
+        const bool useFullEditDistance, unsigned short *outWords, int *frequencies) {
 
     queuePool->clearAll();
     Correction* masterCorrection = correction;
@@ -180,7 +182,7 @@
         int xCoordinatesBuffer[codesSize];
         int yCoordinatesBuffer[codesSize];
         getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
-                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition,
+                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter,
                 useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection,
                 queuePool, GERMAN_UMLAUT_DIGRAPHS,
                 sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]));
@@ -189,13 +191,13 @@
         int xCoordinatesBuffer[codesSize];
         int yCoordinatesBuffer[codesSize];
         getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
-                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition,
+                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter,
                 useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection,
                 queuePool, FRENCH_LIGATURES_DIGRAPHS,
                 sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0]));
     } else { // Normal processing
         getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
-                bigramListPosition, useFullEditDistance, masterCorrection, queuePool);
+                bigramMap, bigramFilter, useFullEditDistance, masterCorrection, queuePool);
     }
 
     PROF_START(20);
@@ -228,15 +230,15 @@
 
 void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
         const int *xcoordinates, const int *ycoordinates, const int *codes,
-        const int inputLength, const int bigramListPosition, const bool useFullEditDistance,
-        Correction *correction, WordsPriorityQueuePool *queuePool) {
+        const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
+        const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) {
 
     PROF_OPEN;
     PROF_START(0);
     PROF_END(0);
 
     PROF_START(1);
-    getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramListPosition,
+    getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter,
             useFullEditDistance, inputLength, correction, queuePool);
     PROF_END(1);
 
@@ -308,15 +310,16 @@
 
 void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
         const int *xcoordinates, const int *ycoordinates, const int *codes,
-        const int bigramListPosition, const bool useFullEditDistance, const int inputLength,
+        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
+        const bool useFullEditDistance, const int inputLength,
         Correction *correction, WordsPriorityQueuePool *queuePool) {
     initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
-    getSuggestionCandidates(useFullEditDistance, inputLength, bigramListPosition, correction,
+    getSuggestionCandidates(useFullEditDistance, inputLength, bigramMap, bigramFilter, correction,
             queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX);
 }
 
 void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
-        const int inputLength, const int bigramListPosition,
+        const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
         Correction *correction, WordsPriorityQueuePool *queuePool,
         const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) {
     // TODO: Remove setCorrectionParams
@@ -337,7 +340,7 @@
             int firstChildPos;
 
             const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
-                    bigramListPosition, correction, &childCount, &firstChildPos, &siblingPos,
+                    bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos,
                     queuePool, currentWordIndex);
             // Update next sibling pos
             correction->setTreeSiblingPos(outputIndex, siblingPos);
@@ -432,8 +435,8 @@
             queuePool->clearSubQueue(currentWordIndex);
             // TODO: pass the bigram list for substring suggestion
             getSuggestionCandidates(useFullEditDistance, inputWordLength,
-                    0 /* bigramListPosition */, correction, queuePool, false /* doAutoCompletion */,
-                    MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
+                    0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool,
+                    false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
             if (DEBUG_DICT) {
                 if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
                     AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
@@ -763,9 +766,9 @@
 // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
 // given level, as output into newCount when traversing this level's parent.
 inline bool UnigramDictionary::processCurrentNode(const int initialPos,
-        const int bigramListPosition, Correction *correction, int *newCount,
-        int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
-        const int currentWordIndex) {
+        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction,
+        int *newCount, int *newChildrenPosition, int *nextSiblingPosition,
+        WordsPriorityQueuePool *queuePool, const int currentWordIndex) {
     if (DEBUG_DICT) {
         correction->checkState();
     }
@@ -846,9 +849,9 @@
         const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos);
         const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
         TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
-        // The bigramListPosition is the offset in the file of the bigrams for the previous word,
-        // or zero if we don't know of any bigrams for it.
-        const int probability = BinaryFormat::getProbability(bigramListPosition, unigramFreq);
+        // bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
+        // bigramFilter is a bloom filter of said frequencies for even faster rejection.
+        const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq);
         onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
                 currentWordIndex);
 
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index 0cc59ba..b923351 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -17,6 +17,7 @@
 #ifndef LATINIME_UNIGRAM_DICTIONARY_H
 #define LATINIME_UNIGRAM_DICTIONARY_H
 
+#include <map>
 #include <stdint.h>
 #include "correction.h"
 #include "correction_state.h"
@@ -75,32 +76,36 @@
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
     int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
             Correction *correction, const int *xcoordinates, const int *ycoordinates,
-            const int *codes, const int codesSize, const int bigramListPosition,
-            const bool useFullEditDistance, unsigned short *outWords, int *frequencies);
+            const int *codes, const int codesSize, const std::map<int, int> *bigramMap,
+            const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords,
+            int *frequencies);
     virtual ~UnigramDictionary();
 
  private:
     void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
             const int *ycoordinates, const int *codes, const int inputLength,
-            const int bigramListPosition, const bool useFullEditDistance, Correction *correction,
+            const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
+            const bool useFullEditDistance, Correction *correction,
             WordsPriorityQueuePool *queuePool);
     int getDigraphReplacement(const int *codes, const int i, const int codesSize,
             const digraph_t* const digraphs, const unsigned int digraphsSize) const;
     void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
         const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
         int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize,
-        const int bigramListPosition, const bool useFullEditDistance, const int* codesSrc,
-        const int codesRemain, const int currentDepth, int* codesDest, Correction *correction,
+        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
+        const bool useFullEditDistance, const int* codesSrc, const int codesRemain,
+        const int currentDepth, int* codesDest, Correction *correction,
         WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs,
         const unsigned int digraphsSize);
     void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
             const int *ycoordinates, const int *codes, const int codesSize, Correction *correction);
     void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
-            const int *ycoordinates, const int *codes, const int bigramListPosition,
-            const bool useFullEditDistance, const int inputLength, Correction *correction,
-            WordsPriorityQueuePool* queuePool);
+            const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap,
+            const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength,
+            Correction *correction, WordsPriorityQueuePool* queuePool);
     void getSuggestionCandidates(
-            const bool useFullEditDistance, const int inputLength, const int bigramListPosition,
+            const bool useFullEditDistance, const int inputLength,
+            const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
             Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion,
             const int maxErrors, const int currentWordIndex);
     void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
@@ -114,9 +119,10 @@
     bool needsToSkipCurrentNode(const unsigned short c,
             const int inputIndex, const int skipPos, const int depth);
     // Process a node by considering proximity, missing and excessive character
-    bool processCurrentNode(const int initialPos, const int bigramListPosition,
-            Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition,
-            WordsPriorityQueuePool *queuePool, const int currentWordIndex);
+    bool processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap,
+            const uint8_t *bigramFilter, Correction *correction, int *newCount,
+            int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
+            const int currentWordIndex);
     int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
             ProximityInfo *proximityInfo, unsigned short *word);
     int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,