Read shortcuts as strings in the dictionary.

This has no impact on performance.

Before:
(0) 9.61 (0.01%)
(1) 57514.58 (56.70%)
(2) 10.55 (0.01%)
(3) 10.79 (0.01%)
(4) 133.20 (0.13%)
(5) 43553.87 (42.94%)
(6) 10.03 (0.01%)
(20) 47.20 (0.05%)
Total 101431.47 (sum of others 101289.84)

After:
(0) 10.52 (0.01%)
(1) 56311.16 (56.66%)
(2) 13.40 (0.01%)
(3) 10.98 (0.01%)
(4) 136.72 (0.14%)
(5) 42707.92 (42.97%)
(6) 9.79 (0.01%)
(20) 51.35 (0.05%)
Total 99390.76 (sum of others 99251.84)

The difference is not significant given the imprecision of the measurements.

Change-Id: I2e4f1ef7a5e99082e67dd27f56cf4fc432bb48fa

commit: 9a933a742d2a3ffdfb955705ad086035bc27db60 [log] [tgz]
author: Jean Chalard <jchalard@google.com> Tue Mar 27 19:56:23 2012 +0900
committer: Jean Chalard <jchalard@google.com> Fri Apr 06 16:22:08 2012 +0900
tree: 991c505bb2c4a3dff0f3704e36837d2f63628293
parent: 7540fd009d47d7210f1bbbbae75582698be6f313 [diff]
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index f7a3d3e..8d6c3d1 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp

@@ -123,6 +123,7 @@
     }
     pos = BinaryFormat::skipChildrenPosition(flags, pos);
     pos = BinaryFormat::skipFrequency(flags, pos);
+    pos = BinaryFormat::skipShortcuts(root, flags, pos);
     int bigramFlags;
     int bigramCount = 0;
     do {

diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index ab033ad..2ac6e05 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h

@@ -40,6 +40,9 @@
     // implementations. On this occasion, we made the magic number 32 bits long.
     const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
 
+    const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
+    const static int SHORTCUT_LIST_SIZE_SIZE = 2;
+
     static int detectFormat(const uint8_t* const dict);
     static unsigned int getHeaderSize(const uint8_t* const dict);
     static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
@@ -47,9 +50,10 @@
     static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
     static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
     static int skipOtherCharacters(const uint8_t* const dict, const int pos);
-    static int skipAttributes(const uint8_t* const dict, const int pos);
     static int skipChildrenPosition(const uint8_t flags, const int pos);
     static int skipFrequency(const uint8_t flags, const int pos);
+    static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
+    static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
     static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
     static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
             const int pos);
@@ -157,12 +161,12 @@
     */
 }
 
-inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
+static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
     int currentPos = pos;
-    uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
+    uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
     while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
         currentPos += attributeAddressSize(flags);
-        flags = getFlagsAndForwardPointer(dict, &currentPos);
+        flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
     }
     currentPos += attributeAddressSize(flags);
     return currentPos;
@@ -174,6 +178,10 @@
     /* See the note in attributeAddressSize. The same applies here */
 }
 
+static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
+    return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
+}
+
 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
     return pos + childrenAddressSize(flags);
 }
@@ -182,16 +190,30 @@
     return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
 }
 
+inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
+        const int pos) {
+    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+        return pos + shortcutByteSize(dict, pos);
+    } else {
+        return pos;
+    }
+}
+
+inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
+        const int pos) {
+    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+        return skipExistingBigrams(dict, pos);
+    } else {
+        return pos;
+    }
+}
+
 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
         const int pos) {
     // This function skips all attributes: shortcuts and bigrams.
     int newPos = pos;
-    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
-        newPos = skipAttributes(dict, newPos);
-    }
-    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
-        newPos = skipAttributes(dict, newPos);
-    }
+    newPos = skipShortcuts(dict, flags, newPos);
+    newPos = skipBigrams(dict, flags, newPos);
     return newPos;
 }
 

diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h
index 1f98159..9a803cc 100644
--- a/native/jni/src/terminal_attributes.h
+++ b/native/jni/src/terminal_attributes.h

@@ -45,13 +45,19 @@
 
         // Gets the shortcut target itself as a uint16_t string. For parameters and return value
         // see BinaryFormat::getWordAtAddress.
+        // TODO: make the output an uint32_t* to handle the whole unicode range.
         inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
             const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
             mHasNextShortcutTarget =
                     0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
-            int shortcutAddress =
-                    BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
-            return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
+            unsigned int i;
+            for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
+                const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
+                if (NOT_A_CHARACTER == charCode) break;
+                outWord[i] = (uint16_t)charCode;
+            }
+            mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
+            return i;
         }
     };
 
@@ -65,12 +71,10 @@
             mDict(dict), mFlags(flags), mStartPos(pos) {
     }
 
-    inline bool isShortcutOnly() const {
-        return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
-    }
-
     inline ShortcutIterator getShortcutIterator() const {
-        return ShortcutIterator(mDict, mStartPos, mFlags);
+        // The size of the shortcuts is stored here so that the whole shortcut chunk can be
+        // skipped quickly, so we ignore it.
+        return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
     }
 };
 } // namespace latinime

diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index ed4c066..50805ad 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp

@@ -366,10 +366,9 @@
         WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
         const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
         if (finalFreq != NOT_A_FREQUENCY) {
-            if (!terminalAttributes.isShortcutOnly()) {
-                addWord(wordPointer, wordLength, finalFreq, masterQueue);
-            }
+            addWord(wordPointer, wordLength, finalFreq, masterQueue);
 
+            const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
             // Please note that the shortcut candidates will be added to the master queue only.
             TerminalAttributes::ShortcutIterator iterator =
                     terminalAttributes.getShortcutIterator();
@@ -379,11 +378,12 @@
                 // We need to either modulate the frequency of each shortcut according
                 // to its own shortcut frequency or to make the queue
                 // so that the insert order is protected inside the queue for words
-                // with the same score.
+                // with the same score. For the moment we use -1 to make sure the shortcut will
+                // never be in front of the word.
                 uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
                 const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
                         MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
-                addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
+                addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
             }
         }
     }

diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index c8f1556..d501d50 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h

@@ -49,10 +49,6 @@
     static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
     // Flag for bigram presence
     static const int FLAG_HAS_BIGRAMS = 0x04;
-    // Flag for shortcut-only words. Some words are shortcut-only, which means they match when
-    // the user types them but they don't pop in the suggestion strip, only the words they are
-    // shortcuts for do.
-    static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
 
     // Attribute (bigram/shortcut) related flags:
     // Flag for presence of more attributes
commit	9a933a742d2a3ffdfb955705ad086035bc27db60	[log] [tgz]
author	Jean Chalard <jchalard@google.com>	Tue Mar 27 19:56:23 2012 +0900
committer	Jean Chalard <jchalard@google.com>	Fri Apr 06 16:22:08 2012 +0900
tree	991c505bb2c4a3dff0f3704e36837d2f63628293
parent	7540fd009d47d7210f1bbbbae75582698be6f313 [diff]