Read shortcuts as strings in the dictionary.
This has no impact on performance.
Before:
(0) 9.61 (0.01%)
(1) 57514.58 (56.70%)
(2) 10.55 (0.01%)
(3) 10.79 (0.01%)
(4) 133.20 (0.13%)
(5) 43553.87 (42.94%)
(6) 10.03 (0.01%)
(20) 47.20 (0.05%)
Total 101431.47 (sum of others 101289.84)
After:
(0) 10.52 (0.01%)
(1) 56311.16 (56.66%)
(2) 13.40 (0.01%)
(3) 10.98 (0.01%)
(4) 136.72 (0.14%)
(5) 42707.92 (42.97%)
(6) 9.79 (0.01%)
(20) 51.35 (0.05%)
Total 99390.76 (sum of others 99251.84)
The difference is not significant given the imprecision of the measurements.
Change-Id: I2e4f1ef7a5e99082e67dd27f56cf4fc432bb48fa
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index f7a3d3e..8d6c3d1 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -123,6 +123,7 @@
}
pos = BinaryFormat::skipChildrenPosition(flags, pos);
pos = BinaryFormat::skipFrequency(flags, pos);
+ pos = BinaryFormat::skipShortcuts(root, flags, pos);
int bigramFlags;
int bigramCount = 0;
do {
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index ab033ad..2ac6e05 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -40,6 +40,9 @@
// implementations. On this occasion, we made the magic number 32 bits long.
const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
+ const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
+ const static int SHORTCUT_LIST_SIZE_SIZE = 2;
+
static int detectFormat(const uint8_t* const dict);
static unsigned int getHeaderSize(const uint8_t* const dict);
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
@@ -47,9 +50,10 @@
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
static int skipOtherCharacters(const uint8_t* const dict, const int pos);
- static int skipAttributes(const uint8_t* const dict, const int pos);
static int skipChildrenPosition(const uint8_t flags, const int pos);
static int skipFrequency(const uint8_t flags, const int pos);
+ static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
+ static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos);
@@ -157,12 +161,12 @@
*/
}
-inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
+static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
int currentPos = pos;
- uint8_t flags = getFlagsAndForwardPointer(dict, ¤tPos);
+ uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos);
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags);
- flags = getFlagsAndForwardPointer(dict, ¤tPos);
+ flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos);
}
currentPos += attributeAddressSize(flags);
return currentPos;
@@ -174,6 +178,10 @@
/* See the note in attributeAddressSize. The same applies here */
}
+static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
+ return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
+}
+
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
return pos + childrenAddressSize(flags);
}
@@ -182,16 +190,30 @@
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
+inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
+ const int pos) {
+ if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+ return pos + shortcutByteSize(dict, pos);
+ } else {
+ return pos;
+ }
+}
+
+inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
+ const int pos) {
+ if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+ return skipExistingBigrams(dict, pos);
+ } else {
+ return pos;
+ }
+}
+
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos) {
// This function skips all attributes: shortcuts and bigrams.
int newPos = pos;
- if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
- newPos = skipAttributes(dict, newPos);
- }
- if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
- newPos = skipAttributes(dict, newPos);
- }
+ newPos = skipShortcuts(dict, flags, newPos);
+ newPos = skipBigrams(dict, flags, newPos);
return newPos;
}
diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h
index 1f98159..9a803cc 100644
--- a/native/jni/src/terminal_attributes.h
+++ b/native/jni/src/terminal_attributes.h
@@ -45,13 +45,19 @@
// Gets the shortcut target itself as a uint16_t string. For parameters and return value
// see BinaryFormat::getWordAtAddress.
+ // TODO: make the output a uint32_t* to handle the whole Unicode range.
inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
mHasNextShortcutTarget =
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
- int shortcutAddress =
- BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
- return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
+ unsigned int i;
+ for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
+ const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
+ if (NOT_A_CHARACTER == charCode) break;
+ outWord[i] = (uint16_t)charCode;
+ }
+ mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
+ return i;
}
};
@@ -65,12 +71,10 @@
mDict(dict), mFlags(flags), mStartPos(pos) {
}
- inline bool isShortcutOnly() const {
- return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
- }
-
inline ShortcutIterator getShortcutIterator() const {
- return ShortcutIterator(mDict, mStartPos, mFlags);
+ // The shortcut list begins with its own byte size, stored so that the whole shortcut chunk
+ // can be skipped quickly. The iterator does not need that size, so it starts right after it.
+ return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
}
};
} // namespace latinime
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index ed4c066..50805ad 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -366,10 +366,9 @@
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
if (finalFreq != NOT_A_FREQUENCY) {
- if (!terminalAttributes.isShortcutOnly()) {
- addWord(wordPointer, wordLength, finalFreq, masterQueue);
- }
+ addWord(wordPointer, wordLength, finalFreq, masterQueue);
+ const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
// Please note that the shortcut candidates will be added to the master queue only.
TerminalAttributes::ShortcutIterator iterator =
terminalAttributes.getShortcutIterator();
@@ -379,11 +378,12 @@
// We need to either modulate the frequency of each shortcut according
// to its own shortcut frequency or to make the queue
// so that the insert order is protected inside the queue for words
- // with the same score.
+ // with the same score. For the moment we give shortcuts the word's frequency minus 1
+ // (floored at 0) so that a shortcut does not end up in front of the word itself.
uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
- addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
+ addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
}
}
}
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index c8f1556..d501d50 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -49,10 +49,6 @@
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
- // Flag for shortcut-only words. Some words are shortcut-only, which means they match when
- // the user types them but they don't pop in the suggestion strip, only the words they are
- // shortcuts for do.
- static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes