Keisuke Kuroyanagi | c4696b2 | 2014-08-01 20:19:16 +0900 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2014, The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" |
| 18 | |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 19 | #include <algorithm> |
| 20 | #include <cstring> |
| 21 | |
Keisuke Kuroyanagi | 9aa6699 | 2014-08-22 20:07:54 +0900 | [diff] [blame] | 22 | #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" |
| 23 | |
Keisuke Kuroyanagi | c4696b2 | 2014-08-01 20:19:16 +0900 | [diff] [blame] | 24 | namespace latinime { |
| 25 | |
Keisuke Kuroyanagi | 5400701 | 2014-10-15 12:29:31 +0900 | [diff] [blame] | 26 | const int LanguageModelDictContent::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; |
Keisuke Kuroyanagi | 758d093 | 2014-08-27 20:04:39 +0900 | [diff] [blame] | 27 | |
Keisuke Kuroyanagi | c4696b2 | 2014-08-01 20:19:16 +0900 | [diff] [blame] | 28 | bool LanguageModelDictContent::save(FILE *const file) const { |
| 29 | return mTrieMap.save(file); |
| 30 | } |
| 31 | |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 32 | bool LanguageModelDictContent::runGC( |
| 33 | const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 34 | const LanguageModelDictContent *const originalContent) { |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 35 | return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 36 | 0 /* nextLevelBitmapEntryIndex */); |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 37 | } |
| 38 | |
Keisuke Kuroyanagi | 7d911d6 | 2014-09-24 14:15:34 +0900 | [diff] [blame] | 39 | const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, |
Keisuke Kuroyanagi | 36ba139 | 2014-09-12 20:17:41 +0900 | [diff] [blame] | 40 | const int wordId, const HeaderPolicy *const headerPolicy) const { |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 41 | int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; |
| 42 | bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); |
Keisuke Kuroyanagi | 72d17d9 | 2014-10-15 18:23:00 +0900 | [diff] [blame] | 43 | int maxPrevWordCount = 0; |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 44 | for (size_t i = 0; i < prevWordIds.size(); ++i) { |
| 45 | const int nextBitmapEntryIndex = |
| 46 | mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex; |
| 47 | if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) { |
| 48 | break; |
| 49 | } |
Keisuke Kuroyanagi | 72d17d9 | 2014-10-15 18:23:00 +0900 | [diff] [blame] | 50 | maxPrevWordCount = i + 1; |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 51 | bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; |
| 52 | } |
| 53 | |
Keisuke Kuroyanagi | 72d17d9 | 2014-10-15 18:23:00 +0900 | [diff] [blame] | 54 | for (int i = maxPrevWordCount; i >= 0; --i) { |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 55 | const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); |
| 56 | if (!result.mIsValid) { |
| 57 | continue; |
| 58 | } |
Keisuke Kuroyanagi | 36ba139 | 2014-09-12 20:17:41 +0900 | [diff] [blame] | 59 | const ProbabilityEntry probabilityEntry = |
| 60 | ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); |
Keisuke Kuroyanagi | 7d911d6 | 2014-09-24 14:15:34 +0900 | [diff] [blame] | 61 | int probability = NOT_A_PROBABILITY; |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 62 | if (mHasHistoricalInfo) { |
Keisuke Kuroyanagi | 7d911d6 | 2014-09-24 14:15:34 +0900 | [diff] [blame] | 63 | const int rawProbability = ForgettingCurveUtils::decodeProbability( |
Keisuke Kuroyanagi | cb4f544 | 2014-09-25 11:41:50 +0900 | [diff] [blame] | 64 | probabilityEntry.getHistoricalInfo(), headerPolicy); |
| 65 | if (rawProbability == NOT_A_PROBABILITY) { |
| 66 | // The entry should not be treated as a valid entry. |
| 67 | continue; |
| 68 | } |
Keisuke Kuroyanagi | 72d17d9 | 2014-10-15 18:23:00 +0900 | [diff] [blame] | 69 | if (i == 0) { |
| 70 | // unigram |
| 71 | probability = rawProbability; |
| 72 | } else { |
| 73 | const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( |
| 74 | prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); |
| 75 | if (!prevWordProbabilityEntry.isValid()) { |
| 76 | continue; |
| 77 | } |
| 78 | if (prevWordProbabilityEntry.representsBeginningOfSentence()) { |
| 79 | probability = rawProbability; |
| 80 | } else { |
| 81 | const int prevWordRawProbability = ForgettingCurveUtils::decodeProbability( |
| 82 | prevWordProbabilityEntry.getHistoricalInfo(), headerPolicy); |
| 83 | probability = std::min(MAX_PROBABILITY - prevWordRawProbability |
| 84 | + rawProbability, MAX_PROBABILITY); |
| 85 | } |
| 86 | } |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 87 | } else { |
Keisuke Kuroyanagi | 7d911d6 | 2014-09-24 14:15:34 +0900 | [diff] [blame] | 88 | probability = probabilityEntry.getProbability(); |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 89 | } |
Keisuke Kuroyanagi | 7d911d6 | 2014-09-24 14:15:34 +0900 | [diff] [blame] | 90 | // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in |
| 91 | // probabilityEntry. |
| 92 | const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); |
| 93 | return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(), |
| 94 | unigramProbabilityEntry.isBlacklisted(), |
| 95 | unigramProbabilityEntry.isPossiblyOffensive()); |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 96 | } |
| 97 | // Cannot find the word. |
Keisuke Kuroyanagi | 7d911d6 | 2014-09-24 14:15:34 +0900 | [diff] [blame] | 98 | return WordAttributes(); |
Keisuke Kuroyanagi | 395fe8e | 2014-09-10 19:51:12 +0900 | [diff] [blame] | 99 | } |
| 100 | |
Keisuke Kuroyanagi | 851e045 | 2014-08-05 14:13:07 +0900 | [diff] [blame] | 101 | ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 102 | const WordIdArrayView prevWordIds, const int wordId) const { |
Keisuke Kuroyanagi | 03dc44f | 2014-08-05 14:51:11 +0900 | [diff] [blame] | 103 | const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); |
| 104 | if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 105 | return ProbabilityEntry(); |
| 106 | } |
Keisuke Kuroyanagi | 03dc44f | 2014-08-05 14:51:11 +0900 | [diff] [blame] | 107 | const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 108 | if (!result.mIsValid) { |
| 109 | // Not found. |
| 110 | return ProbabilityEntry(); |
| 111 | } |
| 112 | return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); |
| 113 | } |
| 114 | |
Keisuke Kuroyanagi | 851e045 | 2014-08-05 14:13:07 +0900 | [diff] [blame] | 115 | bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, |
Keisuke Kuroyanagi | d3097c6 | 2014-08-15 19:55:07 +0900 | [diff] [blame] | 116 | const int wordId, const ProbabilityEntry *const probabilityEntry) { |
| 117 | if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { |
| 118 | return false; |
| 119 | } |
Keisuke Kuroyanagi | 9a23f0f | 2014-08-12 20:32:42 +0900 | [diff] [blame] | 120 | const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); |
Keisuke Kuroyanagi | 03dc44f | 2014-08-05 14:51:11 +0900 | [diff] [blame] | 121 | if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 122 | return false; |
| 123 | } |
Keisuke Kuroyanagi | d3097c6 | 2014-08-15 19:55:07 +0900 | [diff] [blame] | 124 | return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 125 | } |
| 126 | |
Keisuke Kuroyanagi | b4531d8 | 2014-08-18 12:34:48 +0900 | [diff] [blame] | 127 | bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, |
| 128 | const int wordId) { |
| 129 | const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); |
| 130 | if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { |
| 131 | // Cannot find bitmap entry for the probability entry. The entry doesn't exist. |
| 132 | return false; |
| 133 | } |
| 134 | return mTrieMap.remove(wordId, bitmapEntryIndex); |
| 135 | } |
| 136 | |
Keisuke Kuroyanagi | 07b3b41 | 2014-08-26 12:01:08 +0900 | [diff] [blame] | 137 | LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries( |
| 138 | const WordIdArrayView prevWordIds) const { |
| 139 | const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); |
| 140 | return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo); |
| 141 | } |
| 142 | |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 143 | bool LanguageModelDictContent::truncateEntries(const EntryCounts ¤tEntryCounts, |
| 144 | const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy, |
| 145 | MutableEntryCounters *const outEntryCounters) { |
| 146 | for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) { |
| 147 | const int totalWordCount = prevWordCount + 1; |
| 148 | if (currentEntryCounts.getNgramCount(totalWordCount) |
| 149 | <= maxEntryCounts.getNgramCount(totalWordCount)) { |
| 150 | outEntryCounters->setNgramCount(totalWordCount, |
| 151 | currentEntryCounts.getNgramCount(totalWordCount)); |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 152 | continue; |
| 153 | } |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 154 | int entryCount = 0; |
| 155 | if (!turncateEntriesInSpecifiedLevel(headerPolicy, |
| 156 | maxEntryCounts.getNgramCount(totalWordCount), prevWordCount, &entryCount)) { |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 157 | return false; |
| 158 | } |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 159 | outEntryCounters->setNgramCount(totalWordCount, entryCount); |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 160 | } |
| 161 | return true; |
| 162 | } |
| 163 | |
Keisuke Kuroyanagi | 5400701 | 2014-10-15 12:29:31 +0900 | [diff] [blame] | 164 | bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, |
| 165 | const int wordId, const bool isValid, const HistoricalInfo historicalInfo, |
Keisuke Kuroyanagi | e8750d9 | 2014-10-21 15:46:14 +0900 | [diff] [blame] | 166 | const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { |
Keisuke Kuroyanagi | 5400701 | 2014-10-15 12:29:31 +0900 | [diff] [blame] | 167 | if (!mHasHistoricalInfo) { |
| 168 | AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); |
| 169 | return false; |
| 170 | } |
| 171 | const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId); |
| 172 | const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom( |
| 173 | originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy); |
| 174 | if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) { |
| 175 | return false; |
| 176 | } |
| 177 | for (size_t i = 0; i < prevWordIds.size(); ++i) { |
| 178 | if (prevWordIds[i] == NOT_A_WORD_ID) { |
| 179 | break; |
| 180 | } |
| 181 | // TODO: Optimize this code. |
| 182 | const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1); |
| 183 | const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry( |
| 184 | limitedPrevWordIds, wordId); |
| 185 | const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom( |
| 186 | originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy); |
| 187 | if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { |
| 188 | return false; |
| 189 | } |
Keisuke Kuroyanagi | e8750d9 | 2014-10-21 15:46:14 +0900 | [diff] [blame] | 190 | if (!originalNgramProbabilityEntry.isValid()) { |
| 191 | entryCountersToUpdate->incrementNgramCount(i + 2); |
Keisuke Kuroyanagi | 5400701 | 2014-10-15 12:29:31 +0900 | [diff] [blame] | 192 | } |
| 193 | } |
| 194 | return true; |
| 195 | } |
| 196 | |
| 197 | const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( |
| 198 | const ProbabilityEntry &originalProbabilityEntry, const bool isValid, |
| 199 | const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { |
| 200 | const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( |
| 201 | originalProbabilityEntry.getHistoricalInfo(), isValid ? |
| 202 | DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY, |
| 203 | &historicalInfo, headerPolicy); |
| 204 | if (originalProbabilityEntry.isValid()) { |
| 205 | return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); |
| 206 | } else { |
| 207 | return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo); |
| 208 | } |
| 209 | } |
| 210 | |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 211 | bool LanguageModelDictContent::runGCInner( |
| 212 | const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 213 | const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) { |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 214 | for (auto &entry : trieMapRange) { |
| 215 | const auto it = terminalIdMap->find(entry.key()); |
| 216 | if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { |
| 217 | // The word has been removed. |
| 218 | continue; |
| 219 | } |
| 220 | if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { |
| 221 | return false; |
| 222 | } |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 223 | if (entry.hasNextLevelMap()) { |
| 224 | if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 225 | mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) { |
Keisuke Kuroyanagi | 0889484 | 2014-08-05 12:38:55 +0900 | [diff] [blame] | 226 | return false; |
| 227 | } |
| 228 | } |
| 229 | } |
| 230 | return true; |
| 231 | } |
| 232 | |
Keisuke Kuroyanagi | 9a23f0f | 2014-08-12 20:32:42 +0900 | [diff] [blame] | 233 | int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) { |
| 234 | if (prevWordIds.empty()) { |
| 235 | return mTrieMap.getRootBitmapEntryIndex(); |
| 236 | } |
| 237 | const int lastBitmapEntryIndex = |
| 238 | getBitmapEntryIndex(prevWordIds.limit(prevWordIds.size() - 1)); |
| 239 | if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) { |
| 240 | return TrieMap::INVALID_INDEX; |
| 241 | } |
Keisuke Kuroyanagi | 09c1549 | 2014-09-17 21:16:31 +0900 | [diff] [blame] | 242 | const int oldestPrevWordId = prevWordIds.lastOrDefault(NOT_A_WORD_ID); |
Keisuke Kuroyanagi | 4926b90 | 2014-09-16 18:10:56 +0900 | [diff] [blame] | 243 | const TrieMap::Result result = mTrieMap.get(oldestPrevWordId, lastBitmapEntryIndex); |
| 244 | if (!result.mIsValid) { |
| 245 | if (!mTrieMap.put(oldestPrevWordId, |
| 246 | ProbabilityEntry().encode(mHasHistoricalInfo), lastBitmapEntryIndex)) { |
| 247 | return TrieMap::INVALID_INDEX; |
| 248 | } |
| 249 | } |
Keisuke Kuroyanagi | 09c1549 | 2014-09-17 21:16:31 +0900 | [diff] [blame] | 250 | return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds.lastOrDefault(NOT_A_WORD_ID), |
Keisuke Kuroyanagi | 9a23f0f | 2014-08-12 20:32:42 +0900 | [diff] [blame] | 251 | lastBitmapEntryIndex); |
| 252 | } |
| 253 | |
Keisuke Kuroyanagi | 03dc44f | 2014-08-05 14:51:11 +0900 | [diff] [blame] | 254 | int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { |
| 255 | int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); |
| 256 | for (const int wordId : prevWordIds) { |
| 257 | const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); |
| 258 | if (!result.mIsValid) { |
| 259 | return TrieMap::INVALID_INDEX; |
| 260 | } |
| 261 | bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; |
| 262 | } |
| 263 | return bitmapEntryIndex; |
| 264 | } |
| 265 | |
Keisuke Kuroyanagi | 5400701 | 2014-10-15 12:29:31 +0900 | [diff] [blame] | 266 | bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 267 | const int prevWordCount, const HeaderPolicy *const headerPolicy, |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 268 | MutableEntryCounters *const outEntryCounters) { |
Keisuke Kuroyanagi | 9aa6699 | 2014-08-22 20:07:54 +0900 | [diff] [blame] | 269 | for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 270 | if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { |
| 271 | AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", |
| 272 | prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); |
Keisuke Kuroyanagi | 9aa6699 | 2014-08-22 20:07:54 +0900 | [diff] [blame] | 273 | return false; |
| 274 | } |
| 275 | const ProbabilityEntry probabilityEntry = |
| 276 | ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 277 | if (prevWordCount > 0 && probabilityEntry.isValid() |
| 278 | && !mTrieMap.getRoot(entry.key()).mIsValid) { |
| 279 | // The entry is related to a word that has been removed. Remove the entry. |
| 280 | if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { |
| 281 | return false; |
| 282 | } |
| 283 | continue; |
| 284 | } |
| 285 | if (mHasHistoricalInfo && !probabilityEntry.representsBeginningOfSentence() |
| 286 | && probabilityEntry.isValid()) { |
Keisuke Kuroyanagi | 9aa6699 | 2014-08-22 20:07:54 +0900 | [diff] [blame] | 287 | const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( |
| 288 | probabilityEntry.getHistoricalInfo(), headerPolicy); |
| 289 | if (ForgettingCurveUtils::needsToKeep(&historicalInfo, headerPolicy)) { |
| 290 | // Update the entry. |
| 291 | const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), &historicalInfo); |
| 292 | if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), |
| 293 | bitmapEntryIndex)) { |
| 294 | return false; |
| 295 | } |
| 296 | } else { |
| 297 | // Remove the entry. |
| 298 | if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { |
| 299 | return false; |
| 300 | } |
| 301 | continue; |
| 302 | } |
| 303 | } |
| 304 | if (!probabilityEntry.representsBeginningOfSentence()) { |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 305 | outEntryCounters->incrementNgramCount(prevWordCount + 1); |
Keisuke Kuroyanagi | 9aa6699 | 2014-08-22 20:07:54 +0900 | [diff] [blame] | 306 | } |
| 307 | if (!entry.hasNextLevelMap()) { |
| 308 | continue; |
| 309 | } |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 310 | if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), |
Keisuke Kuroyanagi | 47fc656 | 2014-10-21 16:36:03 +0900 | [diff] [blame^] | 311 | prevWordCount + 1, headerPolicy, outEntryCounters)) { |
Keisuke Kuroyanagi | 9aa6699 | 2014-08-22 20:07:54 +0900 | [diff] [blame] | 312 | return false; |
| 313 | } |
| 314 | } |
| 315 | return true; |
| 316 | } |
| 317 | |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 318 | bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel( |
Keisuke Kuroyanagi | 758d093 | 2014-08-27 20:04:39 +0900 | [diff] [blame] | 319 | const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, |
| 320 | int *const outEntryCount) { |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 321 | std::vector<int> prevWordIds; |
| 322 | std::vector<EntryInfoToTurncate> entryInfoVector; |
| 323 | if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(), |
| 324 | &prevWordIds, &entryInfoVector)) { |
| 325 | return false; |
| 326 | } |
| 327 | if (static_cast<int>(entryInfoVector.size()) <= maxEntryCount) { |
Keisuke Kuroyanagi | 758d093 | 2014-08-27 20:04:39 +0900 | [diff] [blame] | 328 | *outEntryCount = static_cast<int>(entryInfoVector.size()); |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 329 | return true; |
| 330 | } |
Keisuke Kuroyanagi | 758d093 | 2014-08-27 20:04:39 +0900 | [diff] [blame] | 331 | *outEntryCount = maxEntryCount; |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 332 | const int entryCountToRemove = static_cast<int>(entryInfoVector.size()) - maxEntryCount; |
| 333 | std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove, |
| 334 | entryInfoVector.end(), |
| 335 | EntryInfoToTurncate::Comparator()); |
| 336 | for (int i = 0; i < entryCountToRemove; ++i) { |
| 337 | const EntryInfoToTurncate &entryInfo = entryInfoVector[i]; |
| 338 | if (!removeNgramProbabilityEntry( |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 339 | WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), entryInfo.mKey)) { |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 340 | return false; |
| 341 | } |
| 342 | } |
| 343 | return true; |
| 344 | } |
| 345 | |
| 346 | bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy, |
| 347 | const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds, |
| 348 | std::vector<EntryInfoToTurncate> *const outEntryInfo) const { |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 349 | const int prevWordCount = prevWordIds->size(); |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 350 | for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 351 | if (prevWordCount < targetLevel) { |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 352 | if (!entry.hasNextLevelMap()) { |
| 353 | continue; |
| 354 | } |
| 355 | prevWordIds->push_back(entry.key()); |
| 356 | if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(), |
| 357 | prevWordIds, outEntryInfo)) { |
| 358 | return false; |
| 359 | } |
| 360 | prevWordIds->pop_back(); |
| 361 | continue; |
| 362 | } |
| 363 | const ProbabilityEntry probabilityEntry = |
| 364 | ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); |
| 365 | const int probability = (mHasHistoricalInfo) ? |
| 366 | ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(), |
| 367 | headerPolicy) : probabilityEntry.getProbability(); |
| 368 | outEntryInfo->emplace_back(probability, |
Keisuke Kuroyanagi | 287e155 | 2014-10-01 11:39:33 +0900 | [diff] [blame] | 369 | probabilityEntry.getHistoricalInfo()->getTimestamp(), |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 370 | entry.key(), targetLevel, prevWordIds->data()); |
| 371 | } |
| 372 | return true; |
| 373 | } |
| 374 | |
| 375 | bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( |
| 376 | const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { |
| 377 | if (left.mProbability != right.mProbability) { |
| 378 | return left.mProbability < right.mProbability; |
| 379 | } |
| 380 | if (left.mTimestamp != right.mTimestamp) { |
| 381 | return left.mTimestamp > right.mTimestamp; |
| 382 | } |
| 383 | if (left.mKey != right.mKey) { |
| 384 | return left.mKey < right.mKey; |
| 385 | } |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 386 | if (left.mPrevWordCount != right.mPrevWordCount) { |
| 387 | return left.mPrevWordCount > right.mPrevWordCount; |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 388 | } |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 389 | for (int i = 0; i < left.mPrevWordCount; ++i) { |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 390 | if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) { |
| 391 | return left.mPrevWordIds[i] < right.mPrevWordIds[i]; |
| 392 | } |
| 393 | } |
| 394 | // left and rigth represent the same entry. |
| 395 | return false; |
| 396 | } |
| 397 | |
| 398 | LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int probability, |
Keisuke Kuroyanagi | 3601c21 | 2014-10-15 20:43:27 +0900 | [diff] [blame] | 399 | const int timestamp, const int key, const int prevWordCount, const int *const prevWordIds) |
| 400 | : mProbability(probability), mTimestamp(timestamp), mKey(key), |
| 401 | mPrevWordCount(prevWordCount) { |
| 402 | memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); |
Keisuke Kuroyanagi | 063f86d | 2014-08-21 12:48:24 +0900 | [diff] [blame] | 403 | } |
| 404 | |
Keisuke Kuroyanagi | c4696b2 | 2014-08-01 20:19:16 +0900 | [diff] [blame] | 405 | } // namespace latinime |