Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2013, The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | /* |
| 18 | * !!!!! DO NOT EDIT THIS FILE !!!!! |
| 19 | * |
| 20 | * This file was generated from |
| 21 | * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp |
| 22 | */ |
| 23 | |
Keisuke Kuroyanagi | 07e1412 | 2014-06-10 19:22:40 +0900 | [diff] [blame] | 24 | #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 25 | |
| 26 | #include <cstring> |
| 27 | #include <queue> |
| 28 | |
| 29 | #include "suggest/policyimpl/dictionary/header/header_policy.h" |
Keisuke Kuroyanagi | 07e1412 | 2014-06-10 19:22:40 +0900 | [diff] [blame] | 30 | #include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" |
| 31 | #include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" |
| 32 | #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" |
| 33 | #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" |
| 34 | #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" |
| 35 | #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" |
| 36 | #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 37 | #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" |
| 38 | #include "suggest/policyimpl/dictionary/utils/file_utils.h" |
| 39 | #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" |
| 40 | |
| 41 | namespace latinime { |
| 42 | namespace backward { |
Keisuke Kuroyanagi | 07e1412 | 2014-06-10 19:22:40 +0900 | [diff] [blame] | 43 | namespace v402 { |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 44 | |
| 45 | bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, |
Keisuke Kuroyanagi | e8750d9 | 2014-10-21 15:46:14 +0900 | [diff] [blame] | 46 | const EntryCounts &entryCounts) const { |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 47 | const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); |
| 48 | BufferWithExtendableBuffer headerBuffer( |
| 49 | BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); |
| 50 | const int extendedRegionSize = headerPolicy->getExtendedRegionSize() |
| 51 | + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); |
| 52 | if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, |
Keisuke Kuroyanagi | e8750d9 | 2014-10-21 15:46:14 +0900 | [diff] [blame] | 53 | entryCounts, extendedRegionSize, &headerBuffer)) { |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 54 | AKLOGE("Cannot write header structure to buffer. " |
| 55 | "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " |
Keisuke Kuroyanagi | 0b8bb0c | 2014-10-21 20:20:11 +0900 | [diff] [blame^] | 56 | "extendedRegionSize: %d", false, entryCounts.getUnigramCount(), |
| 57 | entryCounts.getBigramCount(), extendedRegionSize); |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 58 | return false; |
| 59 | } |
| 60 | return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); |
| 61 | } |
| 62 | |
| 63 | bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, |
| 64 | const char *const dictDirPath) { |
| 65 | const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); |
| 66 | Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( |
| 67 | Ver4DictBuffers::createVer4DictBuffers(headerPolicy, |
| 68 | Ver4DictConstants::MAX_DICTIONARY_SIZE)); |
| 69 | int unigramCount = 0; |
| 70 | int bigramCount = 0; |
| 71 | if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { |
| 72 | return false; |
| 73 | } |
| 74 | BufferWithExtendableBuffer headerBuffer( |
| 75 | BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); |
| 76 | if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, |
Keisuke Kuroyanagi | e8750d9 | 2014-10-21 15:46:14 +0900 | [diff] [blame] | 77 | EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */), |
| 78 | 0 /* extendedRegionSize */, &headerBuffer)) { |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 79 | return false; |
| 80 | } |
| 81 | return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); |
| 82 | } |
| 83 | |
| 84 | bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, |
| 85 | const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, |
| 86 | int *const outUnigramCount, int *const outBigramCount) { |
| 87 | Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), |
| 88 | mBuffers->getProbabilityDictContent(), headerPolicy); |
| 89 | Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); |
| 90 | Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), |
| 91 | mBuffers->getTerminalPositionLookupTable(), headerPolicy); |
| 92 | Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), |
| 93 | mBuffers->getTerminalPositionLookupTable()); |
| 94 | Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), |
| 95 | mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, |
| 96 | &shortcutPolicy); |
| 97 | |
| 98 | DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); |
| 99 | readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); |
| 100 | DynamicPtGcEventListeners |
| 101 | ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted |
| 102 | traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( |
| 103 | &ptNodeWriter); |
| 104 | if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( |
| 105 | &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { |
| 106 | return false; |
| 107 | } |
| 108 | const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted |
| 109 | .getValidUnigramCount(); |
| 110 | const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); |
| 111 | if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { |
| 112 | if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { |
| 113 | AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, |
| 114 | maxUnigramCount); |
| 115 | return false; |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); |
| 120 | DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability |
| 121 | traversePolicyToUpdateBigramProbability(&ptNodeWriter); |
| 122 | if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( |
| 123 | &traversePolicyToUpdateBigramProbability)) { |
| 124 | return false; |
| 125 | } |
| 126 | const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); |
| 127 | const int maxBigramCount = headerPolicy->getMaxBigramCount(); |
| 128 | if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { |
| 129 | if (!truncateBigrams(maxBigramCount)) { |
| 130 | AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); |
| 131 | return false; |
| 132 | } |
| 133 | } |
| 134 | |
| 135 | // Mapping from positions in mBuffer to positions in bufferToWrite. |
| 136 | PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; |
| 137 | readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); |
| 138 | Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), |
| 139 | buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, |
| 140 | &shortcutPolicy); |
| 141 | DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer |
| 142 | traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, |
| 143 | buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); |
| 144 | if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( |
| 145 | &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { |
| 146 | return false; |
| 147 | } |
| 148 | |
| 149 | // Create policy instances for the GCed dictionary. |
| 150 | Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), |
| 151 | buffersToWrite->getProbabilityDictContent(), headerPolicy); |
| 152 | Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); |
| 153 | Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), |
| 154 | buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); |
| 155 | Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), |
| 156 | buffersToWrite->getTerminalPositionLookupTable()); |
| 157 | Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), |
| 158 | buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, |
| 159 | &newShortcutPolicy); |
| 160 | // Re-assign terminal IDs for valid terminal PtNodes. |
| 161 | TerminalPositionLookupTable::TerminalIdMap terminalIdMap; |
| 162 | if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( |
| 163 | &terminalIdMap)) { |
| 164 | return false; |
| 165 | } |
| 166 | // Run GC for probability dict content. |
| 167 | if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, |
| 168 | mBuffers->getProbabilityDictContent())) { |
| 169 | return false; |
| 170 | } |
| 171 | // Run GC for bigram dict content. |
| 172 | if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, |
| 173 | mBuffers->getBigramDictContent(), outBigramCount)) { |
| 174 | return false; |
| 175 | } |
| 176 | // Run GC for shortcut dict content. |
| 177 | if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, |
| 178 | mBuffers->getShortcutDictContent())) { |
| 179 | return false; |
| 180 | } |
| 181 | DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); |
| 182 | newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); |
| 183 | DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields |
| 184 | traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); |
| 185 | if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( |
| 186 | &traversePolicyToUpdateAllPositionFields)) { |
| 187 | return false; |
| 188 | } |
| 189 | newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); |
| 190 | TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds |
| 191 | traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); |
| 192 | if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( |
| 193 | &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { |
| 194 | return false; |
| 195 | } |
| 196 | *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); |
| 197 | return true; |
| 198 | } |
| 199 | |
| 200 | bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( |
| 201 | const Ver4PatriciaTrieNodeReader *const ptNodeReader, |
| 202 | Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { |
| 203 | const TerminalPositionLookupTable *const terminalPosLookupTable = |
| 204 | mBuffers->getTerminalPositionLookupTable(); |
| 205 | const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); |
| 206 | std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> |
| 207 | priorityQueue; |
| 208 | for (int i = 0; i < nextTerminalId; ++i) { |
| 209 | const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); |
| 210 | if (terminalPos == NOT_A_DICT_POS) { |
| 211 | continue; |
| 212 | } |
| 213 | const ProbabilityEntry probabilityEntry = |
| 214 | mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); |
| 215 | const int probability = probabilityEntry.hasHistoricalInfo() ? |
| 216 | ForgettingCurveUtils::decodeProbability( |
| 217 | probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : |
| 218 | probabilityEntry.getProbability(); |
| 219 | priorityQueue.push(DictProbability(terminalPos, probability, |
Keisuke Kuroyanagi | 287e155 | 2014-10-01 11:39:33 +0900 | [diff] [blame] | 220 | probabilityEntry.getHistoricalInfo()->getTimestamp())); |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 221 | } |
| 222 | |
| 223 | // Delete unigrams. |
| 224 | while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { |
| 225 | const int ptNodePos = priorityQueue.top().getDictPos(); |
Keisuke Kuroyanagi | 07e1412 | 2014-06-10 19:22:40 +0900 | [diff] [blame] | 226 | priorityQueue.pop(); |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 227 | const PtNodeParams ptNodeParams = |
Keisuke Kuroyanagi | 0fbca1a | 2014-06-20 14:46:13 +0900 | [diff] [blame] | 228 | ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); |
Keisuke Kuroyanagi | 07e1412 | 2014-06-10 19:22:40 +0900 | [diff] [blame] | 229 | if (ptNodeParams.representsNonWordInfo()) { |
| 230 | continue; |
| 231 | } |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 232 | if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { |
| 233 | AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); |
| 234 | return false; |
| 235 | } |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 236 | } |
| 237 | return true; |
| 238 | } |
| 239 | |
| 240 | bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { |
| 241 | const TerminalPositionLookupTable *const terminalPosLookupTable = |
| 242 | mBuffers->getTerminalPositionLookupTable(); |
| 243 | const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); |
| 244 | std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> |
| 245 | priorityQueue; |
| 246 | BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); |
| 247 | for (int i = 0; i < nextTerminalId; ++i) { |
| 248 | const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); |
| 249 | if (bigramListPos == NOT_A_DICT_POS) { |
| 250 | continue; |
| 251 | } |
| 252 | bool hasNext = true; |
| 253 | int readingPos = bigramListPos; |
| 254 | while (hasNext) { |
| 255 | const int entryPos = readingPos; |
| 256 | const BigramEntry bigramEntry = |
| 257 | bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); |
| 258 | hasNext = bigramEntry.hasNext(); |
| 259 | if (!bigramEntry.isValid()) { |
| 260 | continue; |
| 261 | } |
| 262 | const int probability = bigramEntry.hasHistoricalInfo() ? |
| 263 | ForgettingCurveUtils::decodeProbability( |
| 264 | bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : |
| 265 | bigramEntry.getProbability(); |
| 266 | priorityQueue.push(DictProbability(entryPos, probability, |
Keisuke Kuroyanagi | 287e155 | 2014-10-01 11:39:33 +0900 | [diff] [blame] | 267 | bigramEntry.getHistoricalInfo()->getTimestamp())); |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 268 | } |
| 269 | } |
| 270 | |
| 271 | // Delete bigrams. |
| 272 | while (static_cast<int>(priorityQueue.size()) > maxBigramCount) { |
| 273 | const int entryPos = priorityQueue.top().getDictPos(); |
| 274 | const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); |
| 275 | const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); |
| 276 | if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { |
| 277 | AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); |
| 278 | return false; |
| 279 | } |
| 280 | priorityQueue.pop(); |
| 281 | } |
| 282 | return true; |
| 283 | } |
| 284 | |
| 285 | bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds |
| 286 | ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { |
| 287 | if (!ptNodeParams->isTerminal()) { |
| 288 | return true; |
| 289 | } |
| 290 | TerminalPositionLookupTable::TerminalIdMap::const_iterator it = |
| 291 | mTerminalIdMap->find(ptNodeParams->getTerminalId()); |
| 292 | if (it == mTerminalIdMap->end()) { |
| 293 | AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", |
| 294 | ptNodeParams->getTerminalId(), mTerminalIdMap->size()); |
| 295 | return false; |
| 296 | } |
| 297 | if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { |
| 298 | AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); |
| 299 | } |
| 300 | return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); |
| 301 | } |
| 302 | |
Keisuke Kuroyanagi | 07e1412 | 2014-06-10 19:22:40 +0900 | [diff] [blame] | 303 | } // namespace v402 |
Keisuke Kuroyanagi | 6bf2681 | 2014-05-14 19:09:01 +0900 | [diff] [blame] | 304 | } // namespace backward |
| 305 | } // namespace latinime |