blob: a033d396b96631848746ff6598d0e7f9e17d16e3 [file] [log] [blame]
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +09001/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * !!!!! DO NOT EDIT THIS FILE !!!!!
19 *
20 * This file was generated from
21 * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
22 */
23
Keisuke Kuroyanagi07e14122014-06-10 19:22:40 +090024#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090025
26#include <cstring>
27#include <queue>
28
29#include "suggest/policyimpl/dictionary/header/header_policy.h"
Keisuke Kuroyanagi07e14122014-06-10 19:22:40 +090030#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
31#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
32#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
33#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
34#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
35#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
36#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090037#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
38#include "suggest/policyimpl/dictionary/utils/file_utils.h"
39#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
40
41namespace latinime {
42namespace backward {
Keisuke Kuroyanagi07e14122014-06-10 19:22:40 +090043namespace v402 {
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090044
45bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
Keisuke Kuroyanagie8750d92014-10-21 15:46:14 +090046 const EntryCounts &entryCounts) const {
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090047 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
48 BufferWithExtendableBuffer headerBuffer(
49 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
50 const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
51 + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
52 if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
Keisuke Kuroyanagie8750d92014-10-21 15:46:14 +090053 entryCounts, extendedRegionSize, &headerBuffer)) {
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090054 AKLOGE("Cannot write header structure to buffer. "
55 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
Keisuke Kuroyanagi0b8bb0c2014-10-21 20:20:11 +090056 "extendedRegionSize: %d", false, entryCounts.getUnigramCount(),
57 entryCounts.getBigramCount(), extendedRegionSize);
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090058 return false;
59 }
60 return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
61}
62
63bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
64 const char *const dictDirPath) {
65 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
66 Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
67 Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
68 Ver4DictConstants::MAX_DICTIONARY_SIZE));
69 int unigramCount = 0;
70 int bigramCount = 0;
71 if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
72 return false;
73 }
74 BufferWithExtendableBuffer headerBuffer(
75 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
76 if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
Keisuke Kuroyanagie8750d92014-10-21 15:46:14 +090077 EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */),
78 0 /* extendedRegionSize */, &headerBuffer)) {
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +090079 return false;
80 }
81 return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
82}
83
84bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
85 const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
86 int *const outUnigramCount, int *const outBigramCount) {
87 Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
88 mBuffers->getProbabilityDictContent(), headerPolicy);
89 Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
90 Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
91 mBuffers->getTerminalPositionLookupTable(), headerPolicy);
92 Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
93 mBuffers->getTerminalPositionLookupTable());
94 Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
95 mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
96 &shortcutPolicy);
97
98 DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
99 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
100 DynamicPtGcEventListeners
101 ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
102 traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
103 &ptNodeWriter);
104 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
105 &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
106 return false;
107 }
108 const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
109 .getValidUnigramCount();
110 const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
111 if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
112 if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
113 AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
114 maxUnigramCount);
115 return false;
116 }
117 }
118
119 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
120 DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
121 traversePolicyToUpdateBigramProbability(&ptNodeWriter);
122 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
123 &traversePolicyToUpdateBigramProbability)) {
124 return false;
125 }
126 const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
127 const int maxBigramCount = headerPolicy->getMaxBigramCount();
128 if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
129 if (!truncateBigrams(maxBigramCount)) {
130 AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
131 return false;
132 }
133 }
134
135 // Mapping from positions in mBuffer to positions in bufferToWrite.
136 PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
137 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
138 Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
139 buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
140 &shortcutPolicy);
141 DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
142 traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
143 buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
144 if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
145 &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
146 return false;
147 }
148
149 // Create policy instances for the GCed dictionary.
150 Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
151 buffersToWrite->getProbabilityDictContent(), headerPolicy);
152 Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
153 Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
154 buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
155 Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
156 buffersToWrite->getTerminalPositionLookupTable());
157 Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
158 buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
159 &newShortcutPolicy);
160 // Re-assign terminal IDs for valid terminal PtNodes.
161 TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
162 if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
163 &terminalIdMap)) {
164 return false;
165 }
166 // Run GC for probability dict content.
167 if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
168 mBuffers->getProbabilityDictContent())) {
169 return false;
170 }
171 // Run GC for bigram dict content.
172 if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
173 mBuffers->getBigramDictContent(), outBigramCount)) {
174 return false;
175 }
176 // Run GC for shortcut dict content.
177 if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
178 mBuffers->getShortcutDictContent())) {
179 return false;
180 }
181 DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
182 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
183 DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
184 traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
185 if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
186 &traversePolicyToUpdateAllPositionFields)) {
187 return false;
188 }
189 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
190 TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
191 traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
192 if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
193 &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
194 return false;
195 }
196 *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
197 return true;
198}
199
200bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
201 const Ver4PatriciaTrieNodeReader *const ptNodeReader,
202 Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
203 const TerminalPositionLookupTable *const terminalPosLookupTable =
204 mBuffers->getTerminalPositionLookupTable();
205 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
206 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
207 priorityQueue;
208 for (int i = 0; i < nextTerminalId; ++i) {
209 const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
210 if (terminalPos == NOT_A_DICT_POS) {
211 continue;
212 }
213 const ProbabilityEntry probabilityEntry =
214 mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
215 const int probability = probabilityEntry.hasHistoricalInfo() ?
216 ForgettingCurveUtils::decodeProbability(
217 probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
218 probabilityEntry.getProbability();
219 priorityQueue.push(DictProbability(terminalPos, probability,
Keisuke Kuroyanagi287e1552014-10-01 11:39:33 +0900220 probabilityEntry.getHistoricalInfo()->getTimestamp()));
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +0900221 }
222
223 // Delete unigrams.
224 while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
225 const int ptNodePos = priorityQueue.top().getDictPos();
Keisuke Kuroyanagi07e14122014-06-10 19:22:40 +0900226 priorityQueue.pop();
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +0900227 const PtNodeParams ptNodeParams =
Keisuke Kuroyanagi0fbca1a2014-06-20 14:46:13 +0900228 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
Keisuke Kuroyanagi07e14122014-06-10 19:22:40 +0900229 if (ptNodeParams.representsNonWordInfo()) {
230 continue;
231 }
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +0900232 if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
233 AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
234 return false;
235 }
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +0900236 }
237 return true;
238}
239
240bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
241 const TerminalPositionLookupTable *const terminalPosLookupTable =
242 mBuffers->getTerminalPositionLookupTable();
243 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
244 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
245 priorityQueue;
246 BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
247 for (int i = 0; i < nextTerminalId; ++i) {
248 const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
249 if (bigramListPos == NOT_A_DICT_POS) {
250 continue;
251 }
252 bool hasNext = true;
253 int readingPos = bigramListPos;
254 while (hasNext) {
255 const int entryPos = readingPos;
256 const BigramEntry bigramEntry =
257 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
258 hasNext = bigramEntry.hasNext();
259 if (!bigramEntry.isValid()) {
260 continue;
261 }
262 const int probability = bigramEntry.hasHistoricalInfo() ?
263 ForgettingCurveUtils::decodeProbability(
264 bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
265 bigramEntry.getProbability();
266 priorityQueue.push(DictProbability(entryPos, probability,
Keisuke Kuroyanagi287e1552014-10-01 11:39:33 +0900267 bigramEntry.getHistoricalInfo()->getTimestamp()));
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +0900268 }
269 }
270
271 // Delete bigrams.
272 while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
273 const int entryPos = priorityQueue.top().getDictPos();
274 const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
275 const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
276 if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
277 AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
278 return false;
279 }
280 priorityQueue.pop();
281 }
282 return true;
283}
284
285bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
286 ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
287 if (!ptNodeParams->isTerminal()) {
288 return true;
289 }
290 TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
291 mTerminalIdMap->find(ptNodeParams->getTerminalId());
292 if (it == mTerminalIdMap->end()) {
293 AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
294 ptNodeParams->getTerminalId(), mTerminalIdMap->size());
295 return false;
296 }
297 if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
298 AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
299 }
300 return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
301}
302
Keisuke Kuroyanagi07e14122014-06-10 19:22:40 +0900303} // namespace v402
Keisuke Kuroyanagi6bf26812014-05-14 19:09:01 +0900304} // namespace backward
305} // namespace latinime