| /* |
| * Copyright (C) 2014 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.inputmethod.latin.utils; |
| |
| import android.util.Log; |
| |
| import com.android.inputmethod.latin.Dictionary; |
| import com.android.inputmethod.latin.DictionaryFacilitator; |
| import com.android.inputmethod.latin.PrevWordsInfo; |
| import com.android.inputmethod.latin.settings.SpacingAndPunctuations; |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Locale; |
| |
| // Note: this class is used as a parameter type of a native method. You should be careful when you |
| // rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative(). |
| public final class LanguageModelParam { |
| private static final String TAG = LanguageModelParam.class.getSimpleName(); |
| private static final boolean DEBUG = false; |
| private static final boolean DEBUG_TOKEN = false; |
| |
| // For now, these probability values are being referred to only when we add new entries to |
| // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or |
| // non-0. Thus, it's not meaningful to compare 10, 100, and so on. |
| // TODO: Revise the logic in ForgettingCurveUtils in native code. |
| private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100; |
| private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY; |
| private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10; |
| private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY; |
| |
| public final String mTargetWord; |
| public final int[] mWord0; |
| public final int[] mWord1; |
| // TODO: this needs to be a list of shortcuts |
| public final int[] mShortcutTarget; |
| public final int mUnigramProbability; |
| public final int mBigramProbability; |
| public final int mShortcutProbability; |
| public final boolean mIsNotAWord; |
| public final boolean mIsBlacklisted; |
| // Time stamp in seconds. |
| public final int mTimestamp; |
| |
| // Constructor for unigram. TODO: support shortcuts |
| public LanguageModelParam(final String word, final int unigramProbability, |
| final int timestamp) { |
| this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp); |
| } |
| |
| // Constructor for unigram and bigram. |
| public LanguageModelParam(final String word0, final String word1, |
| final int unigramProbability, final int bigramProbability, |
| final int timestamp) { |
| mTargetWord = word1; |
| mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0); |
| mWord1 = StringUtils.toCodePointArray(word1); |
| mShortcutTarget = null; |
| mUnigramProbability = unigramProbability; |
| mBigramProbability = bigramProbability; |
| mShortcutProbability = Dictionary.NOT_A_PROBABILITY; |
| mIsNotAWord = false; |
| mIsBlacklisted = false; |
| mTimestamp = timestamp; |
| } |
| |
| // Process a list of words and return a list of {@link LanguageModelParam} objects. |
| public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom( |
| final List<String> tokens, final int timestamp, |
| final DictionaryFacilitator dictionaryFacilitator, |
| final SpacingAndPunctuations spacingAndPunctuations, |
| final DistracterFilter distracterFilter) { |
| final ArrayList<LanguageModelParam> languageModelParams = new ArrayList<>(); |
| final int N = tokens.size(); |
| PrevWordsInfo prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO; |
| for (int i = 0; i < N; ++i) { |
| final String tempWord = tokens.get(i); |
| if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) { |
| // just skip this token |
| if (DEBUG_TOKEN) { |
| Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\""); |
| } |
| continue; |
| } |
| if (!DictionaryInfoUtils.looksValidForDictionaryInsertion( |
| tempWord, spacingAndPunctuations)) { |
| if (DEBUG_TOKEN) { |
| Log.d(TAG, "--- not looksValidForDictionaryInsertion: \"" |
| + tempWord + "\""); |
| } |
| // Sentence terminator found. Split. |
| prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO; |
| continue; |
| } |
| if (DEBUG_TOKEN) { |
| Log.d(TAG, "--- word: \"" + tempWord + "\""); |
| } |
| final LanguageModelParam languageModelParam = |
| detectWhetherVaildWordOrNotAndGetLanguageModelParam( |
| prevWordsInfo, tempWord, timestamp, dictionaryFacilitator, |
| distracterFilter); |
| if (languageModelParam == null) { |
| continue; |
| } |
| languageModelParams.add(languageModelParam); |
| prevWordsInfo = new PrevWordsInfo(languageModelParam.mTargetWord); |
| } |
| return languageModelParams; |
| } |
| |
| private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam( |
| final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp, |
| final DictionaryFacilitator dictionaryFacilitator, |
| final DistracterFilter distracterFilter) { |
| final Locale locale = dictionaryFacilitator.getLocale(); |
| if (locale == null) { |
| return null; |
| } |
| // TODO: Though targetWord is an IV (in-vocabulary) word, we should still apply |
| // distracterFilter in the following code. If targetWord is a distracter, |
| // it should be filtered out. |
| if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) { |
| return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp, |
| true /* isValidWord */, locale); |
| } |
| |
| final String lowerCaseTargetWord = targetWord.toLowerCase(locale); |
| if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) { |
| // Add the lower-cased word. |
| return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCaseTargetWord, |
| timestamp, true /* isValidWord */, locale); |
| } |
| |
| // Treat the word as an OOV word. The following statement checks whether this OOV |
| // is a distracter to words in dictionaries. Being a distracter means the OOV word is |
| // too close to a common word in dictionaries (e.g., the OOV "mot" is very close to "not"). |
| // Adding such a word to dictonaries would interfere with entering in-dictionary words. For |
| // example, adding "mot" to dictionaries might interfere with entering "not". |
| // This kind of OOV should be filtered out. |
| if (distracterFilter.isDistracterToWordsInDictionaries(prevWordsInfo, targetWord, locale)) { |
| return null; |
| } |
| return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp, |
| false /* isValidWord */, locale); |
| } |
| |
| private static LanguageModelParam createAndGetLanguageModelParamOfWord( |
| final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp, |
| final boolean isValidWord, final Locale locale) { |
| final String word; |
| if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST |
| && prevWordsInfo.mPrevWord == null && !isValidWord) { |
| word = targetWord.toLowerCase(locale); |
| } else { |
| word = targetWord; |
| } |
| final int unigramProbability = isValidWord ? |
| UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD; |
| if (prevWordsInfo.mPrevWord == null) { |
| if (DEBUG) { |
| Log.d(TAG, "--- add unigram: current(" |
| + (isValidWord ? "Valid" : "OOV") + ") = " + word); |
| } |
| return new LanguageModelParam(word, unigramProbability, timestamp); |
| } |
| if (DEBUG) { |
| Log.d(TAG, "--- add bigram: prev = " + prevWordsInfo.mPrevWord + ", current(" |
| + (isValidWord ? "Valid" : "OOV") + ") = " + word); |
| } |
| final int bigramProbability = isValidWord ? |
| BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD; |
| return new LanguageModelParam(prevWordsInfo.mPrevWord, word, unigramProbability, |
| bigramProbability, timestamp); |
| } |
| } |