Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2011 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | package com.android.inputmethod.latin.makedict; |
| 18 | |
| 19 | import com.android.inputmethod.annotations.UsedForTesting; |
| 20 | import com.android.inputmethod.latin.BinaryDictionary; |
Tadashi G. Takaoka | 5f00fe0 | 2014-10-20 14:48:56 +0900 | [diff] [blame] | 21 | import com.android.inputmethod.latin.Dictionary; |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 22 | import com.android.inputmethod.latin.NgramContext; |
| 23 | import com.android.inputmethod.latin.NgramContext.WordInfo; |
Jean Chalard | 4beeb92 | 2014-10-28 21:31:09 +0900 | [diff] [blame] | 24 | import com.android.inputmethod.latin.common.StringUtils; |
Keisuke Kuroyanagi | b24de42 | 2014-02-06 16:09:25 +0900 | [diff] [blame] | 25 | import com.android.inputmethod.latin.utils.CombinedFormatUtils; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 26 | |
| 27 | import java.util.ArrayList; |
| 28 | import java.util.Arrays; |
| 29 | |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 30 | import javax.annotation.Nullable; |
| 31 | |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 32 | /** |
| 33 | * Utility class for a word with a probability. |
| 34 | * |
| 35 | * This is chiefly used to iterate a dictionary. |
| 36 | */ |
| 37 | public final class WordProperty implements Comparable<WordProperty> { |
| 38 | public final String mWord; |
| 39 | public final ProbabilityInfo mProbabilityInfo; |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 40 | public final ArrayList<NgramProperty> mNgrams; |
Keisuke Kuroyanagi | 1adca93 | 2014-05-23 19:58:58 +0900 | [diff] [blame] | 41 | // TODO: Support mIsBeginningOfSentence. |
| 42 | public final boolean mIsBeginningOfSentence; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 43 | public final boolean mIsNotAWord; |
Adrian Velicu | 05172bf | 2014-10-14 12:13:11 +0900 | [diff] [blame] | 44 | public final boolean mIsPossiblyOffensive; |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 45 | public final boolean mHasNgrams; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 46 | |
| 47 | private int mHashCode = 0; |
| 48 | |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 49 | // TODO: Support n-gram. |
Keisuke Kuroyanagi | aa7abb2 | 2014-03-28 15:02:00 +0900 | [diff] [blame] | 50 | @UsedForTesting |
Keisuke Kuroyanagi | 8ffc631 | 2014-02-10 15:05:08 +0900 | [diff] [blame] | 51 | public WordProperty(final String word, final ProbabilityInfo probabilityInfo, |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 52 | @Nullable final ArrayList<WeightedString> bigrams, |
Adrian Velicu | 05172bf | 2014-10-14 12:13:11 +0900 | [diff] [blame] | 53 | final boolean isNotAWord, final boolean isPossiblyOffensive) { |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 54 | mWord = word; |
Keisuke Kuroyanagi | 8ffc631 | 2014-02-10 15:05:08 +0900 | [diff] [blame] | 55 | mProbabilityInfo = probabilityInfo; |
Jean Chalard | b28d1cc | 2014-10-03 17:55:26 +0900 | [diff] [blame] | 56 | if (null == bigrams) { |
| 57 | mNgrams = null; |
| 58 | } else { |
| 59 | mNgrams = new ArrayList<>(); |
| 60 | final NgramContext ngramContext = new NgramContext(new WordInfo(mWord)); |
Tadashi G. Takaoka | 5f00fe0 | 2014-10-20 14:48:56 +0900 | [diff] [blame] | 61 | for (final WeightedString bigramTarget : bigrams) { |
| 62 | mNgrams.add(new NgramProperty(bigramTarget, ngramContext)); |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 63 | } |
| 64 | } |
Keisuke Kuroyanagi | 1adca93 | 2014-05-23 19:58:58 +0900 | [diff] [blame] | 65 | mIsBeginningOfSentence = false; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 66 | mIsNotAWord = isNotAWord; |
Adrian Velicu | 05172bf | 2014-10-14 12:13:11 +0900 | [diff] [blame] | 67 | mIsPossiblyOffensive = isPossiblyOffensive; |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 68 | mHasNgrams = bigrams != null && !bigrams.isEmpty(); |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 69 | } |
| 70 | |
| 71 | private static ProbabilityInfo createProbabilityInfoFromArray(final int[] probabilityInfo) { |
| 72 | return new ProbabilityInfo( |
| 73 | probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_PROBABILITY_INDEX], |
| 74 | probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX], |
| 75 | probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_LEVEL_INDEX], |
| 76 | probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_COUNT_INDEX]); |
| 77 | } |
| 78 | |
| 79 | // Construct word property using information from native code. |
| 80 | // This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY. |
| 81 | public WordProperty(final int[] codePoints, final boolean isNotAWord, |
Dan Zivkovic | 12d80eb | 2015-02-10 14:54:38 -0800 | [diff] [blame] | 82 | final boolean isPossiblyOffensive, final boolean hasBigram, |
Keisuke Kuroyanagi | 88fa47a | 2014-06-24 12:37:07 +0900 | [diff] [blame] | 83 | final boolean isBeginningOfSentence, final int[] probabilityInfo, |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 84 | final ArrayList<int[][]> ngramPrevWordsArray, |
Keisuke Kuroyanagi | b5ef884 | 2014-10-22 18:15:53 +0900 | [diff] [blame] | 85 | final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray, |
Dan Zivkovic | 12d80eb | 2015-02-10 14:54:38 -0800 | [diff] [blame] | 86 | final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo) { |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 87 | mWord = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); |
| 88 | mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo); |
Jean Chalard | b28d1cc | 2014-10-03 17:55:26 +0900 | [diff] [blame] | 89 | final ArrayList<NgramProperty> ngrams = new ArrayList<>(); |
Keisuke Kuroyanagi | 88fa47a | 2014-06-24 12:37:07 +0900 | [diff] [blame] | 90 | mIsBeginningOfSentence = isBeginningOfSentence; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 91 | mIsNotAWord = isNotAWord; |
Adrian Velicu | 05172bf | 2014-10-14 12:13:11 +0900 | [diff] [blame] | 92 | mIsPossiblyOffensive = isPossiblyOffensive; |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 93 | mHasNgrams = hasBigram; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 94 | |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 95 | final int relatedNgramCount = ngramTargets.size(); |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 96 | for (int i = 0; i < relatedNgramCount; i++) { |
| 97 | final String ngramTargetString = |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 98 | StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i)); |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 99 | final WeightedString ngramTarget = new WeightedString(ngramTargetString, |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 100 | createProbabilityInfoFromArray(ngramProbabilityInfo.get(i))); |
Keisuke Kuroyanagi | b5ef884 | 2014-10-22 18:15:53 +0900 | [diff] [blame] | 101 | final int[][] prevWords = ngramPrevWordsArray.get(i); |
| 102 | final boolean[] isBeginningOfSentenceArray = |
| 103 | ngramPrevWordIsBeginningOfSentenceArray.get(i); |
| 104 | final WordInfo[] wordInfoArray = new WordInfo[prevWords.length]; |
| 105 | for (int j = 0; j < prevWords.length; j++) { |
| 106 | wordInfoArray[j] = isBeginningOfSentenceArray[j] |
| 107 | ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO |
| 108 | : new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray( |
| 109 | prevWords[j])); |
| 110 | } |
| 111 | final NgramContext ngramContext = new NgramContext(wordInfoArray); |
Jean Chalard | b28d1cc | 2014-10-03 17:55:26 +0900 | [diff] [blame] | 112 | ngrams.add(new NgramProperty(ngramTarget, ngramContext)); |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 113 | } |
Jean Chalard | b28d1cc | 2014-10-03 17:55:26 +0900 | [diff] [blame] | 114 | mNgrams = ngrams.isEmpty() ? null : ngrams; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 115 | } |
| 116 | |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 117 | // TODO: Remove |
Keisuke Kuroyanagi | b5ef884 | 2014-10-22 18:15:53 +0900 | [diff] [blame] | 118 | @UsedForTesting |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 119 | public ArrayList<WeightedString> getBigrams() { |
Jean Chalard | b28d1cc | 2014-10-03 17:55:26 +0900 | [diff] [blame] | 120 | if (null == mNgrams) { |
| 121 | return null; |
| 122 | } |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 123 | final ArrayList<WeightedString> bigrams = new ArrayList<>(); |
| 124 | for (final NgramProperty ngram : mNgrams) { |
| 125 | if (ngram.mNgramContext.getPrevWordCount() == 1) { |
| 126 | bigrams.add(ngram.mTargetWord); |
| 127 | } |
| 128 | } |
| 129 | return bigrams; |
| 130 | } |
| 131 | |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 132 | public int getProbability() { |
| 133 | return mProbabilityInfo.mProbability; |
| 134 | } |
| 135 | |
| 136 | private static int computeHashCode(WordProperty word) { |
| 137 | return Arrays.hashCode(new Object[] { |
| 138 | word.mWord, |
| 139 | word.mProbabilityInfo, |
Keisuke Kuroyanagi | c6a6f6a | 2014-10-01 11:21:08 +0900 | [diff] [blame] | 140 | word.mNgrams, |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 141 | word.mIsNotAWord, |
Adrian Velicu | 05172bf | 2014-10-14 12:13:11 +0900 | [diff] [blame] | 142 | word.mIsPossiblyOffensive |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 143 | }); |
| 144 | } |
| 145 | |
| 146 | /** |
| 147 | * Three-way comparison. |
| 148 | * |
| 149 | * A Word x is greater than a word y if x has a higher frequency. If they have the same |
| 150 | * frequency, they are sorted in lexicographic order. |
| 151 | */ |
| 152 | @Override |
| 153 | public int compareTo(final WordProperty w) { |
| 154 | if (getProbability() < w.getProbability()) return 1; |
| 155 | if (getProbability() > w.getProbability()) return -1; |
| 156 | return mWord.compareTo(w.mWord); |
| 157 | } |
| 158 | |
| 159 | /** |
| 160 | * Equality test. |
| 161 | * |
| 162 | * Words are equal if they have the same frequency, the same spellings, and the same |
| 163 | * attributes. |
| 164 | */ |
| 165 | @Override |
| 166 | public boolean equals(Object o) { |
| 167 | if (o == this) return true; |
| 168 | if (!(o instanceof WordProperty)) return false; |
| 169 | WordProperty w = (WordProperty)o; |
Dan Zivkovic | 12d80eb | 2015-02-10 14:54:38 -0800 | [diff] [blame] | 170 | return mProbabilityInfo.equals(w.mProbabilityInfo) |
| 171 | && mWord.equals(w.mWord) && equals(mNgrams, w.mNgrams) |
Adrian Velicu | 05172bf | 2014-10-14 12:13:11 +0900 | [diff] [blame] | 172 | && mIsNotAWord == w.mIsNotAWord && mIsPossiblyOffensive == w.mIsPossiblyOffensive |
Dan Zivkovic | 12d80eb | 2015-02-10 14:54:38 -0800 | [diff] [blame] | 173 | && mHasNgrams == w.mHasNgrams; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 174 | } |
| 175 | |
Keisuke Kuroyanagi | d7a51c2 | 2014-10-09 15:26:10 +0900 | [diff] [blame] | 176 | // TDOO: Have a utility method like java.util.Objects.equals. |
| 177 | private static <T> boolean equals(final ArrayList<T> a, final ArrayList<T> b) { |
Jean Chalard | b28d1cc | 2014-10-03 17:55:26 +0900 | [diff] [blame] | 178 | if (null == a) { |
| 179 | return null == b; |
| 180 | } |
| 181 | return a.equals(b); |
| 182 | } |
| 183 | |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 184 | @Override |
| 185 | public int hashCode() { |
| 186 | if (mHashCode == 0) { |
| 187 | mHashCode = computeHashCode(this); |
| 188 | } |
| 189 | return mHashCode; |
| 190 | } |
| 191 | |
| 192 | @UsedForTesting |
| 193 | public boolean isValid() { |
Tadashi G. Takaoka | 5f00fe0 | 2014-10-20 14:48:56 +0900 | [diff] [blame] | 194 | return getProbability() != Dictionary.NOT_A_PROBABILITY; |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 195 | } |
| 196 | |
| 197 | @Override |
| 198 | public String toString() { |
Keisuke Kuroyanagi | b24de42 | 2014-02-06 16:09:25 +0900 | [diff] [blame] | 199 | return CombinedFormatUtils.formatWordProperty(this); |
Keisuke Kuroyanagi | 5f5feeb | 2014-02-06 15:13:33 +0900 | [diff] [blame] | 200 | } |
| 201 | } |