| /* |
| * Copyright (C) 2013 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.inputmethod.latin; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertFalse; |
| import static org.junit.Assert.assertTrue; |
| import static org.junit.Assert.fail; |
| |
| import android.text.TextUtils; |
| import android.util.Pair; |
| |
| import androidx.test.InstrumentationRegistry; |
| import androidx.test.filters.LargeTest; |
| import androidx.test.runner.AndroidJUnit4; |
| |
| import com.android.inputmethod.latin.NgramContext.WordInfo; |
| import com.android.inputmethod.latin.common.CodePointUtils; |
| import com.android.inputmethod.latin.common.FileUtils; |
| import com.android.inputmethod.latin.makedict.DictionaryHeader; |
| import com.android.inputmethod.latin.makedict.FormatSpec; |
| import com.android.inputmethod.latin.makedict.WeightedString; |
| import com.android.inputmethod.latin.makedict.WordProperty; |
| import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; |
| |
| import org.junit.After; |
| import org.junit.Before; |
| import org.junit.Test; |
| import org.junit.runner.RunWith; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Locale; |
| import java.util.Random; |
| |
| @LargeTest |
| @RunWith(AndroidJUnit4.class) |
| public class BinaryDictionaryTests { |
| private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; |
| private static final String TEST_LOCALE = "test"; |
| private static final String DICTIONARY_ID = "TestBinaryDictionary"; |
| |
| private HashSet<File> mDictFilesToBeDeleted = new HashSet<>(); |
| |
| @Before |
| public void setUp() throws Exception { |
| mDictFilesToBeDeleted.clear(); |
| } |
| |
| @After |
| public void tearDown() throws Exception { |
| for (final File dictFile : mDictFilesToBeDeleted) { |
| dictFile.delete(); |
| } |
| mDictFilesToBeDeleted.clear(); |
| } |
| |
| private File createEmptyDictionaryAndGetFile(final int formatVersion) { |
| return createEmptyDictionaryWithAttributesAndGetFile(formatVersion, |
| new HashMap<String, String>()); |
| } |
| |
| private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, |
| final HashMap<String, String> attributeMap) { |
| try { |
| final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, |
| attributeMap); |
| mDictFilesToBeDeleted.add(dictFile); |
| return dictFile; |
| } catch (final IOException e) { |
| fail(e.toString()); |
| } |
| return null; |
| } |
| |
| private File createEmptyVer4DictionaryAndGetFile(final int formatVersion, |
| final HashMap<String, String> attributeMap) throws IOException { |
| final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, |
| InstrumentationRegistry.getTargetContext().getCacheDir()); |
| file.delete(); |
| file.mkdir(); |
| if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, |
| Locale.ENGLISH, attributeMap)) { |
| return file; |
| } |
| throw new IOException("Empty dictionary " + file.getAbsolutePath() |
| + " cannot be created. Format version: " + formatVersion); |
| } |
| |
| private static BinaryDictionary getBinaryDictionary(final File dictFile) { |
| return new BinaryDictionary(dictFile.getAbsolutePath(), |
| 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, |
| Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); |
| } |
| |
| private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) { |
| final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); |
| return new BinaryDictionary(dictFile.getAbsolutePath(), |
| 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, |
| Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); |
| } |
| |
| @Test |
| public void testIsValidDictionary() { |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); |
| assertTrue("binaryDictionary must be valid for existing valid dictionary file.", |
| binaryDictionary.isValidDictionary()); |
| binaryDictionary.close(); |
| assertFalse("binaryDictionary must be invalid after closing.", |
| binaryDictionary.isValidDictionary()); |
| FileUtils.deleteRecursively(dictFile); |
| binaryDictionary = getBinaryDictionary(dictFile); |
| assertFalse("binaryDictionary must be invalid for not existing dictionary file.", |
| binaryDictionary.isValidDictionary()); |
| binaryDictionary.close(); |
| } |
| |
| @Test |
| public void testConstructingDictionaryOnMemory() { |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| FileUtils.deleteRecursively(dictFile); |
| assertFalse(dictFile.exists()); |
| final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), |
| true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, |
| FormatSpec.VERSION403, new HashMap<String, String>()); |
| assertTrue(binaryDictionary.isValidDictionary()); |
| assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); |
| final int probability = 100; |
| addUnigramWord(binaryDictionary, "word", probability); |
| assertEquals(probability, binaryDictionary.getFrequency("word")); |
| assertFalse(dictFile.exists()); |
| binaryDictionary.flush(); |
| assertTrue(dictFile.exists()); |
| assertTrue(binaryDictionary.isValidDictionary()); |
| assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); |
| assertEquals(probability, binaryDictionary.getFrequency("word")); |
| binaryDictionary.close(); |
| } |
| |
| @Test |
| public void testAddTooLongWord() { |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| final StringBuffer stringBuilder = new StringBuffer(); |
| for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) { |
| stringBuilder.append('a'); |
| } |
| final String validLongWord = stringBuilder.toString(); |
| stringBuilder.append('a'); |
| final String invalidLongWord = stringBuilder.toString(); |
| final int probability = 100; |
| addUnigramWord(binaryDictionary, "aaa", probability); |
| addUnigramWord(binaryDictionary, validLongWord, probability); |
| addUnigramWord(binaryDictionary, invalidLongWord, probability); |
| // Too long short cut. |
| binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */, |
| false /* isNotAWord */, false /* isPossiblyOffensive */, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP); |
| addUnigramWord(binaryDictionary, "abc", probability); |
| final int updatedProbability = 200; |
| // Update. |
| addUnigramWord(binaryDictionary, validLongWord, updatedProbability); |
| addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); |
| addUnigramWord(binaryDictionary, "abc", updatedProbability); |
| |
| assertEquals(probability, binaryDictionary.getFrequency("aaa")); |
| assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); |
| assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord)); |
| assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); |
| } |
| |
| private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, |
| final int probability) { |
| binaryDictionary.addUnigramEntry(word, probability, |
| false /* isBeginningOfSentence */, false /* isNotAWord */, |
| false /* isPossiblyOffensive */, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); |
| } |
| |
| private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, |
| final String word1, final int probability) { |
| binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); |
| } |
| |
| private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, |
| final String word1, final String word2, final int probability) { |
| binaryDictionary.addNgramEntry( |
| new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2, |
| probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); |
| } |
| |
| private static boolean isValidBigram(final BinaryDictionary binaryDictionary, |
| final String word0, final String word1) { |
| return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1); |
| } |
| |
| private static int getBigramProbability(final BinaryDictionary binaryDictionary, |
| final String word0, final String word1) { |
| return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1); |
| } |
| |
| private static int getTrigramProbability(final BinaryDictionary binaryDictionary, |
| final String word0, final String word1, final String word2) { |
| return binaryDictionary.getNgramProbability( |
| new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2); |
| } |
| |
| @Test |
| public void testAddUnigramWord() { |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| final int probability = 100; |
| addUnigramWord(binaryDictionary, "aaa", probability); |
| // Reallocate and create. |
| addUnigramWord(binaryDictionary, "aab", probability); |
| // Insert into children. |
| addUnigramWord(binaryDictionary, "aac", probability); |
| // Make terminal. |
| addUnigramWord(binaryDictionary, "aa", probability); |
| // Create children. |
| addUnigramWord(binaryDictionary, "aaaa", probability); |
| // Reallocate and make termianl. |
| addUnigramWord(binaryDictionary, "a", probability); |
| |
| final int updatedProbability = 200; |
| // Update. |
| addUnigramWord(binaryDictionary, "aaa", updatedProbability); |
| |
| assertEquals(probability, binaryDictionary.getFrequency("aab")); |
| assertEquals(probability, binaryDictionary.getFrequency("aac")); |
| assertEquals(probability, binaryDictionary.getFrequency("aa")); |
| assertEquals(probability, binaryDictionary.getFrequency("aaaa")); |
| assertEquals(probability, binaryDictionary.getFrequency("a")); |
| assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); |
| } |
| |
| @Test |
| public void testRandomlyAddUnigramWord() { |
| final int wordCount = 1000; |
| final int codePointSetSize = 50; |
| final long seed = System.currentTimeMillis(); |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| |
| final HashMap<String, Integer> probabilityMap = new HashMap<>(); |
| // Test a word that isn't contained within the dictionary. |
| final Random random = new Random(seed); |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| for (int i = 0; i < wordCount; ++i) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| probabilityMap.put(word, random.nextInt(0xFF)); |
| } |
| for (String word : probabilityMap.keySet()) { |
| addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); |
| } |
| for (String word : probabilityMap.keySet()) { |
| assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); |
| } |
| } |
| |
| @Test |
| public void testAddBigramWords() { |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| |
| final int unigramProbability = 100; |
| final int bigramProbability = 150; |
| final int updatedBigramProbability = 200; |
| addUnigramWord(binaryDictionary, "aaa", unigramProbability); |
| addUnigramWord(binaryDictionary, "abb", unigramProbability); |
| addUnigramWord(binaryDictionary, "bcc", unigramProbability); |
| addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); |
| addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); |
| addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); |
| addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); |
| |
| assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); |
| assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); |
| assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); |
| assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); |
| |
| addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); |
| assertEquals(updatedBigramProbability, |
| getBigramProbability(binaryDictionary, "aaa", "abb")); |
| |
| assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); |
| assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); |
| assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); |
| assertEquals(Dictionary.NOT_A_PROBABILITY, |
| getBigramProbability(binaryDictionary, "bcc", "aaa")); |
| assertEquals(Dictionary.NOT_A_PROBABILITY, |
| getBigramProbability(binaryDictionary, "bcc", "bbc")); |
| assertEquals(Dictionary.NOT_A_PROBABILITY, |
| getBigramProbability(binaryDictionary, "aaa", "aaa")); |
| |
| // Testing bigram link. |
| addUnigramWord(binaryDictionary, "abcde", unigramProbability); |
| addUnigramWord(binaryDictionary, "fghij", unigramProbability); |
| addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); |
| addUnigramWord(binaryDictionary, "fgh", unigramProbability); |
| addUnigramWord(binaryDictionary, "abc", unigramProbability); |
| addUnigramWord(binaryDictionary, "f", unigramProbability); |
| |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij")); |
| assertEquals(Dictionary.NOT_A_PROBABILITY, |
| getBigramProbability(binaryDictionary, "abcde", "fgh")); |
| addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); |
| assertEquals(updatedBigramProbability, |
| getBigramProbability(binaryDictionary, "abcde", "fghij")); |
| } |
| |
| @Test |
| public void testRandomlyAddBigramWords() { |
| final int wordCount = 100; |
| final int bigramCount = 1000; |
| final int codePointSetSize = 50; |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); |
| final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); |
| |
| for (int i = 0; i < wordCount; ++i) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| words.add(word); |
| final int unigramProbability = random.nextInt(0xFF); |
| unigramProbabilities.put(word, unigramProbability); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| } |
| |
| for (int i = 0; i < bigramCount; i++) { |
| final String word0 = words.get(random.nextInt(wordCount)); |
| final String word1 = words.get(random.nextInt(wordCount)); |
| if (TextUtils.equals(word0, word1)) { |
| continue; |
| } |
| final Pair<String, String> bigram = new Pair<>(word0, word1); |
| bigramWords.add(bigram); |
| final int unigramProbability = unigramProbabilities.get(word1); |
| final int bigramProbability = |
| unigramProbability + random.nextInt(0xFF - unigramProbability); |
| bigramProbabilities.put(bigram, bigramProbability); |
| addBigramWords(binaryDictionary, word0, word1, bigramProbability); |
| } |
| |
| for (final Pair<String, String> bigram : bigramWords) { |
| final int bigramProbability = bigramProbabilities.get(bigram); |
| assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, |
| isValidBigram(binaryDictionary, bigram.first, bigram.second)); |
| assertEquals(bigramProbability, |
| getBigramProbability(binaryDictionary, bigram.first, bigram.second)); |
| } |
| } |
| |
| @Test |
| public void testAddTrigramWords() { |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| final int unigramProbability = 100; |
| final int trigramProbability = 150; |
| final int updatedTrigramProbability = 200; |
| addUnigramWord(binaryDictionary, "aaa", unigramProbability); |
| addUnigramWord(binaryDictionary, "abb", unigramProbability); |
| addUnigramWord(binaryDictionary, "bcc", unigramProbability); |
| |
| addBigramWords(binaryDictionary, "abb", "bcc", 10); |
| addBigramWords(binaryDictionary, "abb", "aaa", 10); |
| |
| addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability); |
| addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability); |
| |
| assertEquals(trigramProbability, |
| getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc")); |
| assertEquals(trigramProbability, |
| getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); |
| assertFalse(isValidBigram(binaryDictionary, "aaa", "abb")); |
| |
| addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability); |
| assertEquals(updatedTrigramProbability, |
| getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); |
| } |
| |
| @Test |
| public void testFlushDictionary() { |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); |
| |
| final int probability = 100; |
| addUnigramWord(binaryDictionary, "aaa", probability); |
| addUnigramWord(binaryDictionary, "abcd", probability); |
| // Close without flushing. |
| binaryDictionary.close(); |
| |
| binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), |
| 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, |
| Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); |
| |
| assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); |
| assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); |
| |
| addUnigramWord(binaryDictionary, "aaa", probability); |
| addUnigramWord(binaryDictionary, "abcd", probability); |
| binaryDictionary.flush(); |
| binaryDictionary.close(); |
| |
| binaryDictionary = getBinaryDictionary(dictFile); |
| assertEquals(probability, binaryDictionary.getFrequency("aaa")); |
| assertEquals(probability, binaryDictionary.getFrequency("abcd")); |
| addUnigramWord(binaryDictionary, "bcde", probability); |
| binaryDictionary.flush(); |
| binaryDictionary.close(); |
| |
| binaryDictionary = getBinaryDictionary(dictFile); |
| assertEquals(probability, binaryDictionary.getFrequency("bcde")); |
| binaryDictionary.close(); |
| } |
| |
| @Test |
| public void testFlushWithGCDictionary() { |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); |
| final int unigramProbability = 100; |
| final int bigramProbability = 150; |
| addUnigramWord(binaryDictionary, "aaa", unigramProbability); |
| addUnigramWord(binaryDictionary, "abb", unigramProbability); |
| addUnigramWord(binaryDictionary, "bcc", unigramProbability); |
| addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); |
| addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); |
| addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); |
| addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); |
| binaryDictionary.flushWithGC(); |
| binaryDictionary.close(); |
| |
| binaryDictionary = getBinaryDictionary(dictFile); |
| assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); |
| assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); |
| assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); |
| assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); |
| assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); |
| assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); |
| assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); |
| binaryDictionary.flushWithGC(); |
| binaryDictionary.close(); |
| } |
| |
| @Test |
| public void testAddBigramWordsAndFlashWithGC() { |
| final int wordCount = 100; |
| final int bigramCount = 1000; |
| final int codePointSetSize = 30; |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); |
| final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); |
| |
| for (int i = 0; i < wordCount; ++i) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| words.add(word); |
| final int unigramProbability = random.nextInt(0xFF); |
| unigramProbabilities.put(word, unigramProbability); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| } |
| |
| for (int i = 0; i < bigramCount; i++) { |
| final String word0 = words.get(random.nextInt(wordCount)); |
| final String word1 = words.get(random.nextInt(wordCount)); |
| if (TextUtils.equals(word0, word1)) { |
| continue; |
| } |
| final Pair<String, String> bigram = new Pair<>(word0, word1); |
| bigramWords.add(bigram); |
| final int unigramProbability = unigramProbabilities.get(word1); |
| final int bigramProbability = |
| unigramProbability + random.nextInt(0xFF - unigramProbability); |
| bigramProbabilities.put(bigram, bigramProbability); |
| addBigramWords(binaryDictionary, word0, word1, bigramProbability); |
| } |
| |
| binaryDictionary.flushWithGC(); |
| binaryDictionary.close(); |
| binaryDictionary = getBinaryDictionary(dictFile); |
| |
| for (final Pair<String, String> bigram : bigramWords) { |
| final int bigramProbability = bigramProbabilities.get(bigram); |
| assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, |
| isValidBigram(binaryDictionary, bigram.first, bigram.second)); |
| assertEquals(bigramProbability, |
| getBigramProbability(binaryDictionary, bigram.first, bigram.second)); |
| } |
| } |
| |
| @Test |
| public void testRandomOperationsAndFlashWithGC() { |
| final int maxUnigramCount = 5000; |
| final int maxBigramCount = 10000; |
| final HashMap<String, String> attributeMap = new HashMap<>(); |
| attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); |
| attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); |
| |
| final int flashWithGCIterationCount = 50; |
| final int operationCountInEachIteration = 200; |
| final int initialUnigramCount = 100; |
| final float addUnigramProb = 0.5f; |
| final float addBigramProb = 0.8f; |
| final int codePointSetSize = 30; |
| |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, |
| attributeMap); |
| BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); |
| final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); |
| for (int i = 0; i < initialUnigramCount; ++i) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| words.add(word); |
| final int unigramProbability = random.nextInt(0xFF); |
| unigramProbabilities.put(word, unigramProbability); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| } |
| binaryDictionary.flushWithGC(); |
| binaryDictionary.close(); |
| |
| for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) { |
| binaryDictionary = getBinaryDictionary(dictFile); |
| for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) { |
| // Add unigram. |
| if (random.nextFloat() < addUnigramProb) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| words.add(word); |
| final int unigramProbability = random.nextInt(0xFF); |
| unigramProbabilities.put(word, unigramProbability); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| } |
| // Add bigram. |
| if (random.nextFloat() < addBigramProb && words.size() > 2) { |
| final int word0Index = random.nextInt(words.size()); |
| int word1Index = random.nextInt(words.size() - 1); |
| if (word0Index <= word1Index) { |
| word1Index++; |
| } |
| final String word0 = words.get(word0Index); |
| final String word1 = words.get(word1Index); |
| if (TextUtils.equals(word0, word1)) { |
| continue; |
| } |
| final int unigramProbability = unigramProbabilities.get(word1); |
| final int bigramProbability = |
| unigramProbability + random.nextInt(0xFF - unigramProbability); |
| final Pair<String, String> bigram = new Pair<>(word0, word1); |
| bigramWords.add(bigram); |
| bigramProbabilities.put(bigram, bigramProbability); |
| addBigramWords(binaryDictionary, word0, word1, bigramProbability); |
| } |
| } |
| |
| // Test whether the all unigram operations are collectlly handled. |
| for (int i = 0; i < words.size(); i++) { |
| final String word = words.get(i); |
| final int unigramProbability = unigramProbabilities.get(word); |
| assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); |
| } |
| // Test whether the all bigram operations are collectlly handled. |
| for (int i = 0; i < bigramWords.size(); i++) { |
| final Pair<String, String> bigram = bigramWords.get(i); |
| final int probability; |
| if (bigramProbabilities.containsKey(bigram)) { |
| probability = bigramProbabilities.get(bigram); |
| } else { |
| probability = Dictionary.NOT_A_PROBABILITY; |
| } |
| |
| assertEquals(probability, |
| getBigramProbability(binaryDictionary, bigram.first, bigram.second)); |
| assertEquals(probability != Dictionary.NOT_A_PROBABILITY, |
| isValidBigram(binaryDictionary, bigram.first, bigram.second)); |
| } |
| binaryDictionary.flushWithGC(); |
| binaryDictionary.close(); |
| } |
| } |
| |
| @Test |
| public void testAddManyUnigramsAndFlushWithGC() { |
| final int flashWithGCIterationCount = 3; |
| final int codePointSetSize = 50; |
| |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| |
| BinaryDictionary binaryDictionary; |
| for (int i = 0; i < flashWithGCIterationCount; i++) { |
| binaryDictionary = getBinaryDictionary(dictFile); |
| while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| words.add(word); |
| final int unigramProbability = random.nextInt(0xFF); |
| unigramProbabilities.put(word, unigramProbability); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| } |
| |
| for (int j = 0; j < words.size(); j++) { |
| final String word = words.get(j); |
| final int unigramProbability = unigramProbabilities.get(word); |
| assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); |
| } |
| |
| binaryDictionary.flushWithGC(); |
| binaryDictionary.close(); |
| } |
| } |
| |
| @Test |
| public void testUnigramAndBigramCount() { |
| final int maxUnigramCount = 5000; |
| final int maxBigramCount = 10000; |
| final HashMap<String, String> attributeMap = new HashMap<>(); |
| attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); |
| attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); |
| |
| final int flashWithGCIterationCount = 10; |
| final int codePointSetSize = 50; |
| final int unigramCountPerIteration = 1000; |
| final int bigramCountPerIteration = 2000; |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, |
| attributeMap); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final HashSet<Pair<String, String>> bigrams = new HashSet<>(); |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| |
| BinaryDictionary binaryDictionary; |
| for (int i = 0; i < flashWithGCIterationCount; i++) { |
| binaryDictionary = getBinaryDictionary(dictFile); |
| for (int j = 0; j < unigramCountPerIteration; j++) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| words.add(word); |
| final int unigramProbability = random.nextInt(0xFF); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| } |
| for (int j = 0; j < bigramCountPerIteration; j++) { |
| final String word0 = words.get(random.nextInt(words.size())); |
| final String word1 = words.get(random.nextInt(words.size())); |
| if (TextUtils.equals(word0, word1)) { |
| continue; |
| } |
| bigrams.add(new Pair<>(word0, word1)); |
| final int bigramProbability = random.nextInt(0xF); |
| addBigramWords(binaryDictionary, word0, word1, bigramProbability); |
| } |
| assertEquals(new HashSet<>(words).size(), Integer.parseInt( |
| binaryDictionary.getPropertyForGettingStats( |
| BinaryDictionary.UNIGRAM_COUNT_QUERY))); |
| assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( |
| binaryDictionary.getPropertyForGettingStats( |
| BinaryDictionary.BIGRAM_COUNT_QUERY))); |
| binaryDictionary.flushWithGC(); |
| assertEquals(new HashSet<>(words).size(), Integer.parseInt( |
| binaryDictionary.getPropertyForGettingStats( |
| BinaryDictionary.UNIGRAM_COUNT_QUERY))); |
| assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( |
| binaryDictionary.getPropertyForGettingStats( |
| BinaryDictionary.BIGRAM_COUNT_QUERY))); |
| binaryDictionary.close(); |
| } |
| } |
| |
| @Test |
| public void testGetWordProperties() { |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| final int UNIGRAM_COUNT = 1000; |
| final int BIGRAM_COUNT = 1000; |
| final int codePointSetSize = 20; |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); |
| final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); |
| |
| final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", |
| false /* isBeginningOfSentence */); |
| assertFalse(invalidWordProperty.isValid()); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final HashMap<String, Integer> wordProbabilities = new HashMap<>(); |
| final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); |
| final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); |
| |
| for (int i = 0; i < UNIGRAM_COUNT; i++) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| final int unigramProbability = random.nextInt(0xFF); |
| final boolean isNotAWord = random.nextBoolean(); |
| final boolean isPossiblyOffensive = random.nextBoolean(); |
| // TODO: Add tests for historical info. |
| binaryDictionary.addUnigramEntry(word, unigramProbability, |
| false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP); |
| if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { |
| binaryDictionary.flushWithGC(); |
| } |
| words.add(word); |
| wordProbabilities.put(word, unigramProbability); |
| final WordProperty wordProperty = binaryDictionary.getWordProperty(word, |
| false /* isBeginningOfSentence */); |
| assertEquals(word, wordProperty.mWord); |
| assertTrue(wordProperty.isValid()); |
| assertEquals(isNotAWord, wordProperty.mIsNotAWord); |
| assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive); |
| assertEquals(false, wordProperty.mHasNgrams); |
| assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability); |
| } |
| |
| for (int i = 0; i < BIGRAM_COUNT; i++) { |
| final int word0Index = random.nextInt(wordProbabilities.size()); |
| final int word1Index = random.nextInt(wordProbabilities.size()); |
| if (word0Index == word1Index) { |
| continue; |
| } |
| final String word0 = words.get(word0Index); |
| final String word1 = words.get(word1Index); |
| final int unigramProbability = wordProbabilities.get(word1); |
| final int bigramProbability = |
| unigramProbability + random.nextInt(0xFF - unigramProbability); |
| addBigramWords(binaryDictionary, word0, word1, bigramProbability); |
| if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { |
| binaryDictionary.flushWithGC(); |
| } |
| if (!bigrams.containsKey(word0)) { |
| final HashSet<String> bigramWord1s = new HashSet<>(); |
| bigrams.put(word0, bigramWord1s); |
| } |
| bigrams.get(word0).add(word1); |
| bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability); |
| } |
| |
| for (int i = 0; i < words.size(); i++) { |
| final String word0 = words.get(i); |
| if (!bigrams.containsKey(word0)) { |
| continue; |
| } |
| final HashSet<String> bigramWord1s = bigrams.get(word0); |
| final WordProperty wordProperty = binaryDictionary.getWordProperty(word0, |
| false /* isBeginningOfSentence */); |
| assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size()); |
| // TODO: Support ngram. |
| for (final WeightedString bigramTarget : wordProperty.getBigrams()) { |
| final String word1 = bigramTarget.mWord; |
| assertTrue(bigramWord1s.contains(word1)); |
| final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1)); |
| assertEquals(bigramProbability, bigramTarget.getProbability()); |
| } |
| } |
| } |
| |
| @Test |
| public void testIterateAllWords() { |
| final long seed = System.currentTimeMillis(); |
| final Random random = new Random(seed); |
| final int UNIGRAM_COUNT = 1000; |
| final int BIGRAM_COUNT = 1000; |
| final int codePointSetSize = 20; |
| final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| |
| final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", |
| false /* isBeginningOfSentence */); |
| assertFalse(invalidWordProperty.isValid()); |
| |
| final ArrayList<String> words = new ArrayList<>(); |
| final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>(); |
| final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); |
| final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater = |
| new HashMap<>(); |
| |
| for (int i = 0; i < UNIGRAM_COUNT; i++) { |
| final String word = CodePointUtils.generateWord(random, codePointSet); |
| final int unigramProbability = random.nextInt(0xFF); |
| addUnigramWord(binaryDictionary, word, unigramProbability); |
| if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { |
| binaryDictionary.flushWithGC(); |
| } |
| words.add(word); |
| wordProbabilitiesToCheckLater.put(word, unigramProbability); |
| } |
| |
| for (int i = 0; i < BIGRAM_COUNT; i++) { |
| final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size()); |
| final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size()); |
| if (word0Index == word1Index) { |
| continue; |
| } |
| final String word0 = words.get(word0Index); |
| final String word1 = words.get(word1Index); |
| final int unigramProbability = wordProbabilitiesToCheckLater.get(word1); |
| final int bigramProbability = |
| unigramProbability + random.nextInt(0xFF - unigramProbability); |
| addBigramWords(binaryDictionary, word0, word1, bigramProbability); |
| if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { |
| binaryDictionary.flushWithGC(); |
| } |
| if (!bigrams.containsKey(word0)) { |
| final HashSet<String> bigramWord1s = new HashSet<>(); |
| bigrams.put(word0, bigramWord1s); |
| } |
| bigrams.get(word0).add(word1); |
| bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability); |
| } |
| |
| final HashSet<String> wordSet = new HashSet<>(words); |
| final HashSet<Pair<String, String>> bigramSet = |
| new HashSet<>(bigramProbabilitiesToCheckLater.keySet()); |
| int token = 0; |
| do { |
| final BinaryDictionary.GetNextWordPropertyResult result = |
| binaryDictionary.getNextWordProperty(token); |
| final WordProperty wordProperty = result.mWordProperty; |
| final String word0 = wordProperty.mWord; |
| assertEquals((int)wordProbabilitiesToCheckLater.get(word0), |
| wordProperty.mProbabilityInfo.mProbability); |
| wordSet.remove(word0); |
| final HashSet<String> bigramWord1s = bigrams.get(word0); |
| // TODO: Support ngram. |
| if (wordProperty.mHasNgrams) { |
| for (final WeightedString bigramTarget : wordProperty.getBigrams()) { |
| final String word1 = bigramTarget.mWord; |
| assertTrue(bigramWord1s.contains(word1)); |
| final Pair<String, String> bigram = new Pair<>(word0, word1); |
| final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram); |
| assertEquals(bigramProbability, bigramTarget.getProbability()); |
| bigramSet.remove(bigram); |
| } |
| } |
| token = result.mNextToken; |
| } while (token != 0); |
| assertTrue(wordSet.isEmpty()); |
| assertTrue(bigramSet.isEmpty()); |
| } |
| |
| @Test |
| public void testPossiblyOffensiveAttributeMaintained() { |
| final BinaryDictionary binaryDictionary = |
| getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0); |
| WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false); |
| assertEquals(true, wordProperty.mIsPossiblyOffensive); |
| } |
| |
| @Test |
| public void testBeginningOfSentence() { |
| final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); |
| final int dummyProbability = 0; |
| final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE; |
| final int bigramProbability = 200; |
| addUnigramWord(binaryDictionary, "aaa", dummyProbability); |
| binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); |
| assertEquals(bigramProbability, |
| binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); |
| binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); |
| addUnigramWord(binaryDictionary, "bbb", dummyProbability); |
| binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability, |
| BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); |
| binaryDictionary.flushWithGC(); |
| assertEquals(bigramProbability, |
| binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); |
| assertEquals(bigramProbability, |
| binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb")); |
| } |
| } |