AI 143689: am: CL 143659 am: CL 143472 Reduce dictionary size.
Changed the tree structure to have variable length nodes to save an average of 21% on the dictionary size.
Created a shortened English dictionary for Dream - 50K words.
Added a shortened Spanish dictionary for Dream - 32K words.
Original author: yamasani
Merged from: //branches/cupcake/...
Original author: android-build
Merged from: //branches/donutburger/...
Automated import of CL 143689
diff --git a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
index cbe7028..8a8a677 100755
--- a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
+++ b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
@@ -45,6 +45,10 @@
public static final String TAG_WORD = "w";
public static final String ATTR_FREQ = "f";
+ private static final int FLAG_ADDRESS_MASK = 0x400000;
+ private static final int FLAG_TERMINAL_MASK = 0x800000;
+ private static final int ADDRESS_MASK = 0x3FFFFF;
+
public static final CharNode EMPTY_NODE = new CharNode();
List<CharNode> roots;
@@ -179,7 +183,7 @@
parent.children.add(child);
}
child.data = data;
- child.freq += occur;
+ if (child.freq == 0) child.freq = occur;
if (word.length() > charAt + 1) {
addWordRec(child, word, charAt + 1, occur);
} else {
@@ -195,56 +199,76 @@
static final int ADDR_WIDTH = 23; // Offset to children
static final int FREQ_WIDTH_BYTES = 1;
static final int COUNT_WIDTH_BYTES = 1;
- static final int NODE_SIZE_BYTES =
- (CHAR_WIDTH + FLAGS_WIDTH + ADDR_WIDTH) / 8 + FREQ_WIDTH_BYTES;
private void addCount(int count) {
dict[dictSize++] = (byte) (0xFF & count);
}
- /* TODO: Allow characters to be beyond the 0-255 range. This is required for some latin
- language not currently supported */
private void addNode(CharNode node) {
int charData = 0xFFFF & node.data;
if (charData > 254) {
- System.out.println("WARNING: Non-ASCII character encountered : " + node.data +
- ", value = " + charData);
- dict[dictSize++] = '@';
+ dict[dictSize++] = (byte) 255;
+ dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
+ dict[dictSize++] = (byte) (node.data & 0xFF);
} else {
dict[dictSize++] = (byte) (0xFF & node.data);
}
- dictSize += 3; // Space for children address
- if ((0xFFFFFF & node.freq) > 255) {
- node.freq = (byte) 255;
+ if (node.children != null) {
+ dictSize += 3; // Space for children address
+ } else {
+ dictSize += 1; // Space for just the terminal/address flags
}
- dict[dictSize++] = (byte) (0xFF & node.freq);
+ if ((0xFFFFFF & node.freq) > 255) {
+ node.freq = 255;
+ }
+ if (node.terminal) {
+ byte freq = (byte) (0xFF & node.freq);
+ dict[dictSize++] = freq;
+ }
}
+ int nullChildrenCount = 0;
+ int notTerminalCount = 0;
+
private void updateNodeAddress(int nodeAddress, CharNode node,
int childrenAddress) {
- childrenAddress = 0x7FFFFF & childrenAddress;
+ if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
+ nodeAddress += 2;
+ }
+ childrenAddress = ADDRESS_MASK & childrenAddress;
+ if (childrenAddress == 0) {
+ nullChildrenCount++;
+ } else {
+ childrenAddress |= FLAG_ADDRESS_MASK;
+ }
if (node.terminal) {
- childrenAddress |= 0x800000;
+ childrenAddress |= FLAG_TERMINAL_MASK;
+ } else {
+ notTerminalCount++;
}
dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
- dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
- dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
+ if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
+ dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
+ dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
+ }
}
void writeWordsRec(List<CharNode> children) {
if (children == null || children.size() == 0) {
return;
}
- addCount(children.size());
- int childrenStart = dictSize;
- for (int j = 0; j < children.size(); j++) {
+ final int childCount = children.size();
+ addCount(childCount);
+ //int childrenStart = dictSize;
+ int[] childrenAddresses = new int[childCount];
+ for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
+ childrenAddresses[j] = dictSize;
addNode(node);
}
- for (int j = 0; j < children.size(); j++) {
+ for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
- // TODO: Fix this when child length becomes variable
- int nodeAddress = childrenStart + NODE_SIZE_BYTES * j;
+ int nodeAddress = childrenAddresses[j];
int cacheDictSize = dictSize;
writeWordsRec(node.children);
updateNodeAddress(nodeAddress, node, node.children != null
@@ -253,8 +277,8 @@
}
void writeToDict(String dictFilename) {
- // 2MB max
- dict = new byte[2 * 1024 * 1024]; // 2MB upper limit. Actual is probably
+ // 4MB max, 22-bit offsets
+ dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably
// < 1MB in most cases, as there is a limit in the
// resource size in apks.
dictSize = 0;
@@ -272,19 +296,29 @@
void traverseDict(int pos, char[] word, int depth) {
int count = dict[pos++] & 0xFF;
for (int i = 0; i < count; i++) {
- char c = (char) (dict[pos] & 0xFF);
- word[depth] = c;
- if ((dict[pos + 1] & 0x80) > 0) {
- showWord(word, depth + 1, dict[pos + 4] & 0xFF);
+ char c = (char) (dict[pos++] & 0xFF);
+ if (c == 0xFF) {
+ c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
+ pos += 2;
}
- int address =
- ((dict[pos + 1] & 0x7F) << 16)
- | ((dict[pos + 2] & 0xFF) << 8)
- | ((dict[pos + 3] & 0xFF));
+ word[depth] = c;
+ boolean terminal = (dict[pos] & 0x80) > 0;
+ int address = 0;
+ if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) {
+ address =
+ ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16)
+ | ((dict[pos + 1] & 0xFF) << 8)
+ | ((dict[pos + 2] & 0xFF));
+ pos += 2;
+ }
+ pos++;
+ if (terminal) {
+ showWord(word, depth + 1, dict[pos] & 0xFF);
+ pos++;
+ }
if (address != 0) {
traverseDict(address, word, depth + 1);
}
- pos += NODE_SIZE_BYTES;
}
}