blob: 4a598155496028e00107ecf3edec8af578874635 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 1999-2003 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26/*
27 *
28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30 *
31 * The original version of this source code and documentation
32 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33 * subsidiary of IBM. These materials are provided under terms
34 * of a License Agreement between Taligent and Sun. This technology
35 * is protected by multiple US and International patents.
36 *
37 * This notice and attribution to Taligent may not be removed.
38 * Taligent is a registered trademark of Taligent, Inc.
39 */
40
41package java.text;
42
43import java.util.Vector;
44import java.util.Stack;
45import java.util.Hashtable;
46import java.text.CharacterIterator;
47import java.io.InputStream;
48import java.io.IOException;
49
50/**
51 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
52 * to further subdivide ranges of text beyond what is possible using just the
53 * state-table-based algorithm. This is necessary, for example, to handle
54 * word and line breaking in Thai, which doesn't use spaces between words. The
55 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
56 * up text as far as possible, and then contiguous ranges of letters are
57 * repeatedly compared against a list of known words (i.e., the dictionary)
58 * to divide them up into words.
59 *
60 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
61 * but adds one more special substitution name: <dictionary>. This substitution
62 * name is used to identify characters in words in the dictionary. The idea is that
63 * if the iterator passes over a chunk of text that includes two or more characters
64 * in a row that are included in <dictionary>, it goes back through that range and
65 * derives additional break positions (if possible) using the dictionary.
66 *
67 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
68 * file. It follows a prescribed search path to locate the dictionary (right now,
69 * it looks for it in /com/ibm/text/resources in each directory in the classpath,
70 * and won't find it in JAR files, but this location is likely to change). The
71 * dictionary file is in a serialized binary format. We have a very primitive (and
72 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
73 * currently making it public. Contact us for help.
74 */
75class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
76
77 /**
78 * a list of known words that is used to divide up contiguous ranges of letters,
79 * stored in a compressed, indexed, format that offers fast access
80 */
81 private BreakDictionary dictionary;
82
83 /**
84 * a list of flags indicating which character categories are contained in
85 * the dictionary file (this is used to determine which ranges of characters
86 * to apply the dictionary to)
87 */
88 private boolean[] categoryFlags;
89
90 /**
91 * a temporary hiding place for the number of dictionary characters in the
92 * last range passed over by next()
93 */
94 private int dictionaryCharCount;
95
96 /**
97 * when a range of characters is divided up using the dictionary, the break
98 * positions that are discovered are stored here, preventing us from having
99 * to use either the dictionary or the state table again until the iterator
100 * leaves this range of text
101 */
102 private int[] cachedBreakPositions;
103
104 /**
105 * if cachedBreakPositions is not null, this indicates which item in the
106 * cache the current iteration position refers to
107 */
108 private int positionInCache;
109
110 /**
111 * Constructs a DictionaryBasedBreakIterator.
112 * @param description Same as the description parameter on RuleBasedBreakIterator,
113 * except for the special meaning of "<dictionary>". This parameter is just
114 * passed through to RuleBasedBreakIterator's constructor.
115 * @param dictionaryFilename The filename of the dictionary file to use
116 */
117 public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
118 throws IOException {
119 super(dataFile);
120 byte[] tmp = super.getAdditionalData();
121 if (tmp != null) {
122 prepareCategoryFlags(tmp);
123 super.setAdditionalData(null);
124 }
125 dictionary = new BreakDictionary(dictionaryFile);
126 }
127
128 private void prepareCategoryFlags(byte[] data) {
129 categoryFlags = new boolean[data.length];
130 for (int i = 0; i < data.length; i++) {
131 categoryFlags[i] = (data[i] == (byte)1) ? true : false;
132 }
133 }
134
135 public void setText(CharacterIterator newText) {
136 super.setText(newText);
137 cachedBreakPositions = null;
138 dictionaryCharCount = 0;
139 positionInCache = 0;
140 }
141
142 /**
143 * Sets the current iteration position to the beginning of the text.
144 * (i.e., the CharacterIterator's starting offset).
145 * @return The offset of the beginning of the text.
146 */
147 public int first() {
148 cachedBreakPositions = null;
149 dictionaryCharCount = 0;
150 positionInCache = 0;
151 return super.first();
152 }
153
154 /**
155 * Sets the current iteration position to the end of the text.
156 * (i.e., the CharacterIterator's ending offset).
157 * @return The text's past-the-end offset.
158 */
159 public int last() {
160 cachedBreakPositions = null;
161 dictionaryCharCount = 0;
162 positionInCache = 0;
163 return super.last();
164 }
165
166 /**
167 * Advances the iterator one step backwards.
168 * @return The position of the last boundary position before the
169 * current iteration position
170 */
171 public int previous() {
172 CharacterIterator text = getText();
173
174 // if we have cached break positions and we're still in the range
175 // covered by them, just move one step backward in the cache
176 if (cachedBreakPositions != null && positionInCache > 0) {
177 --positionInCache;
178 text.setIndex(cachedBreakPositions[positionInCache]);
179 return cachedBreakPositions[positionInCache];
180 }
181
182 // otherwise, dump the cache and use the inherited previous() method to move
183 // backward. This may fill up the cache with new break positions, in which
184 // case we have to mark our position in the cache
185 else {
186 cachedBreakPositions = null;
187 int result = super.previous();
188 if (cachedBreakPositions != null) {
189 positionInCache = cachedBreakPositions.length - 2;
190 }
191 return result;
192 }
193 }
194
195 /**
196 * Sets the current iteration position to the last boundary position
197 * before the specified position.
198 * @param offset The position to begin searching from
199 * @return The position of the last boundary before "offset"
200 */
201 public int preceding(int offset) {
202 CharacterIterator text = getText();
203 checkOffset(offset, text);
204
205 // if we have no cached break positions, or "offset" is outside the
206 // range covered by the cache, we can just call the inherited routine
207 // (which will eventually call other routines in this class that may
208 // refresh the cache)
209 if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
210 offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
211 cachedBreakPositions = null;
212 return super.preceding(offset);
213 }
214
215 // on the other hand, if "offset" is within the range covered by the cache,
216 // then all we have to do is search the cache for the last break position
217 // before "offset"
218 else {
219 positionInCache = 0;
220 while (positionInCache < cachedBreakPositions.length
221 && offset > cachedBreakPositions[positionInCache]) {
222 ++positionInCache;
223 }
224 --positionInCache;
225 text.setIndex(cachedBreakPositions[positionInCache]);
226 return text.getIndex();
227 }
228 }
229
230 /**
231 * Sets the current iteration position to the first boundary position after
232 * the specified position.
233 * @param offset The position to begin searching forward from
234 * @return The position of the first boundary after "offset"
235 */
236 public int following(int offset) {
237 CharacterIterator text = getText();
238 checkOffset(offset, text);
239
240 // if we have no cached break positions, or if "offset" is outside the
241 // range covered by the cache, then dump the cache and call our
242 // inherited following() method. This will call other methods in this
243 // class that may refresh the cache.
244 if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
245 offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
246 cachedBreakPositions = null;
247 return super.following(offset);
248 }
249
250 // on the other hand, if "offset" is within the range covered by the
251 // cache, then just search the cache for the first break position
252 // after "offset"
253 else {
254 positionInCache = 0;
255 while (positionInCache < cachedBreakPositions.length
256 && offset >= cachedBreakPositions[positionInCache]) {
257 ++positionInCache;
258 }
259 text.setIndex(cachedBreakPositions[positionInCache]);
260 return text.getIndex();
261 }
262 }
263
264 /**
265 * This is the implementation function for next().
266 */
267 protected int handleNext() {
268 CharacterIterator text = getText();
269
270 // if there are no cached break positions, or if we've just moved
271 // off the end of the range covered by the cache, we have to dump
272 // and possibly regenerate the cache
273 if (cachedBreakPositions == null ||
274 positionInCache == cachedBreakPositions.length - 1) {
275
276 // start by using the inherited handleNext() to find a tentative return
277 // value. dictionaryCharCount tells us how many dictionary characters
278 // we passed over on our way to the tentative return value
279 int startPos = text.getIndex();
280 dictionaryCharCount = 0;
281 int result = super.handleNext();
282
283 // if we passed over more than one dictionary character, then we use
284 // divideUpDictionaryRange() to regenerate the cached break positions
285 // for the new range
286 if (dictionaryCharCount > 1 && result - startPos > 1) {
287 divideUpDictionaryRange(startPos, result);
288 }
289
290 // otherwise, the value we got back from the inherited fuction
291 // is our return value, and we can dump the cache
292 else {
293 cachedBreakPositions = null;
294 return result;
295 }
296 }
297
298 // if the cache of break positions has been regenerated (or existed all
299 // along), then just advance to the next break position in the cache
300 // and return it
301 if (cachedBreakPositions != null) {
302 ++positionInCache;
303 text.setIndex(cachedBreakPositions[positionInCache]);
304 return cachedBreakPositions[positionInCache];
305 }
306 return -9999; // SHOULD NEVER GET HERE!
307 }
308
309 /**
310 * Looks up a character category for a character.
311 */
312 protected int lookupCategory(int c) {
313 // this override of lookupCategory() exists only to keep track of whether we've
314 // passed over any dictionary characters. It calls the inherited lookupCategory()
315 // to do the real work, and then checks whether its return value is one of the
316 // categories represented in the dictionary. If it is, bump the dictionary-
317 // character count.
318 int result = super.lookupCategory(c);
319 if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
320 ++dictionaryCharCount;
321 }
322 return result;
323 }
324
325 /**
326 * This is the function that actually implements the dictionary-based
327 * algorithm. Given the endpoints of a range of text, it uses the
328 * dictionary to determine the positions of any boundaries in this
329 * range. It stores all the boundary positions it discovers in
330 * cachedBreakPositions so that we only have to do this work once
331 * for each time we enter the range.
332 */
333 private void divideUpDictionaryRange(int startPos, int endPos) {
334 CharacterIterator text = getText();
335
336 // the range we're dividing may begin or end with non-dictionary characters
337 // (i.e., for line breaking, we may have leading or trailing punctuation
338 // that needs to be kept with the word). Seek from the beginning of the
339 // range to the first dictionary character
340 text.setIndex(startPos);
341 int c = getCurrent();
342 int category = lookupCategory(c);
343 while (category == IGNORE || !categoryFlags[category]) {
344 c = getNext();
345 category = lookupCategory(c);
346 }
347
348 // initialize. We maintain two stacks: currentBreakPositions contains
349 // the list of break positions that will be returned if we successfully
350 // finish traversing the whole range now. possibleBreakPositions lists
351 // all other possible word ends we've passed along the way. (Whenever
352 // we reach an error [a sequence of characters that can't begin any word
353 // in the dictionary], we back up, possibly delete some breaks from
354 // currentBreakPositions, move a break from possibleBreakPositions
355 // to currentBreakPositions, and start over from there. This process
356 // continues in this way until we either successfully make it all the way
357 // across the range, or exhaust all of our combinations of break
358 // positions.)
359 Stack currentBreakPositions = new Stack();
360 Stack possibleBreakPositions = new Stack();
361 Vector wrongBreakPositions = new Vector();
362
363 // the dictionary is implemented as a trie, which is treated as a state
364 // machine. -1 represents the end of a legal word. Every word in the
365 // dictionary is represented by a path from the root node to -1. A path
366 // that ends in state 0 is an illegal combination of characters.
367 int state = 0;
368
369 // these two variables are used for error handling. We keep track of the
370 // farthest we've gotten through the range being divided, and the combination
371 // of breaks that got us that far. If we use up all possible break
372 // combinations, the text contains an error or a word that's not in the
373 // dictionary. In this case, we "bless" the break positions that got us the
374 // farthest as real break positions, and then start over from scratch with
375 // the character where the error occurred.
376 int farthestEndPoint = text.getIndex();
377 Stack bestBreakPositions = null;
378
379 // initialize (we always exit the loop with a break statement)
380 c = getCurrent();
381 while (true) {
382
383 // if we can transition to state "-1" from our current state, we're
384 // on the last character of a legal word. Push that position onto
385 // the possible-break-positions stack
386 if (dictionary.getNextState(state, 0) == -1) {
387 possibleBreakPositions.push(new Integer(text.getIndex()));
388 }
389
390 // look up the new state to transition to in the dictionary
391 state = dictionary.getNextStateFromCharacter(state, c);
392
393 // if the character we're sitting on causes us to transition to
394 // the "end of word" state, then it was a non-dictionary character
395 // and we've successfully traversed the whole range. Drop out
396 // of the loop.
397 if (state == -1) {
398 currentBreakPositions.push(new Integer(text.getIndex()));
399 break;
400 }
401
402 // if the character we're sitting on causes us to transition to
403 // the error state, or if we've gone off the end of the range
404 // without transitioning to the "end of word" state, we've hit
405 // an error...
406 else if (state == 0 || text.getIndex() >= endPos) {
407
408 // if this is the farthest we've gotten, take note of it in
409 // case there's an error in the text
410 if (text.getIndex() > farthestEndPoint) {
411 farthestEndPoint = text.getIndex();
412 bestBreakPositions = (Stack)(currentBreakPositions.clone());
413 }
414
415 // wrongBreakPositions is a list of all break positions
416 // we've tried starting that didn't allow us to traverse
417 // all the way through the text. Every time we pop a
418 //break position off of currentBreakPositions, we put it
419 // into wrongBreakPositions to avoid trying it again later.
420 // If we make it to this spot, we're either going to back
421 // up to a break in possibleBreakPositions and try starting
422 // over from there, or we've exhausted all possible break
423 // positions and are going to do the fallback procedure.
424 // This loop prevents us from messing with anything in
425 // possibleBreakPositions that didn't work as a starting
426 // point the last time we tried it (this is to prevent a bunch of
427 // repetitive checks from slowing down some extreme cases)
428 Integer newStartingSpot = null;
429 while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
430 possibleBreakPositions.peek())) {
431 possibleBreakPositions.pop();
432 }
433
434 // if we've used up all possible break-position combinations, there's
435 // an error or an unknown word in the text. In this case, we start
436 // over, treating the farthest character we've reached as the beginning
437 // of the range, and "blessing" the break positions that got us that
438 // far as real break positions
439 if (possibleBreakPositions.isEmpty()) {
440 if (bestBreakPositions != null) {
441 currentBreakPositions = bestBreakPositions;
442 if (farthestEndPoint < endPos) {
443 text.setIndex(farthestEndPoint + 1);
444 }
445 else {
446 break;
447 }
448 }
449 else {
450 if ((currentBreakPositions.size() == 0 ||
451 ((Integer)(currentBreakPositions.peek())).intValue() != text.getIndex())
452 && text.getIndex() != startPos) {
453 currentBreakPositions.push(new Integer(text.getIndex()));
454 }
455 getNext();
456 currentBreakPositions.push(new Integer(text.getIndex()));
457 }
458 }
459
460 // if we still have more break positions we can try, then promote the
461 // last break in possibleBreakPositions into currentBreakPositions,
462 // and get rid of all entries in currentBreakPositions that come after
463 // it. Then back up to that position and start over from there (i.e.,
464 // treat that position as the beginning of a new word)
465 else {
466 Integer temp = (Integer)possibleBreakPositions.pop();
467 Object temp2 = null;
468 while (!currentBreakPositions.isEmpty() && temp.intValue() <
469 ((Integer)currentBreakPositions.peek()).intValue()) {
470 temp2 = currentBreakPositions.pop();
471 wrongBreakPositions.addElement(temp2);
472 }
473 currentBreakPositions.push(temp);
474 text.setIndex(((Integer)currentBreakPositions.peek()).intValue());
475 }
476
477 // re-sync "c" for the next go-round, and drop out of the loop if
478 // we've made it off the end of the range
479 c = getCurrent();
480 if (text.getIndex() >= endPos) {
481 break;
482 }
483 }
484
485 // if we didn't hit any exceptional conditions on this last iteration,
486 // just advance to the next character and loop
487 else {
488 c = getNext();
489 }
490 }
491
492 // dump the last break position in the list, and replace it with the actual
493 // end of the range (which may be the same character, or may be further on
494 // because the range actually ended with non-dictionary characters we want to
495 // keep with the word)
496 if (!currentBreakPositions.isEmpty()) {
497 currentBreakPositions.pop();
498 }
499 currentBreakPositions.push(new Integer(endPos));
500
501 // create a regular array to hold the break positions and copy
502 // the break positions from the stack to the array (in addition,
503 // our starting position goes into this array as a break position).
504 // This array becomes the cache of break positions used by next()
505 // and previous(), so this is where we actually refresh the cache.
506 cachedBreakPositions = new int[currentBreakPositions.size() + 1];
507 cachedBreakPositions[0] = startPos;
508
509 for (int i = 0; i < currentBreakPositions.size(); i++) {
510 cachedBreakPositions[i + 1] = ((Integer)currentBreakPositions.elementAt(i)).intValue();
511 }
512 positionInCache = 0;
513 }
514}