blob: 9810c1002cba83f2c4efa9e28c03af2c3a3f1ce1 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26/*
27 *******************************************************************************
28 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
29 * *
30 * The original version of this source code and documentation is copyrighted *
31 * and owned by IBM, These materials are provided under terms of a License *
32 * Agreement between IBM and Sun. This technology is protected by multiple *
33 * US and International patents. This notice and attribution to IBM may not *
34 * to removed. *
35 *******************************************************************************
36 */
37
38package sun.text.normalizer;
39
40/**
41 * <p>Class enabling iteration of the values in a Trie.</p>
42 * <p>Result of each iteration contains the interval of codepoints that have
43 * the same value type and the value type itself.</p>
44 * <p>The comparison of each codepoint value is done via extract(), which the
45 * default implementation is to return the value as it is.</p>
46 * <p>Method extract() can be overwritten to perform manipulations on
47 * codepoint values in order to perform specialized comparison.</p>
48 * <p>TrieIterator is designed to be a generic iterator for the CharTrie
49 * and the IntTrie, hence to accommodate both types of data, the return
50 * result will be in terms of int (32 bit) values.</p>
51 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
52 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
53 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
54 * sense, the caller will have to pass a object with a callback function
55 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
56 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
57 * codepoints with the same value as determined by
58 * UTrieEnumValue(const void *context, uint32_t value). for each range,
59 * utrie_enum calls the callback function to perform a task. In this way,
60 * icu4c performs the iteration within utrie_enum.
61 * To follow the JDK model, icu4j is slightly different from icu4c.
62 * Instead of requesting the caller to implement an object for a callback.
63 * The caller will have to implement a subclass of TrieIterator, fleshing out
64 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
65 * the caller will have to code his own iteration and flesh out the task
66 * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
67 * </p>
68 * <p>There are basically 3 usage scenarios for porting:</p>
69 * <p>1) UTrieEnumValue is the only implemented callback then just implement a
70 * subclass of TrieIterator and override the extract(int) method. The
71 * extract(int) method is analogus to UTrieEnumValue callback.
72 * </p>
73 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
74 * a subclass of TrieIterator, override the extract method and iterate, e.g
75 * </p>
76 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
77 * set);<br>
78 * In Java :<br>
79 * <pre>
80 * class TrieIteratorImpl extends TrieIterator{
81 * public TrieIteratorImpl(Trie data){
82 * super(data);
83 * }
84 * public int extract(int value){
85 * // port the implementation of _enumPropertyStartsValue here
86 * }
87 * }
88 * ....
89 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
90 * while(fcdIter.next(result)) {
91 * // port the implementation of _enumPropertyStartsRange
92 * }
93 * </pre>
94 * </p>
95 * <p>3) UTrieEnumRange is the only implemented callback then just implement
96 * the while loop, when utrie_enum is called
97 * <pre>
98 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
99 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
100 * while(fcdIter.next(result)){
101 * set.add(result.start);
102 * }
103 * </pre>
104 * </p>
105 * @author synwee
106 * @see com.ibm.icu.impl.Trie
107 * @see com.ibm.icu.lang.UCharacterTypeIterator
108 * @since release 2.1, Jan 17 2002
109 */
110public class TrieIterator implements RangeValueIterator
111
112{
113 // public constructor ---------------------------------------------
114
115 /**
116 * TrieEnumeration constructor
117 * @param trie to be used
118 * @exception IllegalArgumentException throw when argument is null.
119 * @draft 2.1
120 */
121 public TrieIterator(Trie trie)
122 {
123 if (trie == null) {
124 throw new IllegalArgumentException(
125 "Argument trie cannot be null");
126 }
127 m_trie_ = trie;
128 // synwee: check that extract belongs to the child class
129 m_initialValue_ = extract(m_trie_.getInitialValue());
130 reset();
131 }
132
133 // public methods -------------------------------------------------
134
135 /**
136 * <p>Returns true if we are not at the end of the iteration, false
137 * otherwise.</p>
138 * <p>The next set of codepoints with the same value type will be
139 * calculated during this call and returned in the arguement element.</p>
140 * @param element return result
141 * @return true if we are not at the end of the iteration, false otherwise.
142 * @exception NoSuchElementException - if no more elements exist.
143 * @see com.ibm.icu.util.RangeValueIterator.Element
144 * @draft 2.1
145 */
146 public final boolean next(Element element)
147 {
148 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
149 return false;
150 }
151 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
152 calculateNextBMPElement(element)) {
153 return true;
154 }
155 calculateNextSupplementaryElement(element);
156 return true;
157 }
158
159 /**
160 * Resets the iterator to the beginning of the iteration
161 * @draft 2.1
162 */
163 public final void reset()
164 {
165 m_currentCodepoint_ = 0;
166 m_nextCodepoint_ = 0;
167 m_nextIndex_ = 0;
168 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
169 if (m_nextBlock_ == 0) {
170 m_nextValue_ = m_initialValue_;
171 }
172 else {
173 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
174 }
175 m_nextBlockIndex_ = 0;
176 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
177 }
178
179 // protected methods ----------------------------------------------
180
181 /**
182 * Called by next() to extracts a 32 bit value from a trie value
183 * used for comparison.
184 * This method is to be overwritten if special manipulation is to be done
185 * to retrieve a relevant comparison.
186 * The default function is to return the value as it is.
187 * @param value a value from the trie
188 * @return extracted value
189 * @draft 2.1
190 */
191 protected int extract(int value)
192 {
193 return value;
194 }
195
196 // private methods ------------------------------------------------
197
198 /**
199 * Set the result values
200 * @param element return result object
201 * @param start codepoint of range
202 * @param limit (end + 1) codepoint of range
203 * @param value common value of range
204 */
205 private final void setResult(Element element, int start, int limit,
206 int value)
207 {
208 element.start = start;
209 element.limit = limit;
210 element.value = value;
211 }
212
213 /**
214 * Finding the next element.
215 * This method is called just before returning the result of
216 * next().
217 * We always store the next element before it is requested.
218 * In the case that we have to continue calculations into the
219 * supplementary planes, a false will be returned.
220 * @param element return result object
221 * @return true if the next range is found, false if we have to proceed to
222 * the supplementary range.
223 */
224 private final boolean calculateNextBMPElement(Element element)
225 {
226 int currentBlock = m_nextBlock_;
227 int currentValue = m_nextValue_;
228 m_currentCodepoint_ = m_nextCodepoint_;
229 m_nextCodepoint_ ++;
230 m_nextBlockIndex_ ++;
231 if (!checkBlockDetail(currentValue)) {
232 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
233 currentValue);
234 return true;
235 }
236 // synwee check that next block index == 0 here
237 // enumerate BMP - the main loop enumerates data blocks
238 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
239 m_nextIndex_ ++;
240 // because of the way the character is split to form the index
241 // the lead surrogate and trail surrogate can not be in the
242 // mid of a block
243 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
244 // skip lead surrogate code units,
245 // go to lead surrogate codepoints
246 m_nextIndex_ = BMP_INDEX_LENGTH_;
247 }
248 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
249 // go back to regular BMP code points
250 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
251 }
252
253 m_nextBlockIndex_ = 0;
254 if (!checkBlock(currentBlock, currentValue)) {
255 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
256 currentValue);
257 return true;
258 }
259 }
260 m_nextCodepoint_ --; // step one back since this value has not been
261 m_nextBlockIndex_ --; // retrieved yet.
262 return false;
263 }
264
265 /**
266 * Finds the next supplementary element.
267 * For each entry in the trie, the value to be delivered is passed through
268 * extract().
269 * We always store the next element before it is requested.
270 * Called after calculateNextBMP() completes its round of BMP characters.
271 * There is a slight difference in the usage of m_currentCodepoint_
272 * here as compared to calculateNextBMP(). Though both represents the
273 * lower bound of the next element, in calculateNextBMP() it gets set
274 * at the start of any loop, where-else, in calculateNextSupplementary()
275 * since m_currentCodepoint_ already contains the lower bound of the
276 * next element (passed down from calculateNextBMP()), we keep it till
277 * the end before resetting it to the new value.
278 * Note, if there are no more iterations, it will never get to here.
279 * Blocked out by next().
280 * @param element return result object
281 * @draft 2.1
282 */
283 private final void calculateNextSupplementaryElement(Element element)
284 {
285 int currentValue = m_nextValue_;
286 int currentBlock = m_nextBlock_;
287 m_nextCodepoint_ ++;
288 m_nextBlockIndex_ ++;
289
290 if (UTF16.getTrailSurrogate(m_nextCodepoint_)
291 != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
292 // this piece is only called when we are in the middle of a lead
293 // surrogate block
294 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
295 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
296 currentValue);
297 m_currentCodepoint_ = m_nextCodepoint_;
298 return;
299 }
300 // we have cleared one block
301 m_nextIndex_ ++;
302 m_nextTrailIndexOffset_ ++;
303 if (!checkTrailBlock(currentBlock, currentValue)) {
304 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
305 currentValue);
306 m_currentCodepoint_ = m_nextCodepoint_;
307 return;
308 }
309 }
310 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
311 // enumerate supplementary code points
312 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
313 // lead surrogate access
314 int leadBlock =
315 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
316 Trie.INDEX_STAGE_2_SHIFT_;
317 if (leadBlock == m_trie_.m_dataOffset_) {
318 // no entries for a whole block of lead surrogates
319 if (currentValue != m_initialValue_) {
320 m_nextValue_ = m_initialValue_;
321 m_nextBlock_ = 0;
322 m_nextBlockIndex_ = 0;
323 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
324 currentValue);
325 m_currentCodepoint_ = m_nextCodepoint_;
326 return;
327 }
328
329 nextLead += DATA_BLOCK_LENGTH_;
330 // number of total affected supplementary codepoints in one
331 // block
332 // this is not a simple addition of
333 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
334 // that we might have moved some of the codepoints
335 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
336 (char)nextLead,
337 (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
338 continue;
339 }
340 if (m_trie_.m_dataManipulate_ == null) {
341 throw new NullPointerException(
342 "The field DataManipulate in this Trie is null");
343 }
344 // enumerate trail surrogates for this lead surrogate
345 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
346 m_trie_.getValue(leadBlock +
347 (nextLead & Trie.INDEX_STAGE_3_MASK_)));
348 if (m_nextIndex_ <= 0) {
349 // no data for this lead surrogate
350 if (currentValue != m_initialValue_) {
351 m_nextValue_ = m_initialValue_;
352 m_nextBlock_ = 0;
353 m_nextBlockIndex_ = 0;
354 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
355 currentValue);
356 m_currentCodepoint_ = m_nextCodepoint_;
357 return;
358 }
359 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
360 } else {
361 m_nextTrailIndexOffset_ = 0;
362 if (!checkTrailBlock(currentBlock, currentValue)) {
363 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
364 currentValue);
365 m_currentCodepoint_ = m_nextCodepoint_;
366 return;
367 }
368 }
369 nextLead ++;
370 }
371
372 // deliver last range
373 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
374 currentValue);
375 }
376
377 /**
378 * Internal block value calculations
379 * Performs calculations on a data block to find codepoints in m_nextBlock_
380 * after the index m_nextBlockIndex_ that has the same value.
381 * Note m_*_ variables at this point is the next codepoint whose value
382 * has not been calculated.
383 * But when returned with false, it will be the last codepoint whose
384 * value has been calculated.
385 * @param currentValue the value which other codepoints are tested against
386 * @return true if the whole block has the same value as currentValue or if
387 * the whole block has been calculated, false otherwise.
388 */
389 private final boolean checkBlockDetail(int currentValue)
390 {
391 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
392 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
393 m_nextBlockIndex_));
394 if (m_nextValue_ != currentValue) {
395 return false;
396 }
397 ++ m_nextBlockIndex_;
398 ++ m_nextCodepoint_;
399 }
400 return true;
401 }
402
403 /**
404 * Internal block value calculations
405 * Performs calculations on a data block to find codepoints in m_nextBlock_
406 * that has the same value.
407 * Will call checkBlockDetail() if highlevel check fails.
408 * Note m_*_ variables at this point is the next codepoint whose value
409 * has not been calculated.
410 * @param currentBlock the initial block containing all currentValue
411 * @param currentValue the value which other codepoints are tested against
412 * @return true if the whole block has the same value as currentValue or if
413 * the whole block has been calculated, false otherwise.
414 */
415 private final boolean checkBlock(int currentBlock, int currentValue)
416 {
417 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
418 Trie.INDEX_STAGE_2_SHIFT_;
419 if (m_nextBlock_ == currentBlock &&
420 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
421 // the block is the same as the previous one, filled with
422 // currentValue
423 m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
424 }
425 else if (m_nextBlock_ == 0) {
426 // this is the all-initial-value block
427 if (currentValue != m_initialValue_) {
428 m_nextValue_ = m_initialValue_;
429 m_nextBlockIndex_ = 0;
430 return false;
431 }
432 m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
433 }
434 else {
435 if (!checkBlockDetail(currentValue)) {
436 return false;
437 }
438 }
439 return true;
440 }
441
442 /**
443 * Internal block value calculations
444 * Performs calculations on multiple data blocks for a set of trail
445 * surrogates to find codepoints in m_nextBlock_ that has the same value.
446 * Will call checkBlock() for internal block checks.
447 * Note m_*_ variables at this point is the next codepoint whose value
448 * has not been calculated.
449 * @param currentBlock the initial block containing all currentValue
450 * @param currentValue the value which other codepoints are tested against
451 * @return true if the whole block has the same value as currentValue or if
452 * the whole block has been calculated, false otherwise.
453 */
454 private final boolean checkTrailBlock(int currentBlock,
455 int currentValue)
456 {
457 // enumerate code points for this lead surrogate
458 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
459 {
460 // if we ever reach here, we are at the start of a new block
461 m_nextBlockIndex_ = 0;
462 // copy of most of the body of the BMP loop
463 if (!checkBlock(currentBlock, currentValue)) {
464 return false;
465 }
466 m_nextTrailIndexOffset_ ++;
467 m_nextIndex_ ++;
468 }
469 return true;
470 }
471
472 /**
473 * Checks if we are beginning at the start of a initial block.
474 * If we are then the rest of the codepoints in this initial block
475 * has the same values.
476 * We increment m_nextCodepoint_ and relevant data members if so.
477 * This is used only in for the supplementary codepoints because
478 * the offset to the trail indexes could be 0.
479 * @return true if we are at the start of a initial block.
480 */
481 private final boolean checkNullNextTrailIndex()
482 {
483 if (m_nextIndex_ <= 0) {
484 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
485 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
486 int leadBlock =
487 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
488 Trie.INDEX_STAGE_2_SHIFT_;
489 if (m_trie_.m_dataManipulate_ == null) {
490 throw new NullPointerException(
491 "The field DataManipulate in this Trie is null");
492 }
493 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
494 m_trie_.getValue(leadBlock +
495 (nextLead & Trie.INDEX_STAGE_3_MASK_)));
496 m_nextIndex_ --;
497 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
498 return true;
499 }
500 return false;
501 }
502
503 // private data members --------------------------------------------
504
505 /**
506 * Size of the stage 1 BMP indexes
507 */
508 private static final int BMP_INDEX_LENGTH_ =
509 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
510 /**
511 * Lead surrogate minimum value
512 */
513 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
514 /**
515 * Trail surrogate minimum value
516 */
517 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
518 /**
519 * Trail surrogate maximum value
520 */
521 private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
522 /**
523 * Number of trail surrogate
524 */
525 private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
526 /**
527 * Number of stage 1 indexes for supplementary calculations that maps to
528 * each lead surrogate character.
529 * See second pass into getRawOffset for the trail surrogate character.
530 * 10 for significant number of bits for trail surrogates, 5 for what we
531 * discard during shifting.
532 */
533 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
534 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
535 /**
536 * Number of data values in a stage 2 (data array) block.
537 */
538 private static final int DATA_BLOCK_LENGTH_ =
539 1 << Trie.INDEX_STAGE_1_SHIFT_;
540 /**
541 * Number of codepoints in a stage 2 block
542 */
543 private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
544 DATA_BLOCK_LENGTH_ << 10;
545 /**
546 * Trie instance
547 */
548 private Trie m_trie_;
549 /**
550 * Initial value for trie values
551 */
552 private int m_initialValue_;
553 /**
554 * Next element results and data.
555 */
556 private int m_currentCodepoint_;
557 private int m_nextCodepoint_;
558 private int m_nextValue_;
559 private int m_nextIndex_;
560 private int m_nextBlock_;
561 private int m_nextBlockIndex_;
562 private int m_nextTrailIndexOffset_;
563 /**
564 * This is the return result element
565 */
566 private int m_start_;
567 private int m_limit_;
568 private int m_value_;
569}