blob: c2fc1ab291dc40a523045e85e7139d7b7b9cd4c4 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Portions Copyright 2001-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26/*
27 *******************************************************************************
28 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
29 * *
30 * The original version of this source code and documentation is copyrighted *
31 * and owned by IBM, These materials are provided under terms of a License *
32 * Agreement between IBM and Sun. This technology is protected by multiple *
33 * US and International patents. This notice and attribution to IBM may not *
34 * to removed. *
35 *******************************************************************************
36 */
37
38package sun.text.normalizer;
39
40import java.text.CharacterIterator;
41import java.text.Normalizer;
42
43/**
44 * Unicode Normalization
45 *
46 * <h2>Unicode normalization API</h2>
47 *
48 * <code>normalize</code> transforms Unicode text into an equivalent composed or
49 * decomposed form, allowing for easier sorting and searching of text.
50 * <code>normalize</code> supports the standard normalization forms described in
51 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
52 * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
53 *
54 * Characters with accents or other adornments can be encoded in
55 * several different ways in Unicode. For example, take the character A-acute.
56 * In Unicode, this can be encoded as a single character (the
57 * "composed" form):
58 *
59 * <p>
60 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
61 * </p>
62 *
63 * or as two separate characters (the "decomposed" form):
64 *
65 * <p>
66 * 0041 LATIN CAPITAL LETTER A
67 * 0301 COMBINING ACUTE ACCENT
68 * </p>
69 *
70 * To a user of your program, however, both of these sequences should be
71 * treated as the same "user-level" character "A with acute accent". When you
72 * are searching or comparing text, you must ensure that these two sequences are
73 * treated equivalently. In addition, you must handle characters with more than
74 * one accent. Sometimes the order of a character's combining accents is
75 * significant, while in other cases accent sequences in different orders are
76 * really equivalent.
77 *
78 * Similarly, the string "ffi" can be encoded as three separate letters:
79 *
80 * <p>
81 * 0066 LATIN SMALL LETTER F
82 * 0066 LATIN SMALL LETTER F
83 * 0069 LATIN SMALL LETTER I
84 * </p>
85 *
86 * or as the single character
87 *
88 * <p>
89 * FB03 LATIN SMALL LIGATURE FFI
90 * </p>
91 *
92 * The ffi ligature is not a distinct semantic character, and strictly speaking
93 * it shouldn't be in Unicode at all, but it was included for compatibility
94 * with existing character sets that already provided it. The Unicode standard
95 * identifies such characters by giving them "compatibility" decompositions
96 * into the corresponding semantic characters. When sorting and searching, you
97 * will often want to use these mappings.
98 *
99 * <code>normalize</code> helps solve these problems by transforming text into
100 * the canonical composed and decomposed forms as shown in the first example
101 * above. In addition, you can have it perform compatibility decompositions so
102 * that you can treat compatibility characters the same as their equivalents.
103 * Finally, <code>normalize</code> rearranges accents into the proper canonical
104 * order, so that you do not have to worry about accent rearrangement on your
105 * own.
106 *
107 * Form FCD, "Fast C or D", is also designed for collation.
108 * It allows to work on strings that are not necessarily normalized
109 * with an algorithm (like in collation) that works under "canonical closure",
110 * i.e., it treats precomposed characters and their decomposed equivalents the
111 * same.
112 *
113 * It is not a normalization form because it does not provide for uniqueness of
114 * representation. Multiple strings may be canonically equivalent (their NFDs
115 * are identical) and may all conform to FCD without being identical themselves.
116 *
117 * The form is defined such that the "raw decomposition", the recursive
118 * canonical decomposition of each character, results in a string that is
119 * canonically ordered. This means that precomposed characters are allowed for
120 * as long as their decompositions do not need canonical reordering.
121 *
122 * Its advantage for a process like collation is that all NFD and most NFC texts
123 * - and many unnormalized texts - already conform to FCD and do not need to be
124 * normalized (NFD) for such a process. The FCD quick check will return YES for
125 * most strings in practice.
126 *
127 * normalize(FCD) may be implemented with NFD.
128 *
129 * For more details on FCD see the collation design document:
130 * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
131 *
132 * ICU collation performs either NFD or FCD normalization automatically if
133 * normalization is turned on for the collator object. Beyond collation and
134 * string search, normalized strings may be useful for string equivalence
135 * comparisons, transliteration/transcription, unique representations, etc.
136 *
137 * The W3C generally recommends to exchange texts in NFC.
138 * Note also that most legacy character encodings use only precomposed forms and
139 * often do not encode any combining marks by themselves. For conversion to such
140 * character encodings the Unicode text needs to be normalized to NFC.
141 * For more usage examples, see the Unicode Standard Annex.
142 * @stable ICU 2.8
143 */
144
145public final class NormalizerBase implements Cloneable {
146
147 //-------------------------------------------------------------------------
148 // Private data
149 //-------------------------------------------------------------------------
150 private char[] buffer = new char[100];
151 private int bufferStart = 0;
152 private int bufferPos = 0;
153 private int bufferLimit = 0;
154
155 // The input text and our position in it
156 private UCharacterIterator text;
157 private Mode mode = NFC;
158 private int options = 0;
159 private int currentIndex;
160 private int nextIndex;
161
162 /**
163 * Options bit set value to select Unicode 3.2 normalization
164 * (except NormalizationCorrections).
165 * At most one Unicode version can be selected at a time.
166 * @stable ICU 2.6
167 */
168 public static final int UNICODE_3_2=0x20;
169
170 /**
171 * Constant indicating that the end of the iteration has been reached.
172 * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
173 * @stable ICU 2.8
174 */
175 public static final int DONE = UCharacterIterator.DONE;
176
177 /**
178 * Constants for normalization modes.
179 * @stable ICU 2.8
180 */
181 public static class Mode {
182 private int modeValue;
183 private Mode(int value) {
184 modeValue = value;
185 }
186
187 /**
188 * This method is used for method dispatch
189 * @stable ICU 2.6
190 */
191 protected int normalize(char[] src, int srcStart, int srcLimit,
192 char[] dest,int destStart,int destLimit,
193 UnicodeSet nx) {
194 int srcLen = (srcLimit - srcStart);
195 int destLen = (destLimit - destStart);
196 if( srcLen > destLen ) {
197 return srcLen;
198 }
199 System.arraycopy(src,srcStart,dest,destStart,srcLen);
200 return srcLen;
201 }
202
203 /**
204 * This method is used for method dispatch
205 * @stable ICU 2.6
206 */
207 protected int normalize(char[] src, int srcStart, int srcLimit,
208 char[] dest,int destStart,int destLimit,
209 int options) {
210 return normalize( src, srcStart, srcLimit,
211 dest,destStart,destLimit,
212 NormalizerImpl.getNX(options)
213 );
214 }
215
216 /**
217 * This method is used for method dispatch
218 * @stable ICU 2.6
219 */
220 protected String normalize(String src, int options) {
221 return src;
222 }
223
224 /**
225 * This method is used for method dispatch
226 * @stable ICU 2.8
227 */
228 protected int getMinC() {
229 return -1;
230 }
231
232 /**
233 * This method is used for method dispatch
234 * @stable ICU 2.8
235 */
236 protected int getMask() {
237 return -1;
238 }
239
240 /**
241 * This method is used for method dispatch
242 * @stable ICU 2.8
243 */
244 protected IsPrevBoundary getPrevBoundary() {
245 return null;
246 }
247
248 /**
249 * This method is used for method dispatch
250 * @stable ICU 2.8
251 */
252 protected IsNextBoundary getNextBoundary() {
253 return null;
254 }
255
256 /**
257 * This method is used for method dispatch
258 * @stable ICU 2.6
259 */
260 protected QuickCheckResult quickCheck(char[] src,int start, int limit,
261 boolean allowMaybe,UnicodeSet nx) {
262 if(allowMaybe) {
263 return MAYBE;
264 }
265 return NO;
266 }
267
268 /**
269 * This method is used for method dispatch
270 * @stable ICU 2.8
271 */
272 protected boolean isNFSkippable(int c) {
273 return true;
274 }
275 }
276
277 /**
278 * No decomposition/composition.
279 * @stable ICU 2.8
280 */
281 public static final Mode NONE = new Mode(1);
282
283 /**
284 * Canonical decomposition.
285 * @stable ICU 2.8
286 */
287 public static final Mode NFD = new NFDMode(2);
288
289 private static final class NFDMode extends Mode {
290 private NFDMode(int value) {
291 super(value);
292 }
293
294 protected int normalize(char[] src, int srcStart, int srcLimit,
295 char[] dest,int destStart,int destLimit,
296 UnicodeSet nx) {
297 int[] trailCC = new int[1];
298 return NormalizerImpl.decompose(src, srcStart,srcLimit,
299 dest, destStart,destLimit,
300 false, trailCC,nx);
301 }
302
303 protected String normalize( String src, int options) {
304 return decompose(src,false,options);
305 }
306
307 protected int getMinC() {
308 return NormalizerImpl.MIN_WITH_LEAD_CC;
309 }
310
311 protected IsPrevBoundary getPrevBoundary() {
312 return new IsPrevNFDSafe();
313 }
314
315 protected IsNextBoundary getNextBoundary() {
316 return new IsNextNFDSafe();
317 }
318
319 protected int getMask() {
320 return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD);
321 }
322
323 protected QuickCheckResult quickCheck(char[] src,int start,
324 int limit,boolean allowMaybe,
325 UnicodeSet nx) {
326 return NormalizerImpl.quickCheck(
327 src, start,limit,
328 NormalizerImpl.getFromIndexesArr(
329 NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE
330 ),
331 NormalizerImpl.QC_NFD,
332 0,
333 allowMaybe,
334 nx
335 );
336 }
337
338 protected boolean isNFSkippable(int c) {
339 return NormalizerImpl.isNFSkippable(c,this,
340 (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD)
341 );
342 }
343 }
344
345 /**
346 * Compatibility decomposition.
347 * @stable ICU 2.8
348 */
349 public static final Mode NFKD = new NFKDMode(3);
350
351 private static final class NFKDMode extends Mode {
352 private NFKDMode(int value) {
353 super(value);
354 }
355
356 protected int normalize(char[] src, int srcStart, int srcLimit,
357 char[] dest,int destStart,int destLimit,
358 UnicodeSet nx) {
359 int[] trailCC = new int[1];
360 return NormalizerImpl.decompose(src, srcStart,srcLimit,
361 dest, destStart,destLimit,
362 true, trailCC, nx);
363 }
364
365 protected String normalize( String src, int options) {
366 return decompose(src,true,options);
367 }
368
369 protected int getMinC() {
370 return NormalizerImpl.MIN_WITH_LEAD_CC;
371 }
372
373 protected IsPrevBoundary getPrevBoundary() {
374 return new IsPrevNFDSafe();
375 }
376
377 protected IsNextBoundary getNextBoundary() {
378 return new IsNextNFDSafe();
379 }
380
381 protected int getMask() {
382 return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD);
383 }
384
385 protected QuickCheckResult quickCheck(char[] src,int start,
386 int limit,boolean allowMaybe,
387 UnicodeSet nx) {
388 return NormalizerImpl.quickCheck(
389 src,start,limit,
390 NormalizerImpl.getFromIndexesArr(
391 NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE
392 ),
393 NormalizerImpl.QC_NFKD,
394 NormalizerImpl.OPTIONS_COMPAT,
395 allowMaybe,
396 nx
397 );
398 }
399
400 protected boolean isNFSkippable(int c) {
401 return NormalizerImpl.isNFSkippable(c, this,
402 (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD)
403 );
404 }
405 }
406
407 /**
408 * Canonical decomposition followed by canonical composition.
409 * @stable ICU 2.8
410 */
411 public static final Mode NFC = new NFCMode(4);
412
413 private static final class NFCMode extends Mode{
414 private NFCMode(int value) {
415 super(value);
416 }
417 protected int normalize(char[] src, int srcStart, int srcLimit,
418 char[] dest,int destStart,int destLimit,
419 UnicodeSet nx) {
420 return NormalizerImpl.compose( src, srcStart, srcLimit,
421 dest,destStart,destLimit,
422 0, nx);
423 }
424
425 protected String normalize( String src, int options) {
426 return compose(src, false, options);
427 }
428
429 protected int getMinC() {
430 return NormalizerImpl.getFromIndexesArr(
431 NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
432 );
433 }
434 protected IsPrevBoundary getPrevBoundary() {
435 return new IsPrevTrueStarter();
436 }
437 protected IsNextBoundary getNextBoundary() {
438 return new IsNextTrueStarter();
439 }
440 protected int getMask() {
441 return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC);
442 }
443 protected QuickCheckResult quickCheck(char[] src,int start,
444 int limit,boolean allowMaybe,
445 UnicodeSet nx) {
446 return NormalizerImpl.quickCheck(
447 src,start,limit,
448 NormalizerImpl.getFromIndexesArr(
449 NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
450 ),
451 NormalizerImpl.QC_NFC,
452 0,
453 allowMaybe,
454 nx
455 );
456 }
457 protected boolean isNFSkippable(int c) {
458 return NormalizerImpl.isNFSkippable(c,this,
459 ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
460 (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)
461 )
462 );
463 }
464 };
465
466 /**
467 * Compatibility decomposition followed by canonical composition.
468 * @stable ICU 2.8
469 */
470 public static final Mode NFKC =new NFKCMode(5);
471
472 private static final class NFKCMode extends Mode{
473 private NFKCMode(int value) {
474 super(value);
475 }
476 protected int normalize(char[] src, int srcStart, int srcLimit,
477 char[] dest,int destStart,int destLimit,
478 UnicodeSet nx) {
479 return NormalizerImpl.compose(src, srcStart,srcLimit,
480 dest, destStart,destLimit,
481 NormalizerImpl.OPTIONS_COMPAT, nx);
482 }
483
484 protected String normalize( String src, int options) {
485 return compose(src, true, options);
486 }
487 protected int getMinC() {
488 return NormalizerImpl.getFromIndexesArr(
489 NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
490 );
491 }
492 protected IsPrevBoundary getPrevBoundary() {
493 return new IsPrevTrueStarter();
494 }
495 protected IsNextBoundary getNextBoundary() {
496 return new IsNextTrueStarter();
497 }
498 protected int getMask() {
499 return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC);
500 }
501 protected QuickCheckResult quickCheck(char[] src,int start,
502 int limit,boolean allowMaybe,
503 UnicodeSet nx) {
504 return NormalizerImpl.quickCheck(
505 src,start,limit,
506 NormalizerImpl.getFromIndexesArr(
507 NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
508 ),
509 NormalizerImpl.QC_NFKC,
510 NormalizerImpl.OPTIONS_COMPAT,
511 allowMaybe,
512 nx
513 );
514 }
515 protected boolean isNFSkippable(int c) {
516 return NormalizerImpl.isNFSkippable(c, this,
517 ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
518 (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)
519 )
520 );
521 }
522 };
523
524 /**
525 * Result values for quickCheck().
526 * For details see Unicode Technical Report 15.
527 * @stable ICU 2.8
528 */
529 public static final class QuickCheckResult{
530 private int resultValue;
531 private QuickCheckResult(int value) {
532 resultValue=value;
533 }
534 }
535 /**
536 * Indicates that string is not in the normalized format
537 * @stable ICU 2.8
538 */
539 public static final QuickCheckResult NO = new QuickCheckResult(0);
540
541 /**
542 * Indicates that string is in the normalized format
543 * @stable ICU 2.8
544 */
545 public static final QuickCheckResult YES = new QuickCheckResult(1);
546
547 /**
548 * Indicates it cannot be determined if string is in the normalized
549 * format without further thorough checks.
550 * @stable ICU 2.8
551 */
552 public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
553
554 //-------------------------------------------------------------------------
555 // Constructors
556 //-------------------------------------------------------------------------
557
558 /**
559 * Creates a new <tt>Normalizer</tt> object for iterating over the
560 * normalized form of a given string.
561 * <p>
562 * The <tt>options</tt> parameter specifies which optional
563 * <tt>Normalizer</tt> features are to be enabled for this object.
564 * <p>
565 * @param str The string to be normalized. The normalization
566 * will start at the beginning of the string.
567 *
568 * @param mode The normalization mode.
569 *
570 * @param opt Any optional features to be enabled.
571 * Currently the only available option is {@link #UNICODE_3_2}.
572 * If you want the default behavior corresponding to one of the
573 * standard Unicode Normalization Forms, use 0 for this argument.
574 * @stable ICU 2.6
575 */
576 public NormalizerBase(String str, Mode mode, int opt) {
577 this.text = UCharacterIterator.getInstance(str);
578 this.mode = mode;
579 this.options=opt;
580 }
581
582 /**
583 * Creates a new <tt>Normalizer</tt> object for iterating over the
584 * normalized form of the given text.
585 * <p>
586 * @param iter The input text to be normalized. The normalization
587 * will start at the beginning of the string.
588 *
589 * @param mode The normalization mode.
590 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    // Convenience overload: same as the three-argument constructor with
    // UNICODE_LATEST as the option value.
    this(iter, mode, UNICODE_LATEST);
}
594
595 /**
596 * Creates a new <tt>Normalizer</tt> object for iterating over the
597 * normalized form of the given text.
598 * <p>
599 * @param iter The input text to be normalized. The normalization
600 * will start at the beginning of the string.
601 *
602 * @param mode The normalization mode.
603 *
604 * @param opt Any optional features to be enabled.
605 * Currently the only available option is {@link #UNICODE_3_2}.
606 * If you want the default behavior corresponding to one of the
607 * standard Unicode Normalization Forms, use 0 for this argument.
608 * @stable ICU 2.6
609 */
610 public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
611 this.text = UCharacterIterator.getInstance(
612 (CharacterIterator)iter.clone()
613 );
614 this.mode = mode;
615 this.options = opt;
616 }
617
618 /**
619 * Clones this <tt>Normalizer</tt> object. All properties of this
620 * object are duplicated in the new object, including the cloning of any
621 * {@link CharacterIterator} that was passed in to the constructor
622 * or to {@link #setText(CharacterIterator) setText}.
623 * However, the text storage underlying
624 * the <tt>CharacterIterator</tt> is not duplicated unless the
625 * iterator's <tt>clone</tt> method does so.
626 * @stable ICU 2.8
627 */
628 public Object clone() {
629 try {
630 NormalizerBase copy = (NormalizerBase) super.clone();
631 copy.text = (UCharacterIterator) text.clone();
632 //clone the internal buffer
633 if (buffer != null) {
634 copy.buffer = new char[buffer.length];
635 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
636 }
637 return copy;
638 }
639 catch (CloneNotSupportedException e) {
640 throw new InternalError(e.toString());
641 }
642 }
643
644 //--------------------------------------------------------------------------
645 // Static Utility methods
646 //--------------------------------------------------------------------------
647
/**
 * Compose a string.
 * The string will be composed according to the specified mode.
 * @param str      The string to compose.
 * @param compat   If true the string will be composed according to
 *                 NFKC rules and if false will be composed according to
 *                 NFC rules.
 * @param options  The only recognized option is UNICODE_3_2
 * @return String  The composed string
 * @stable ICU 2.6
 */
659 public static String compose(String str, boolean compat, int options) {
660
661 char[] dest, src;
662 if (options == UNICODE_3_2_0_ORIGINAL) {
663 String mappedStr = NormalizerImpl.convert(str);
664 dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE];
665 src = mappedStr.toCharArray();
666 } else {
667 dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE];
668 src = str.toCharArray();
669 }
670 int destSize=0;
671
672 UnicodeSet nx = NormalizerImpl.getNX(options);
673
674 /* reset options bits that should only be set here or inside compose() */
675 options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
676
677 if(compat) {
678 options|=NormalizerImpl.OPTIONS_COMPAT;
679 }
680
681 for(;;) {
682 destSize=NormalizerImpl.compose(src,0,src.length,
683 dest,0,dest.length,options,
684 nx);
685 if(destSize<=dest.length) {
686 return new String(dest,0,destSize);
687 } else {
688 dest = new char[destSize];
689 }
690 }
691 }
692
693 private static final int MAX_BUF_SIZE_COMPOSE = 2;
694 private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
695
/**
 * Decompose a string.
 * The string will be decomposed according to the specified mode.
 * @param str       The string to decompose.
 * @param compat    If true the string will be decomposed according to NFKD
 *                  rules and if false will be decomposed according to NFD
 *                  rules.
 * @return String   The decomposed string
 * @stable ICU 2.8
 */
706 public static String decompose(String str, boolean compat) {
707 return decompose(str,compat,UNICODE_LATEST);
708 }
709
/**
 * Decompose a string.
 * The string will be decomposed according to the specified mode.
 * @param str     The string to decompose.
 * @param compat  If true the string will be decomposed according to NFKD
 *                rules and if false will be decomposed according to NFD
 *                rules.
 * @param options The normalization options, ORed together (0 for no options).
 * @return String The decomposed string
 * @stable ICU 2.6
 */
721 public static String decompose(String str, boolean compat, int options) {
722
723 int[] trailCC = new int[1];
724 int destSize=0;
725 UnicodeSet nx = NormalizerImpl.getNX(options);
726 char[] dest;
727
728 if (options == UNICODE_3_2_0_ORIGINAL) {
729 String mappedStr = NormalizerImpl.convert(str);
730 dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE];
731
732 for(;;) {
733 destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(),
734 dest,0,dest.length,
735 compat,trailCC, nx);
736 if(destSize<=dest.length) {
737 return new String(dest,0,destSize);
738 } else {
739 dest = new char[destSize];
740 }
741 }
742 } else {
743 dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE];
744
745 for(;;) {
746 destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(),
747 dest,0,dest.length,
748 compat,trailCC, nx);
749 if(destSize<=dest.length) {
750 return new String(dest,0,destSize);
751 } else {
752 dest = new char[destSize];
753 }
754 }
755 }
756 }
757
/**
 * Normalize a string.
 * The string will be normalized according to the specified normalization
 * mode and options.
 * @param src       The char array to normalize.
 * @param srcStart  Start index of the source
 * @param srcLimit  Limit index of the source
 * @param dest      The char buffer to fill in
 * @param destStart Start index of the destination buffer
 * @param destLimit End index of the destination buffer
 * @param mode      The normalization mode; one of Normalizer.NONE,
 *                  Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
 *                  Normalizer.NFKD, Normalizer.DEFAULT
 * @param options   The normalization options, ORed together (0 for no options).
 * @return int      The length of the normalized text; it never exceeds
 *                  the capacity of the destination range.
 * @exception       IndexOutOfBoundsException if the target capacity is
 *                  less than the required length; the exception message
 *                  is the required length.
 * @stable ICU 2.6
 */
778 public static int normalize(char[] src,int srcStart, int srcLimit,
779 char[] dest,int destStart, int destLimit,
780 Mode mode, int options) {
781 int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options);
782
783 if(length<=(destLimit-destStart)) {
784 return length;
785 } else {
786 throw new IndexOutOfBoundsException(Integer.toString(length));
787 }
788 }
789
790 //-------------------------------------------------------------------------
791 // Iteration API
792 //-------------------------------------------------------------------------
793
/**
 * Return the current character in the normalized text.
 * @return The codepoint as an int
 * @stable ICU 2.8
 */
799 public int current() {
800 if(bufferPos<bufferLimit || nextNormalize()) {
801 return getCodePointAt(bufferPos);
802 } else {
803 return DONE;
804 }
805 }
806
807 /**
808 * Return the next character in the normalized text and advance
809 * the iteration position by one. If the end
810 * of the text has already been reached, {@link #DONE} is returned.
811 * @return The codepoint as an int
812 * @stable ICU 2.8
813 */
814 public int next() {
815 if(bufferPos<bufferLimit || nextNormalize()) {
816 int c=getCodePointAt(bufferPos);
817 bufferPos+=(c>0xFFFF) ? 2 : 1;
818 return c;
819 } else {
820 return DONE;
821 }
822 }
823
824
825 /**
826 * Return the previous character in the normalized text and decrement
827 * the iteration position by one. If the beginning
828 * of the text has already been reached, {@link #DONE} is returned.
829 * @return The codepoint as an int
830 * @stable ICU 2.8
831 */
832 public int previous() {
833 if(bufferPos>0 || previousNormalize()) {
834 int c=getCodePointAt(bufferPos-1);
835 bufferPos-=(c>0xFFFF) ? 2 : 1;
836 return c;
837 } else {
838 return DONE;
839 }
840 }
841
/**
 * Reset the index to the beginning of the text.
 * This is equivalent to setIndexOnly(startIndex).
 * @stable ICU 2.8
 */
847 public void reset() {
848 text.setIndex(0);
849 currentIndex=nextIndex=0;
850 clearBuffer();
851 }
852
853 /**
854 * Set the iteration position in the input text that is being normalized,
855 * without any immediate normalization.
856 * After setIndexOnly(), getIndex() will return the same index that is
857 * specified here.
858 *
859 * @param index the desired index in the input text.
860 * @stable ICU 2.8
861 */
862 public void setIndexOnly(int index) {
863 text.setIndex(index);
864 currentIndex=nextIndex=index; // validates index
865 clearBuffer();
866 }
867
/**
 * Set the iteration position in the input text that is being normalized
 * and return the first normalized character at that position.
 * <p>
 * <b>Note:</b> This method sets the position in the <em>input</em> text,
 * while {@link #next} and {@link #previous} iterate through characters
 * in the normalized <em>output</em>.  This means that there is not
 * necessarily a one-to-one correspondence between characters returned
 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
 * returned from <tt>setIndex</tt> and {@link #getIndex}.
 * <p>
 * @param index the desired index in the input text.
 *
 * @return the first normalized character (as a codepoint) that is the
 *         result of iterating forward starting at the given index.
 *
 * @throws IllegalArgumentException if the given index is less than
 *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 * @deprecated ICU 3.2
 * @obsolete ICU 3.2
 */
890 public int setIndex(int index) {
891 setIndexOnly(index);
892 return current();
893 }
894
/**
 * Retrieve the index of the start of the input text. This is the begin
 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
 * @deprecated ICU 2.2. Use startIndex() instead.
 * @return The index of the start of the input text
 * @see #startIndex
 */
903 public int getBeginIndex() {
904 return 0;
905 }
906
/**
 * Retrieve the index of the end of the input text. This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating
 * @deprecated ICU 2.2. Use endIndex() instead.
 * @return The index of the end of the input text
 * @see #endIndex
 */
915 public int getEndIndex() {
916 return endIndex();
917 }
918
/**
 * Retrieve the current iteration position in the input text that is
 * being normalized.  This method is useful in applications such as
 * searching, where you need to be able to determine the position in
 * the input text that corresponds to a given normalized output character.
 * <p>
 * <b>Note:</b> This method returns a position in the <em>input</em>, while
 * {@link #next} and {@link #previous} iterate through characters in the
 * <em>output</em>.  This means that there is not necessarily a one-to-one
 * correspondence between characters returned by <tt>next</tt> and
 * <tt>previous</tt> and the indices passed to and returned from
 * <tt>setIndex</tt> and {@link #getIndex}.
 * @return The current iteration position
 * @stable ICU 2.8
 */
934 public int getIndex() {
935 if(bufferPos<bufferLimit) {
936 return currentIndex;
937 } else {
938 return nextIndex;
939 }
940 }
941
/**
 * Retrieve the index of the end of the input text.  This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating
 * @return The index of the end of the input text
 * @stable ICU 2.8
 */
949 public int endIndex() {
950 return text.getLength();
951 }
952
953 //-------------------------------------------------------------------------
954 // Property access methods
955 //-------------------------------------------------------------------------
/**
 * Set the normalization mode for this object.
 * <p>
 * <b>Note:</b>If the normalization mode is changed while iterating
 * over a string, calls to {@link #next} and {@link #previous} may
 * return previously buffered characters in the old normalization mode
 * until the iteration is able to re-sync at the next base character.
 * It is safest to call {@link #setText setText()}, {@link #first},
 * {@link #last}, etc. after calling <tt>setMode</tt>.
 * <p>
 * @param newMode the new mode for this <tt>Normalizer</tt>.
 * The supported modes are:
 * <ul>
 *  <li>{@link #NFC}    - Unicode canonical decomposition
 *                        followed by canonical composition.
 *  <li>{@link #NFKC}   - Unicode compatibility decomposition
 *                        followed by canonical composition.
 *  <li>{@link #NFD}    - Unicode canonical decomposition
 *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
 *  <li>{@link #NONE}   - Do nothing but return characters
 *                        from the underlying input text.
 * </ul>
 *
 * @see #getMode
 * @stable ICU 2.8
 */
982 public void setMode(Mode newMode) {
983 mode = newMode;
984 }
985 /**
986 * Return the basic operation performed by this <tt>Normalizer</tt>
987 *
988 * @see #setMode
989 * @stable ICU 2.8
990 */
991 public Mode getMode() {
992 return mode;
993 }
994
995 /**
996 * Set the input text over which this <tt>Normalizer</tt> will iterate.
997 * The iteration position is set to the beginning of the input text->
998 * @param newText The new string to be normalized.
999 * @stable ICU 2.8
1000 */
1001 public void setText(String newText) {
1002
1003 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1004 if (newIter == null) {
1005 throw new InternalError("Could not create a new UCharacterIterator");
1006 }
1007 text = newIter;
1008 reset();
1009 }
1010
1011 /**
1012 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1013 * The iteration position is set to the beginning of the input text->
1014 * @param newText The new string to be normalized.
1015 * @stable ICU 2.8
1016 */
1017 public void setText(CharacterIterator newText) {
1018
1019 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1020 if (newIter == null) {
1021 throw new InternalError("Could not create a new UCharacterIterator");
1022 }
1023 text = newIter;
1024 currentIndex=nextIndex=0;
1025 clearBuffer();
1026 }
1027
1028 //-------------------------------------------------------------------------
1029 // Private utility methods
1030 //-------------------------------------------------------------------------
1031
1032
1033 /* backward iteration --------------------------------------------------- */
1034
1035 /*
1036 * read backwards and get norm32
1037 * return 0 if the character is <minC
1038 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1039 * surrogate but read second!)
1040 */
1041
1042 private static long getPrevNorm32(UCharacterIterator src,
1043 int/*unsigned*/ minC,
1044 int/*unsigned*/ mask,
1045 char[] chars) {
1046 long norm32;
1047 int ch=0;
1048 /* need src.hasPrevious() */
1049 if((ch=src.previous()) == UCharacterIterator.DONE) {
1050 return 0;
1051 }
1052 chars[0]=(char)ch;
1053 chars[1]=0;
1054
1055 /* check for a surrogate before getting norm32 to see if we need to
1056 * predecrement further */
1057 if(chars[0]<minC) {
1058 return 0;
1059 } else if(!UTF16.isSurrogate(chars[0])) {
1060 return NormalizerImpl.getNorm32(chars[0]);
1061 } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) {
1062 /* unpaired surrogate */
1063 chars[1]=(char)src.current();
1064 return 0;
1065 } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) {
1066 norm32=NormalizerImpl.getNorm32(chars[1]);
1067 if((norm32&mask)==0) {
1068 /* all surrogate pairs with this lead surrogate have irrelevant
1069 * data */
1070 return 0;
1071 } else {
1072 /* norm32 must be a surrogate special */
1073 return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]);
1074 }
1075 } else {
1076 /* unpaired second surrogate, undo the c2=src.previous() movement */
1077 src.moveIndex( 1);
1078 return 0;
1079 }
1080 }
1081
1082 private interface IsPrevBoundary{
1083 public boolean isPrevBoundary(UCharacterIterator src,
1084 int/*unsigned*/ minC,
1085 int/*unsigned*/ mask,
1086 char[] chars);
1087 }
1088 private static final class IsPrevNFDSafe implements IsPrevBoundary{
1089 /*
1090 * for NF*D:
1091 * read backwards and check if the lead combining class is 0
1092 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1093 * surrogate but read second!)
1094 */
1095 public boolean isPrevBoundary(UCharacterIterator src,
1096 int/*unsigned*/ minC,
1097 int/*unsigned*/ ccOrQCMask,
1098 char[] chars) {
1099
1100 return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC,
1101 ccOrQCMask, chars),
1102 ccOrQCMask,
1103 ccOrQCMask& NormalizerImpl.QC_MASK);
1104 }
1105 }
1106
1107 private static final class IsPrevTrueStarter implements IsPrevBoundary{
1108 /*
1109 * read backwards and check if the character is (or its decomposition
1110 * begins with) a "true starter" (cc==0 and NF*C_YES)
1111 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1112 * surrogate but read second!)
1113 */
1114 public boolean isPrevBoundary(UCharacterIterator src,
1115 int/*unsigned*/ minC,
1116 int/*unsigned*/ ccOrQCMask,
1117 char[] chars) {
1118 long norm32;
1119 int/*unsigned*/ decompQCMask;
1120
1121 decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
1122 norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
1123 return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask);
1124 }
1125 }
1126
1127 private static int findPreviousIterationBoundary(UCharacterIterator src,
1128 IsPrevBoundary obj,
1129 int/*unsigned*/ minC,
1130 int/*mask*/ mask,
1131 char[] buffer,
1132 int[] startIndex) {
1133 char[] chars=new char[2];
1134 boolean isBoundary;
1135
1136 /* fill the buffer from the end backwards */
1137 startIndex[0] = buffer.length;
1138 chars[0]=0;
1139 while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) {
1140 isBoundary=obj.isPrevBoundary(src, minC, mask, chars);
1141
1142 /* always write this character to the front of the buffer */
1143 /* make sure there is enough space in the buffer */
1144 if(startIndex[0] < (chars[1]==0 ? 1 : 2)) {
1145
1146 // grow the buffer
1147 char[] newBuf = new char[buffer.length*2];
1148 /* move the current buffer contents up */
1149 System.arraycopy(buffer,startIndex[0],newBuf,
1150 newBuf.length-(buffer.length-startIndex[0]),
1151 buffer.length-startIndex[0]);
1152 //adjust the startIndex
1153 startIndex[0]+=newBuf.length-buffer.length;
1154
1155 buffer=newBuf;
1156 newBuf=null;
1157
1158 }
1159
1160 buffer[--startIndex[0]]=chars[0];
1161 if(chars[1]!=0) {
1162 buffer[--startIndex[0]]=chars[1];
1163 }
1164
1165 /* stop if this just-copied character is a boundary */
1166 if(isBoundary) {
1167 break;
1168 }
1169 }
1170
1171 /* return the length of the buffer contents */
1172 return buffer.length-startIndex[0];
1173 }
1174
1175 private static int previous(UCharacterIterator src,
1176 char[] dest, int destStart, int destLimit,
1177 Mode mode,
1178 boolean doNormalize,
1179 boolean[] pNeededToNormalize,
1180 int options) {
1181
1182 IsPrevBoundary isPreviousBoundary;
1183 int destLength, bufferLength;
1184 int/*unsigned*/ mask;
1185 int c,c2;
1186
1187 char minC;
1188 int destCapacity = destLimit-destStart;
1189 destLength=0;
1190
1191 if(pNeededToNormalize!=null) {
1192 pNeededToNormalize[0]=false;
1193 }
1194 minC = (char)mode.getMinC();
1195 mask = mode.getMask();
1196 isPreviousBoundary = mode.getPrevBoundary();
1197
1198 if(isPreviousBoundary==null) {
1199 destLength=0;
1200 if((c=src.previous())>=0) {
1201 destLength=1;
1202 if(UTF16.isTrailSurrogate((char)c)) {
1203 c2= src.previous();
1204 if(c2!= UCharacterIterator.DONE) {
1205 if(UTF16.isLeadSurrogate((char)c2)) {
1206 if(destCapacity>=2) {
1207 dest[1]=(char)c; // trail surrogate
1208 destLength=2;
1209 }
1210 // lead surrogate to be written below
1211 c=c2;
1212 } else {
1213 src.moveIndex(1);
1214 }
1215 }
1216 }
1217
1218 if(destCapacity>0) {
1219 dest[0]=(char)c;
1220 }
1221 }
1222 return destLength;
1223 }
1224
1225 char[] buffer = new char[100];
1226 int[] startIndex= new int[1];
1227 bufferLength=findPreviousIterationBoundary(src,
1228 isPreviousBoundary,
1229 minC, mask,buffer,
1230 startIndex);
1231 if(bufferLength>0) {
1232 if(doNormalize) {
1233 destLength=NormalizerBase.normalize(buffer,startIndex[0],
1234 startIndex[0]+bufferLength,
1235 dest, destStart,destLimit,
1236 mode, options);
1237
1238 if(pNeededToNormalize!=null) {
1239 pNeededToNormalize[0]=(boolean)(destLength!=bufferLength ||
1240 Utility.arrayRegionMatches(
1241 buffer,0,dest,
1242 destStart,destLimit
1243 ));
1244 }
1245 } else {
1246 /* just copy the source characters */
1247 if(destCapacity>0) {
1248 System.arraycopy(buffer,startIndex[0],dest,0,
1249 (bufferLength<destCapacity) ?
1250 bufferLength : destCapacity
1251 );
1252 }
1253 }
1254 }
1255
1256
1257 return destLength;
1258 }
1259
1260
1261
1262 /* forward iteration ---------------------------------------------------- */
1263 /*
1264 * read forward and check if the character is a next-iteration boundary
1265 * if c2!=0 then (c, c2) is a surrogate pair
1266 */
1267 private interface IsNextBoundary{
1268 boolean isNextBoundary(UCharacterIterator src,
1269 int/*unsigned*/ minC,
1270 int/*unsigned*/ mask,
1271 int[] chars);
1272 }
1273 /*
1274 * read forward and get norm32
1275 * return 0 if the character is <minC
1276 * if c2!=0 then (c2, c) is a surrogate pair
1277 * always reads complete characters
1278 */
1279 private static long /*unsigned*/ getNextNorm32(UCharacterIterator src,
1280 int/*unsigned*/ minC,
1281 int/*unsigned*/ mask,
1282 int[] chars) {
1283 long norm32;
1284
1285 /* need src.hasNext() to be true */
1286 chars[0]=src.next();
1287 chars[1]=0;
1288
1289 if(chars[0]<minC) {
1290 return 0;
1291 }
1292
1293 norm32=NormalizerImpl.getNorm32((char)chars[0]);
1294 if(UTF16.isLeadSurrogate((char)chars[0])) {
1295 if(src.current()!=UCharacterIterator.DONE &&
1296 UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) {
1297 src.moveIndex(1); /* skip the c2 surrogate */
1298 if((norm32&mask)==0) {
1299 /* irrelevant data */
1300 return 0;
1301 } else {
1302 /* norm32 must be a surrogate special */
1303 return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]);
1304 }
1305 } else {
1306 /* unmatched surrogate */
1307 return 0;
1308 }
1309 }
1310 return norm32;
1311 }
1312
1313
1314 /*
1315 * for NF*D:
1316 * read forward and check if the lead combining class is 0
1317 * if c2!=0 then (c, c2) is a surrogate pair
1318 */
1319 private static final class IsNextNFDSafe implements IsNextBoundary{
1320 public boolean isNextBoundary(UCharacterIterator src,
1321 int/*unsigned*/ minC,
1322 int/*unsigned*/ ccOrQCMask,
1323 int[] chars) {
1324 return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars),
1325 ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK);
1326 }
1327 }
1328
1329 /*
1330 * for NF*C:
1331 * read forward and check if the character is (or its decomposition begins
1332 * with) a "true starter" (cc==0 and NF*C_YES)
1333 * if c2!=0 then (c, c2) is a surrogate pair
1334 */
1335 private static final class IsNextTrueStarter implements IsNextBoundary{
1336 public boolean isNextBoundary(UCharacterIterator src,
1337 int/*unsigned*/ minC,
1338 int/*unsigned*/ ccOrQCMask,
1339 int[] chars) {
1340 long norm32;
1341 int/*unsigned*/ decompQCMask;
1342
1343 decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
1344 norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
1345 return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask);
1346 }
1347 }
1348
1349 private static int findNextIterationBoundary(UCharacterIterator src,
1350 IsNextBoundary obj,
1351 int/*unsigned*/ minC,
1352 int/*unsigned*/ mask,
1353 char[] buffer) {
1354 if(src.current()==UCharacterIterator.DONE) {
1355 return 0;
1356 }
1357
1358 /* get one character and ignore its properties */
1359 int[] chars = new int[2];
1360 chars[0]=src.next();
1361 buffer[0]=(char)chars[0];
1362 int bufferIndex = 1;
1363
1364 if(UTF16.isLeadSurrogate((char)chars[0])&&
1365 src.current()!=UCharacterIterator.DONE) {
1366 if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) {
1367 buffer[bufferIndex++]=(char)chars[1];
1368 } else {
1369 src.moveIndex(-1); /* back out the non-trail-surrogate */
1370 }
1371 }
1372
1373 /* get all following characters until we see a boundary */
1374 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff
1375 * is part of the string */
1376 while( src.current()!=UCharacterIterator.DONE) {
1377 if(obj.isNextBoundary(src, minC, mask, chars)) {
1378 /* back out the latest movement to stop at the boundary */
1379 src.moveIndex(chars[1]==0 ? -1 : -2);
1380 break;
1381 } else {
1382 if(bufferIndex+(chars[1]==0 ? 1 : 2)<=buffer.length) {
1383 buffer[bufferIndex++]=(char)chars[0];
1384 if(chars[1]!=0) {
1385 buffer[bufferIndex++]=(char)chars[1];
1386 }
1387 } else {
1388 char[] newBuf = new char[buffer.length*2];
1389 System.arraycopy(buffer,0,newBuf,0,bufferIndex);
1390 buffer = newBuf;
1391 buffer[bufferIndex++]=(char)chars[0];
1392 if(chars[1]!=0) {
1393 buffer[bufferIndex++]=(char)chars[1];
1394 }
1395 }
1396 }
1397 }
1398
1399 /* return the length of the buffer contents */
1400 return bufferIndex;
1401 }
1402
1403 private static int next(UCharacterIterator src,
1404 char[] dest, int destStart, int destLimit,
1405 NormalizerBase.Mode mode,
1406 boolean doNormalize,
1407 boolean[] pNeededToNormalize,
1408 int options) {
1409
1410 IsNextBoundary isNextBoundary;
1411 int /*unsigned*/ mask;
1412 int /*unsigned*/ bufferLength;
1413 int c,c2;
1414 char minC;
1415 int destCapacity = destLimit - destStart;
1416 int destLength = 0;
1417 if(pNeededToNormalize!=null) {
1418 pNeededToNormalize[0]=false;
1419 }
1420
1421 minC = (char)mode.getMinC();
1422 mask = mode.getMask();
1423 isNextBoundary = mode.getNextBoundary();
1424
1425 if(isNextBoundary==null) {
1426 destLength=0;
1427 c=src.next();
1428 if(c!=UCharacterIterator.DONE) {
1429 destLength=1;
1430 if(UTF16.isLeadSurrogate((char)c)) {
1431 c2= src.next();
1432 if(c2!= UCharacterIterator.DONE) {
1433 if(UTF16.isTrailSurrogate((char)c2)) {
1434 if(destCapacity>=2) {
1435 dest[1]=(char)c2; // trail surrogate
1436 destLength=2;
1437 }
1438 // lead surrogate to be written below
1439 } else {
1440 src.moveIndex(-1);
1441 }
1442 }
1443 }
1444
1445 if(destCapacity>0) {
1446 dest[0]=(char)c;
1447 }
1448 }
1449 return destLength;
1450 }
1451
1452 char[] buffer=new char[100];
1453 int[] startIndex = new int[1];
1454 bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask,
1455 buffer);
1456 if(bufferLength>0) {
1457 if(doNormalize) {
1458 destLength=mode.normalize(buffer,startIndex[0],bufferLength,
1459 dest,destStart,destLimit, options);
1460
1461 if(pNeededToNormalize!=null) {
1462 pNeededToNormalize[0]=(boolean)(destLength!=bufferLength ||
1463 Utility.arrayRegionMatches(buffer,startIndex[0],
1464 dest,destStart,
1465 destLength));
1466 }
1467 } else {
1468 /* just copy the source characters */
1469 if(destCapacity>0) {
1470 System.arraycopy(buffer,0,dest,destStart,
1471 Math.min(bufferLength,destCapacity)
1472 );
1473 }
1474
1475
1476 }
1477 }
1478 return destLength;
1479 }
1480
1481 private void clearBuffer() {
1482 bufferLimit=bufferStart=bufferPos=0;
1483 }
1484
1485 private boolean nextNormalize() {
1486
1487 clearBuffer();
1488 currentIndex=nextIndex;
1489 text.setIndex(nextIndex);
1490
1491 bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options);
1492
1493 nextIndex=text.getIndex();
1494 return (bufferLimit>0);
1495 }
1496
1497 private boolean previousNormalize() {
1498
1499 clearBuffer();
1500 nextIndex=currentIndex;
1501 text.setIndex(currentIndex);
1502 bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options);
1503
1504 currentIndex=text.getIndex();
1505 bufferPos = bufferLimit;
1506 return bufferLimit>0;
1507 }
1508
1509 private int getCodePointAt(int index) {
1510 if( UTF16.isSurrogate(buffer[index])) {
1511 if(UTF16.isLeadSurrogate(buffer[index])) {
1512 if((index+1)<bufferLimit &&
1513 UTF16.isTrailSurrogate(buffer[index+1])) {
1514 return UCharacterProperty.getRawSupplementary(
1515 buffer[index],
1516 buffer[index+1]
1517 );
1518 }
1519 }else if(UTF16.isTrailSurrogate(buffer[index])) {
1520 if(index>0 && UTF16.isLeadSurrogate(buffer[index-1])) {
1521 return UCharacterProperty.getRawSupplementary(
1522 buffer[index-1],
1523 buffer[index]
1524 );
1525 }
1526 }
1527 }
1528 return buffer[index];
1529
1530 }
1531
1532 /**
1533 * Internal API
1534 * @internal
1535 */
1536 public static boolean isNFSkippable(int c, Mode mode) {
1537 return mode.isNFSkippable(c);
1538 }
1539
1540 //
1541 // Options
1542 //
1543
1544 /*
1545 * Default option for Unicode 3.2.0 normalization.
1546 * Corrigendum 4 was fixed in Unicode 3.2.0 but isn't supported in
1547 * IDNA/StringPrep.
1548 * The public review issue #29 was fixed in Unicode 4.1.0. Corrigendum 5
1549 * allowed Unicode 3.2 to 4.0.1 to apply the fix for PRI #29, but it isn't
1550 * supported by IDNA/StringPrep as well as Corrigendum 4.
1551 */
1552 public static final int UNICODE_3_2_0_ORIGINAL =
1553 UNICODE_3_2 |
1554 NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
1555 NormalizerImpl.BEFORE_PRI_29;
1556
1557 /*
1558 * Default option for the latest Unicode normalization. This option is
1559 * provided mainly for testing.
1560 * The value zero means that normalization is done with the fixes for
1561 * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
1562 * - Corrigendum 5 (Normalization Idempotency)
1563 */
1564 public static final int UNICODE_LATEST = 0x00;
1565
1566 //
1567 // public constructor and methods for java.text.Normalizer and
1568 // sun.text.Normalizer
1569 //
1570
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of a given string, using the {@link #UNICODE_LATEST}
 * options.
 *
 * @param str  The string to be normalized.  The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 */
public NormalizerBase(String str, Mode mode) {
    // Same as the three-argument constructor with the default options.
    this(str, mode, UNICODE_LATEST);
}
1583
1584 /**
1585 * Normalizes a <code>String</code> using the given normalization form.
1586 *
1587 * @param str the input string to be normalized.
1588 * @param form the normalization form
1589 */
1590 public static String normalize(String str, Normalizer.Form form) {
1591 return normalize(str, form, UNICODE_LATEST);
1592 }
1593
1594 /**
1595 * Normalizes a <code>String</code> using the given normalization form.
1596 *
1597 * @param str the input string to be normalized.
1598 * @param form the normalization form
1599 * @param options the optional features to be enabled.
1600 */
1601 public static String normalize(String str, Normalizer.Form form, int options) {
1602 switch (form) {
1603 case NFC :
1604 return NFC.normalize(str, options);
1605 case NFD :
1606 return NFD.normalize(str, options);
1607 case NFKC :
1608 return NFKC.normalize(str, options);
1609 case NFKD :
1610 return NFKD.normalize(str, options);
1611 }
1612
1613 throw new IllegalArgumentException("Unexpected normalization form: " +
1614 form);
1615 }
1616
1617 /**
1618 * Test if a string is in a given normalization form.
1619 * This is semantically equivalent to source.equals(normalize(source, mode)).
1620 *
1621 * Unlike quickCheck(), this function returns a definitive result,
1622 * never a "maybe".
1623 * For NFD, NFKD, and FCD, both functions work exactly the same.
1624 * For NFC and NFKC where quickCheck may return "maybe", this function will
1625 * perform further tests to arrive at a true/false result.
1626 * @param str the input string to be checked to see if it is normalized
1627 * @param form the normalization form
1628 * @param options the optional features to be enabled.
1629 */
1630 public static boolean isNormalized(String str, Normalizer.Form form) {
1631 return isNormalized(str, form, UNICODE_LATEST);
1632 }
1633
1634 /**
1635 * Test if a string is in a given normalization form.
1636 * This is semantically equivalent to source.equals(normalize(source, mode)).
1637 *
1638 * Unlike quickCheck(), this function returns a definitive result,
1639 * never a "maybe".
1640 * For NFD, NFKD, and FCD, both functions work exactly the same.
1641 * For NFC and NFKC where quickCheck may return "maybe", this function will
1642 * perform further tests to arrive at a true/false result.
1643 * @param str the input string to be checked to see if it is normalized
1644 * @param form the normalization form
1645 * @param options the optional features to be enabled.
1646 */
1647 public static boolean isNormalized(String str, Normalizer.Form form, int options) {
1648 switch (form) {
1649 case NFC:
1650 return (NFC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1651 case NFD:
1652 return (NFD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1653 case NFKC:
1654 return (NFKC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1655 case NFKD:
1656 return (NFKD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1657 }
1658
1659 throw new IllegalArgumentException("Unexpected normalization form: " +
1660 form);
1661 }
1662}