blob: 52459be71904892f35c1cb9d142facc84a915d3b [file] [log] [blame]
claireho50294ea2010-05-03 15:44:48 -07001/*
2*******************************************************************************
3*
4* Copyright (C) 2009-2010, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: normalizer2impl.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2009nov22
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/normalizer2.h"
22#include "unicode/udata.h"
23#include "unicode/ustring.h"
24#include "cmemory.h"
25#include "mutex.h"
26#include "normalizer2impl.h"
27#include "uassert.h"
claireho27f65472011-06-09 11:11:49 -070028#include "uhash.h"
claireho50294ea2010-05-03 15:44:48 -070029#include "uset_imp.h"
30#include "utrie2.h"
claireho27f65472011-06-09 11:11:49 -070031#include "uvector.h"
claireho50294ea2010-05-03 15:44:48 -070032
33U_NAMESPACE_BEGIN
34
35// ReorderingBuffer -------------------------------------------------------- ***
36
37UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
38 int32_t length=str.length();
39 start=str.getBuffer(destCapacity);
40 if(start==NULL) {
41 // getBuffer() already did str.setToBogus()
42 errorCode=U_MEMORY_ALLOCATION_ERROR;
43 return FALSE;
44 }
45 limit=start+length;
46 remainingCapacity=str.getCapacity()-length;
47 reorderStart=start;
48 if(start==limit) {
49 lastCC=0;
50 } else {
51 setIterator();
52 lastCC=previousCC();
53 // Set reorderStart after the last code point with cc<=1 if there is one.
54 if(lastCC>1) {
55 while(previousCC()>1) {}
56 }
57 reorderStart=codePointLimit;
58 }
59 return TRUE;
60}
61
62UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
63 int32_t length=(int32_t)(limit-start);
64 return
65 length==(int32_t)(otherLimit-otherStart) &&
66 0==u_memcmp(start, otherStart, length);
67}
68
69UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
70 if(remainingCapacity<2 && !resize(2, errorCode)) {
71 return FALSE;
72 }
73 if(lastCC<=cc || cc==0) {
74 limit[0]=U16_LEAD(c);
75 limit[1]=U16_TRAIL(c);
76 limit+=2;
77 lastCC=cc;
78 if(cc<=1) {
79 reorderStart=limit;
80 }
81 } else {
82 insert(c, cc);
83 }
84 remainingCapacity-=2;
85 return TRUE;
86}
87
88UBool ReorderingBuffer::append(const UChar *s, int32_t length,
89 uint8_t leadCC, uint8_t trailCC,
90 UErrorCode &errorCode) {
91 if(length==0) {
92 return TRUE;
93 }
94 if(remainingCapacity<length && !resize(length, errorCode)) {
95 return FALSE;
96 }
97 remainingCapacity-=length;
98 if(lastCC<=leadCC || leadCC==0) {
99 if(trailCC<=1) {
100 reorderStart=limit+length;
101 } else if(leadCC<=1) {
102 reorderStart=limit+1; // Ok if not a code point boundary.
103 }
104 const UChar *sLimit=s+length;
105 do { *limit++=*s++; } while(s!=sLimit);
106 lastCC=trailCC;
107 } else {
108 int32_t i=0;
109 UChar32 c;
110 U16_NEXT(s, i, length, c);
111 insert(c, leadCC); // insert first code point
112 while(i<length) {
113 U16_NEXT(s, i, length, c);
114 if(i<length) {
115 // s must be in NFD, otherwise we need to use getCC().
116 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
117 } else {
118 leadCC=trailCC;
119 }
120 append(c, leadCC, errorCode);
121 }
122 }
123 return TRUE;
124}
125
126UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
127 int32_t cpLength=U16_LENGTH(c);
128 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
129 return FALSE;
130 }
131 remainingCapacity-=cpLength;
132 if(cpLength==1) {
133 *limit++=(UChar)c;
134 } else {
135 limit[0]=U16_LEAD(c);
136 limit[1]=U16_TRAIL(c);
137 limit+=2;
138 }
139 lastCC=0;
140 reorderStart=limit;
141 return TRUE;
142}
143
144UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
145 if(s==sLimit) {
146 return TRUE;
147 }
148 int32_t length=(int32_t)(sLimit-s);
149 if(remainingCapacity<length && !resize(length, errorCode)) {
150 return FALSE;
151 }
152 u_memcpy(limit, s, length);
153 limit+=length;
154 remainingCapacity-=length;
155 lastCC=0;
156 reorderStart=limit;
157 return TRUE;
158}
159
160void ReorderingBuffer::remove() {
161 reorderStart=limit=start;
162 remainingCapacity=str.getCapacity();
163 lastCC=0;
164}
165
166void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
167 if(suffixLength<(limit-start)) {
168 limit-=suffixLength;
169 remainingCapacity+=suffixLength;
170 } else {
171 limit=start;
172 remainingCapacity=str.getCapacity();
173 }
174 lastCC=0;
175 reorderStart=limit;
176}
177
178UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
179 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
180 int32_t length=(int32_t)(limit-start);
181 str.releaseBuffer(length);
182 int32_t newCapacity=length+appendLength;
183 int32_t doubleCapacity=2*str.getCapacity();
184 if(newCapacity<doubleCapacity) {
185 newCapacity=doubleCapacity;
186 }
187 if(newCapacity<256) {
188 newCapacity=256;
189 }
190 start=str.getBuffer(newCapacity);
191 if(start==NULL) {
192 // getBuffer() already did str.setToBogus()
193 errorCode=U_MEMORY_ALLOCATION_ERROR;
194 return FALSE;
195 }
196 reorderStart=start+reorderStartIndex;
197 limit=start+length;
198 remainingCapacity=str.getCapacity()-length;
199 return TRUE;
200}
201
202void ReorderingBuffer::skipPrevious() {
203 codePointLimit=codePointStart;
204 UChar c=*--codePointStart;
205 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
206 --codePointStart;
207 }
208}
209
210uint8_t ReorderingBuffer::previousCC() {
211 codePointLimit=codePointStart;
212 if(reorderStart>=codePointStart) {
213 return 0;
214 }
215 UChar32 c=*--codePointStart;
216 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
217 return 0;
218 }
219
220 UChar c2;
221 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
222 --codePointStart;
223 c=U16_GET_SUPPLEMENTARY(c2, c);
224 }
225 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
226}
227
228// Inserts c somewhere before the last character.
229// Requires 0<cc<lastCC which implies reorderStart<limit.
230void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
231 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
232 // insert c at codePointLimit, after the character with prevCC<=cc
233 UChar *q=limit;
234 UChar *r=limit+=U16_LENGTH(c);
235 do {
236 *--r=*--q;
237 } while(codePointLimit!=q);
238 writeCodePoint(q, c);
239 if(cc<=1) {
240 reorderStart=r;
241 }
242}
243
244// Normalizer2Impl --------------------------------------------------------- ***
245
claireho27f65472011-06-09 11:11:49 -0700246struct CanonIterData : public UMemory {
247 CanonIterData(UErrorCode &errorCode);
248 ~CanonIterData();
249 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
250 UTrie2 *trie;
251 UVector canonStartSets; // contains UnicodeSet *
252};
253
claireho50294ea2010-05-03 15:44:48 -0700254Normalizer2Impl::~Normalizer2Impl() {
255 udata_close(memory);
256 utrie2_close(normTrie);
257 UTrie2Singleton(fcdTrieSingleton).deleteInstance();
claireho27f65472011-06-09 11:11:49 -0700258 delete (CanonIterData *)canonIterDataSingleton.fInstance;
claireho50294ea2010-05-03 15:44:48 -0700259}
260
261UBool U_CALLCONV
262Normalizer2Impl::isAcceptable(void *context,
claireho27f65472011-06-09 11:11:49 -0700263 const char * /* type */, const char * /*name*/,
claireho50294ea2010-05-03 15:44:48 -0700264 const UDataInfo *pInfo) {
265 if(
266 pInfo->size>=20 &&
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
268 pInfo->charsetFamily==U_CHARSET_FAMILY &&
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
270 pInfo->dataFormat[1]==0x72 &&
271 pInfo->dataFormat[2]==0x6d &&
272 pInfo->dataFormat[3]==0x32 &&
273 pInfo->formatVersion[0]==1
274 ) {
275 Normalizer2Impl *me=(Normalizer2Impl *)context;
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
277 return TRUE;
278 } else {
279 return FALSE;
280 }
281}
282
283void
284Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) {
286 return;
287 }
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
289 if(U_FAILURE(errorCode)) {
290 return;
291 }
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
293 const int32_t *inIndexes=(const int32_t *)inBytes;
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
295 if(indexesLength<=IX_MIN_MAYBE_YES) {
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
297 return;
298 }
299
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
302
303 minYesNo=inIndexes[IX_MIN_YES_NO];
304 minNoNo=inIndexes[IX_MIN_NO_NO];
305 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
306 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
307
308 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
309 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
310 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
311 inBytes+offset, nextOffset-offset, NULL,
312 &errorCode);
313 if(U_FAILURE(errorCode)) {
314 return;
315 }
316
317 offset=nextOffset;
318 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
319 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
320}
321
322uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
323 UChar32 c;
324 if(cpStart==(cpLimit-1)) {
325 c=*cpStart;
326 } else {
327 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
328 }
329 uint16_t prevNorm16=getNorm16(c);
330 if(prevNorm16<=minYesNo) {
331 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
332 } else {
333 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
334 }
335}
336
337U_CDECL_BEGIN
338
339static UBool U_CALLCONV
340enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
341 /* add the start code point to the USet */
342 const USetAdder *sa=(const USetAdder *)context;
343 sa->add(sa->set, start);
344 return TRUE;
345}
346
claireho27f65472011-06-09 11:11:49 -0700347static uint32_t U_CALLCONV
348segmentStarterMapper(const void * /*context*/, uint32_t value) {
349 return value&CANON_NOT_SEGMENT_STARTER;
350}
351
claireho50294ea2010-05-03 15:44:48 -0700352U_CDECL_END
353
354void
claireho27f65472011-06-09 11:11:49 -0700355Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
claireho50294ea2010-05-03 15:44:48 -0700356 /* add the start code point of each same-value range of each trie */
357 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
358
359 /* add Hangul LV syllables and LV+1 because of skippables */
360 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
361 sa->add(sa->set, c);
362 sa->add(sa->set, c+1);
363 }
364 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
365}
366
claireho27f65472011-06-09 11:11:49 -0700367void
368Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
369 /* add the start code point of each same-value range of the canonical iterator data trie */
370 if(ensureCanonIterData(errorCode)) {
371 // currently only used for the SEGMENT_STARTER property
372 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
373 segmentStarterMapper, enumPropertyStartsRange, sa);
374 }
375}
376
claireho50294ea2010-05-03 15:44:48 -0700377const UChar *
378Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
379 UChar32 minNeedDataCP,
380 ReorderingBuffer *buffer,
381 UErrorCode &errorCode) const {
382 // Make some effort to support NUL-terminated strings reasonably.
383 // Take the part of the fast quick check loop that does not look up
384 // data and check the first part of the string.
385 // After this prefix, determine the string length to simplify the rest
386 // of the code.
387 const UChar *prevSrc=src;
388 UChar c;
389 while((c=*src++)<minNeedDataCP && c!=0) {}
390 // Back out the last character for full processing.
391 // Copy this prefix.
392 if(--src!=prevSrc) {
393 if(buffer!=NULL) {
394 buffer->appendZeroCC(prevSrc, src, errorCode);
395 }
396 }
397 return src;
398}
399
400// Dual functionality:
401// buffer!=NULL: normalize
402// buffer==NULL: isNormalized/spanQuickCheckYes
403const UChar *
404Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
405 ReorderingBuffer *buffer,
406 UErrorCode &errorCode) const {
407 UChar32 minNoCP=minDecompNoCP;
408 if(limit==NULL) {
409 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
410 if(U_FAILURE(errorCode)) {
411 return src;
412 }
413 limit=u_strchr(src, 0);
414 }
415
416 const UChar *prevSrc;
417 UChar32 c=0;
418 uint16_t norm16=0;
419
420 // only for quick check
421 const UChar *prevBoundary=src;
422 uint8_t prevCC=0;
423
424 for(;;) {
425 // count code units below the minimum or with irrelevant data for the quick check
426 for(prevSrc=src; src!=limit;) {
427 if( (c=*src)<minNoCP ||
428 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
429 ) {
430 ++src;
431 } else if(!U16_IS_SURROGATE(c)) {
432 break;
433 } else {
434 UChar c2;
435 if(U16_IS_SURROGATE_LEAD(c)) {
436 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
437 c=U16_GET_SUPPLEMENTARY(c, c2);
438 }
439 } else /* trail surrogate */ {
440 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
441 --src;
442 c=U16_GET_SUPPLEMENTARY(c2, c);
443 }
444 }
445 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
446 src+=U16_LENGTH(c);
447 } else {
448 break;
449 }
450 }
451 }
452 // copy these code units all at once
453 if(src!=prevSrc) {
454 if(buffer!=NULL) {
455 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
456 break;
457 }
458 } else {
459 prevCC=0;
460 prevBoundary=src;
461 }
462 }
463 if(src==limit) {
464 break;
465 }
466
467 // Check one above-minimum, relevant code point.
468 src+=U16_LENGTH(c);
469 if(buffer!=NULL) {
470 if(!decompose(c, norm16, *buffer, errorCode)) {
471 break;
472 }
473 } else {
474 if(isDecompYes(norm16)) {
475 uint8_t cc=getCCFromYesOrMaybe(norm16);
476 if(prevCC<=cc || cc==0) {
477 prevCC=cc;
478 if(cc<=1) {
479 prevBoundary=src;
480 }
481 continue;
482 }
483 }
484 return prevBoundary; // "no" or cc out of order
485 }
486 }
487 return src;
488}
489
490// Decompose a short piece of text which is likely to contain characters that
491// fail the quick check loop and/or where the quick check loop's overhead
492// is unlikely to be amortized.
493// Called by the compose() and makeFCD() implementations.
494UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
495 ReorderingBuffer &buffer,
496 UErrorCode &errorCode) const {
497 while(src<limit) {
498 UChar32 c;
499 uint16_t norm16;
500 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
501 if(!decompose(c, norm16, buffer, errorCode)) {
502 return FALSE;
503 }
504 }
505 return TRUE;
506}
507
508UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
509 ReorderingBuffer &buffer,
510 UErrorCode &errorCode) const {
511 // Only loops for 1:1 algorithmic mappings.
512 for(;;) {
513 // get the decomposition and the lead and trail cc's
514 if(isDecompYes(norm16)) {
515 // c does not decompose
516 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
517 } else if(isHangul(norm16)) {
518 // Hangul syllable: decompose algorithmically
519 UChar jamos[3];
520 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
521 } else if(isDecompNoAlgorithmic(norm16)) {
522 c=mapAlgorithmic(c, norm16);
523 norm16=getNorm16(c);
524 } else {
525 // c decomposes, get everything from the variable-length extra data
526 const uint16_t *mapping=getMapping(norm16);
527 uint16_t firstUnit=*mapping++;
528 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
529 uint8_t leadCC, trailCC;
530 trailCC=(uint8_t)(firstUnit>>8);
531 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
532 leadCC=(uint8_t)(*mapping++>>8);
533 } else {
534 leadCC=0;
535 }
536 return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
537 }
538 }
539}
540
541const UChar *
542Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
543 const UChar *decomp=NULL;
544 uint16_t norm16;
545 for(;;) {
546 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
547 // c does not decompose
548 return decomp;
549 } else if(isHangul(norm16)) {
550 // Hangul syllable: decompose algorithmically
551 length=Hangul::decompose(c, buffer);
552 return buffer;
553 } else if(isDecompNoAlgorithmic(norm16)) {
554 c=mapAlgorithmic(c, norm16);
555 decomp=buffer;
556 length=0;
557 U16_APPEND_UNSAFE(buffer, length, c);
558 } else {
559 // c decomposes, get everything from the variable-length extra data
560 const uint16_t *mapping=getMapping(norm16);
561 uint16_t firstUnit=*mapping++;
562 length=firstUnit&MAPPING_LENGTH_MASK;
563 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
564 ++mapping;
565 }
566 return (const UChar *)mapping;
567 }
568 }
569}
570
571void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
572 UBool doDecompose,
573 ReorderingBuffer &buffer,
574 UErrorCode &errorCode) const {
575 if(doDecompose) {
576 decompose(src, limit, &buffer, errorCode);
577 return;
578 }
579 // Just merge the strings at the boundary.
580 ForwardUTrie2StringIterator iter(normTrie, src, limit);
581 uint8_t firstCC, prevCC, cc;
582 firstCC=prevCC=cc=getCC(iter.next16());
583 while(cc!=0) {
584 prevCC=cc;
585 cc=getCC(iter.next16());
586 };
587 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
588 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
589}
590
591// Note: hasDecompBoundary() could be implemented as aliases to
592// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
593// at the cost of building the FCD trie for a decomposition normalizer.
594UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
595 for(;;) {
596 if(c<minDecompNoCP) {
597 return TRUE;
598 }
599 uint16_t norm16=getNorm16(c);
600 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
601 return TRUE;
602 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
603 return FALSE; // ccc!=0
604 } else if(isDecompNoAlgorithmic(norm16)) {
605 c=mapAlgorithmic(c, norm16);
606 } else {
607 // c decomposes, get everything from the variable-length extra data
608 const uint16_t *mapping=getMapping(norm16);
609 uint16_t firstUnit=*mapping++;
610 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
611 return FALSE;
612 }
613 if(!before) {
614 // decomp after-boundary: same as hasFCDBoundaryAfter(),
615 // fcd16<=1 || trailCC==0
616 if(firstUnit>0x1ff) {
617 return FALSE; // trailCC>1
618 }
619 if(firstUnit<=0xff) {
620 return TRUE; // trailCC==0
621 }
622 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
623 }
624 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
625 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
626 }
627 }
628}
629
630/*
631 * Finds the recomposition result for
632 * a forward-combining "lead" character,
633 * specified with a pointer to its compositions list,
634 * and a backward-combining "trail" character.
635 *
636 * If the lead and trail characters combine, then this function returns
637 * the following "compositeAndFwd" value:
638 * Bits 21..1 composite character
639 * Bit 0 set if the composite is a forward-combining starter
640 * otherwise it returns -1.
641 *
642 * The compositions list has (trail, compositeAndFwd) pair entries,
643 * encoded as either pairs or triples of 16-bit units.
644 * The last entry has the high bit of its first unit set.
645 *
646 * The list is sorted by ascending trail characters (there are no duplicates).
647 * A linear search is used.
648 *
649 * See normalizer2impl.h for a more detailed description
650 * of the compositions list format.
651 */
652int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
653 uint16_t key1, firstUnit;
654 if(trail<COMP_1_TRAIL_LIMIT) {
655 // trail character is 0..33FF
656 // result entry may have 2 or 3 units
657 key1=(uint16_t)(trail<<1);
658 while(key1>(firstUnit=*list)) {
659 list+=2+(firstUnit&COMP_1_TRIPLE);
660 }
661 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
662 if(firstUnit&COMP_1_TRIPLE) {
663 return ((int32_t)list[1]<<16)|list[2];
664 } else {
665 return list[1];
666 }
667 }
668 } else {
669 // trail character is 3400..10FFFF
670 // result entry has 3 units
671 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
claireho27f65472011-06-09 11:11:49 -0700672 (((trail>>COMP_1_TRAIL_SHIFT))&
673 ~COMP_1_TRIPLE));
claireho50294ea2010-05-03 15:44:48 -0700674 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
675 uint16_t secondUnit;
676 for(;;) {
677 if(key1>(firstUnit=*list)) {
678 list+=2+(firstUnit&COMP_1_TRIPLE);
679 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
680 if(key2>(secondUnit=list[1])) {
681 if(firstUnit&COMP_1_LAST_TUPLE) {
682 break;
683 } else {
684 list+=3;
685 }
686 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
687 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
688 } else {
689 break;
690 }
691 } else {
692 break;
693 }
694 }
695 }
696 return -1;
697}
698
claireho27f65472011-06-09 11:11:49 -0700699/**
700 * @param list some character's compositions list
701 * @param set recursively receives the composites from these compositions
702 */
703void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
704 uint16_t firstUnit;
705 int32_t compositeAndFwd;
706 do {
707 firstUnit=*list;
708 if((firstUnit&COMP_1_TRIPLE)==0) {
709 compositeAndFwd=list[1];
710 list+=2;
711 } else {
712 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
713 list+=3;
714 }
715 UChar32 composite=compositeAndFwd>>1;
716 if((compositeAndFwd&1)!=0) {
717 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
718 }
719 set.add(composite);
720 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
721}
722
claireho50294ea2010-05-03 15:44:48 -0700723/*
724 * Recomposes the buffer text starting at recomposeStartIndex
725 * (which is in NFD - decomposed and canonically ordered),
726 * and truncates the buffer contents.
727 *
728 * Note that recomposition never lengthens the text:
729 * Any character consists of either one or two code units;
730 * a composition may contain at most one more code unit than the original starter,
731 * while the combining mark that is removed has at least one code unit.
732 */
733void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
734 UBool onlyContiguous) const {
735 UChar *p=buffer.getStart()+recomposeStartIndex;
736 UChar *limit=buffer.getLimit();
737 if(p==limit) {
738 return;
739 }
740
741 UChar *starter, *pRemove, *q, *r;
742 const uint16_t *compositionsList;
743 UChar32 c, compositeAndFwd;
744 uint16_t norm16;
745 uint8_t cc, prevCC;
746 UBool starterIsSupplementary;
747
748 // Some of the following variables are not used until we have a forward-combining starter
749 // and are only initialized now to avoid compiler warnings.
750 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
751 starter=NULL;
752 starterIsSupplementary=FALSE;
753 prevCC=0;
754
755 for(;;) {
756 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
757 cc=getCCFromYesOrMaybe(norm16);
758 if( // this character combines backward and
759 isMaybe(norm16) &&
760 // we have seen a starter that combines forward and
761 compositionsList!=NULL &&
762 // the backward-combining character is not blocked
763 (prevCC<cc || prevCC==0)
764 ) {
765 if(isJamoVT(norm16)) {
766 // c is a Jamo V/T, see if we can compose it with the previous character.
767 if(c<Hangul::JAMO_T_BASE) {
768 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
769 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
770 if(prev<Hangul::JAMO_L_COUNT) {
771 pRemove=p-1;
772 UChar syllable=(UChar)
773 (Hangul::HANGUL_BASE+
774 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
775 Hangul::JAMO_T_COUNT);
776 UChar t;
777 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
778 ++p;
779 syllable+=t; // The next character was a Jamo T.
780 }
781 *starter=syllable;
782 // remove the Jamo V/T
783 q=pRemove;
784 r=p;
785 while(r<limit) {
786 *q++=*r++;
787 }
788 limit=q;
789 p=pRemove;
790 }
791 }
792 /*
793 * No "else" for Jamo T:
794 * Since the input is in NFD, there are no Hangul LV syllables that
795 * a Jamo T could combine with.
796 * All Jamo Ts are combined above when handling Jamo Vs.
797 */
798 if(p==limit) {
799 break;
800 }
801 compositionsList=NULL;
802 continue;
803 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
804 // The starter and the combining mark (c) do combine.
805 UChar32 composite=compositeAndFwd>>1;
806
807 // Replace the starter with the composite, remove the combining mark.
808 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
809 if(starterIsSupplementary) {
810 if(U_IS_SUPPLEMENTARY(composite)) {
811 // both are supplementary
812 starter[0]=U16_LEAD(composite);
813 starter[1]=U16_TRAIL(composite);
814 } else {
815 *starter=(UChar)composite;
816 // The composite is shorter than the starter,
817 // move the intermediate characters forward one.
818 starterIsSupplementary=FALSE;
819 q=starter+1;
820 r=q+1;
821 while(r<pRemove) {
822 *q++=*r++;
823 }
824 --pRemove;
825 }
826 } else if(U_IS_SUPPLEMENTARY(composite)) {
827 // The composite is longer than the starter,
828 // move the intermediate characters back one.
829 starterIsSupplementary=TRUE;
830 ++starter; // temporarily increment for the loop boundary
831 q=pRemove;
832 r=++pRemove;
833 while(starter<q) {
834 *--r=*--q;
835 }
836 *starter=U16_TRAIL(composite);
837 *--starter=U16_LEAD(composite); // undo the temporary increment
838 } else {
839 // both are on the BMP
840 *starter=(UChar)composite;
841 }
842
843 /* remove the combining mark by moving the following text over it */
844 if(pRemove<p) {
845 q=pRemove;
846 r=p;
847 while(r<limit) {
848 *q++=*r++;
849 }
850 limit=q;
851 p=pRemove;
852 }
853 // Keep prevCC because we removed the combining mark.
854
855 if(p==limit) {
856 break;
857 }
858 // Is the composite a starter that combines forward?
859 if(compositeAndFwd&1) {
860 compositionsList=
861 getCompositionsListForComposite(getNorm16(composite));
862 } else {
863 compositionsList=NULL;
864 }
865
866 // We combined; continue with looking for compositions.
867 continue;
868 }
869 }
870
871 // no combination this time
872 prevCC=cc;
873 if(p==limit) {
874 break;
875 }
876
877 // If c did not combine, then check if it is a starter.
878 if(cc==0) {
879 // Found a new starter.
880 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
881 // It may combine with something, prepare for it.
882 if(U_IS_BMP(c)) {
883 starterIsSupplementary=FALSE;
884 starter=p-1;
885 } else {
886 starterIsSupplementary=TRUE;
887 starter=p-2;
888 }
889 }
890 } else if(onlyContiguous) {
891 // FCC: no discontiguous compositions; any intervening character blocks.
892 compositionsList=NULL;
893 }
894 }
895 buffer.setReorderingLimit(limit);
896}
897
898// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
899// doCompose: normalize
900// !doCompose: isNormalized (buffer must be empty and initialized)
901UBool
902Normalizer2Impl::compose(const UChar *src, const UChar *limit,
903 UBool onlyContiguous,
904 UBool doCompose,
905 ReorderingBuffer &buffer,
906 UErrorCode &errorCode) const {
claireho50294ea2010-05-03 15:44:48 -0700907 /*
908 * prevBoundary points to the last character before the current one
909 * that has a composition boundary before it with ccc==0 and quick check "yes".
910 * Keeping track of prevBoundary saves us looking for a composition boundary
911 * when we find a "no" or "maybe".
912 *
913 * When we back out from prevSrc back to prevBoundary,
914 * then we also remove those same characters (which had been simply copied
915 * or canonically-order-inserted) from the ReorderingBuffer.
916 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
917 * must correspond 1:1 to destination units at the end of the destination buffer.
918 */
919 const UChar *prevBoundary=src;
claireho27f65472011-06-09 11:11:49 -0700920 UChar32 minNoMaybeCP=minCompNoMaybeCP;
921 if(limit==NULL) {
922 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
923 doCompose ? &buffer : NULL,
924 errorCode);
925 if(U_FAILURE(errorCode)) {
926 return FALSE;
927 }
928 if(prevBoundary<src) {
929 // Set prevBoundary to the last character in the prefix.
930 prevBoundary=src-1;
931 }
932 limit=u_strchr(src, 0);
933 }
934
claireho50294ea2010-05-03 15:44:48 -0700935 const UChar *prevSrc;
936 UChar32 c=0;
937 uint16_t norm16=0;
938
939 // only for isNormalized
940 uint8_t prevCC=0;
941
942 for(;;) {
943 // count code units below the minimum or with irrelevant data for the quick check
944 for(prevSrc=src; src!=limit;) {
945 if( (c=*src)<minNoMaybeCP ||
946 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
947 ) {
948 ++src;
949 } else if(!U16_IS_SURROGATE(c)) {
950 break;
951 } else {
952 UChar c2;
953 if(U16_IS_SURROGATE_LEAD(c)) {
954 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
955 c=U16_GET_SUPPLEMENTARY(c, c2);
956 }
957 } else /* trail surrogate */ {
958 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
959 --src;
960 c=U16_GET_SUPPLEMENTARY(c2, c);
961 }
962 }
963 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
964 src+=U16_LENGTH(c);
965 } else {
966 break;
967 }
968 }
969 }
970 // copy these code units all at once
971 if(src!=prevSrc) {
972 if(doCompose) {
973 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
974 break;
975 }
976 } else {
977 prevCC=0;
978 }
979 if(src==limit) {
980 break;
981 }
982 // Set prevBoundary to the last character in the quick check loop.
983 prevBoundary=src-1;
984 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
985 U16_IS_LEAD(*(prevBoundary-1))
986 ) {
987 --prevBoundary;
988 }
989 // The start of the current character (c).
990 prevSrc=src;
991 } else if(src==limit) {
992 break;
993 }
994
995 src+=U16_LENGTH(c);
996 /*
997 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
998 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
999 * or has ccc!=0.
1000 * Check for Jamo V/T, then for regular characters.
1001 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1002 */
1003 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1004 UChar prev=*(prevSrc-1);
1005 UBool needToDecompose=FALSE;
1006 if(c<Hangul::JAMO_T_BASE) {
1007 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1008 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1009 if(prev<Hangul::JAMO_L_COUNT) {
1010 if(!doCompose) {
1011 return FALSE;
1012 }
1013 UChar syllable=(UChar)
1014 (Hangul::HANGUL_BASE+
1015 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1016 Hangul::JAMO_T_COUNT);
1017 UChar t;
1018 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1019 ++src;
1020 syllable+=t; // The next character was a Jamo T.
1021 prevBoundary=src;
1022 buffer.setLastChar(syllable);
1023 continue;
1024 }
1025 // If we see L+V+x where x!=T then we drop to the slow path,
1026 // decompose and recompose.
1027 // This is to deal with NFKC finding normal L and V but a
1028 // compatibility variant of a T. We need to either fully compose that
1029 // combination here (which would complicate the code and may not work
1030 // with strange custom data) or use the slow path -- or else our replacing
1031 // two input characters (L+V) with one output character (LV syllable)
1032 // would violate the invariant that [prevBoundary..prevSrc[ has the same
1033 // length as what we appended to the buffer since prevBoundary.
1034 needToDecompose=TRUE;
1035 }
1036 } else if(Hangul::isHangulWithoutJamoT(prev)) {
1037 // c is a Jamo Trailing consonant,
1038 // compose with previous Hangul LV that does not contain a Jamo T.
1039 if(!doCompose) {
1040 return FALSE;
1041 }
1042 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1043 prevBoundary=src;
1044 continue;
1045 }
1046 if(!needToDecompose) {
1047 // The Jamo V/T did not compose into a Hangul syllable.
1048 if(doCompose) {
1049 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1050 break;
1051 }
1052 } else {
1053 prevCC=0;
1054 }
1055 continue;
1056 }
1057 }
1058 /*
1059 * Source buffer pointers:
1060 *
1061 * all done quick check current char not yet
1062 * "yes" but (c) processed
1063 * may combine
1064 * forward
1065 * [-------------[-------------[-------------[-------------[
1066 * | | | | |
1067 * orig. src prevBoundary prevSrc src limit
1068 *
1069 *
1070 * Destination buffer pointers inside the ReorderingBuffer:
1071 *
1072 * all done might take not filled yet
1073 * characters for
1074 * reordering
1075 * [-------------[-------------[-------------[
1076 * | | | |
1077 * start reorderStart limit |
1078 * +remainingCap.+
1079 */
1080 if(norm16>=MIN_YES_YES_WITH_CC) {
1081 uint8_t cc=(uint8_t)norm16; // cc!=0
1082 if( onlyContiguous && // FCC
1083 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1084 prevBoundary<prevSrc &&
1085 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1086 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1087 // passed the quick check "yes && ccc==0" test.
1088 // Check whether the last character was a "yesYes" or a "yesNo".
1089 // If a "yesNo", then we get its trailing ccc from its
1090 // mapping and check for canonical order.
1091 // All other cases are ok.
1092 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1093 ) {
1094 // Fails FCD test, need to decompose and contiguously recompose.
1095 if(!doCompose) {
1096 return FALSE;
1097 }
1098 } else if(doCompose) {
1099 if(!buffer.append(c, cc, errorCode)) {
1100 break;
1101 }
1102 continue;
1103 } else if(prevCC<=cc) {
1104 prevCC=cc;
1105 continue;
1106 } else {
1107 return FALSE;
1108 }
1109 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1110 return FALSE;
1111 }
1112
1113 /*
1114 * Find appropriate boundaries around this character,
1115 * decompose the source text from between the boundaries,
1116 * and recompose it.
1117 *
1118 * We may need to remove the last few characters from the ReorderingBuffer
1119 * to account for source text that was copied or appended
1120 * but needs to take part in the recomposition.
1121 */
1122
1123 /*
1124 * Find the last composition boundary in [prevBoundary..src[.
1125 * It is either the decomposition of the current character (at prevSrc),
1126 * or prevBoundary.
1127 */
1128 if(hasCompBoundaryBefore(c, norm16)) {
1129 prevBoundary=prevSrc;
1130 } else if(doCompose) {
1131 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1132 }
1133
1134 // Find the next composition boundary in [src..limit[ -
1135 // modifies src to point to the next starter.
1136 src=(UChar *)findNextCompBoundary(src, limit);
1137
1138 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1139 int32_t recomposeStartIndex=buffer.length();
1140 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1141 break;
1142 }
1143 recompose(buffer, recomposeStartIndex, onlyContiguous);
1144 if(!doCompose) {
1145 if(!buffer.equals(prevBoundary, src)) {
1146 return FALSE;
1147 }
1148 buffer.remove();
1149 prevCC=0;
1150 }
1151
1152 // Move to the next starter. We never need to look back before this point again.
1153 prevBoundary=src;
1154 }
1155 return TRUE;
1156}
1157
1158// Very similar to compose(): Make the same changes in both places if relevant.
1159// pQCResult==NULL: spanQuickCheckYes
1160// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1161const UChar *
1162Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1163 UBool onlyContiguous,
1164 UNormalizationCheckResult *pQCResult) const {
claireho50294ea2010-05-03 15:44:48 -07001165 /*
1166 * prevBoundary points to the last character before the current one
1167 * that has a composition boundary before it with ccc==0 and quick check "yes".
1168 */
1169 const UChar *prevBoundary=src;
claireho27f65472011-06-09 11:11:49 -07001170 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1171 if(limit==NULL) {
1172 UErrorCode errorCode=U_ZERO_ERROR;
1173 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1174 if(prevBoundary<src) {
1175 // Set prevBoundary to the last character in the prefix.
1176 prevBoundary=src-1;
1177 }
1178 limit=u_strchr(src, 0);
1179 }
1180
claireho50294ea2010-05-03 15:44:48 -07001181 const UChar *prevSrc;
1182 UChar32 c=0;
1183 uint16_t norm16=0;
1184 uint8_t prevCC=0;
1185
1186 for(;;) {
1187 // count code units below the minimum or with irrelevant data for the quick check
1188 for(prevSrc=src;;) {
1189 if(src==limit) {
1190 return src;
1191 }
1192 if( (c=*src)<minNoMaybeCP ||
1193 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1194 ) {
1195 ++src;
1196 } else if(!U16_IS_SURROGATE(c)) {
1197 break;
1198 } else {
1199 UChar c2;
1200 if(U16_IS_SURROGATE_LEAD(c)) {
1201 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1202 c=U16_GET_SUPPLEMENTARY(c, c2);
1203 }
1204 } else /* trail surrogate */ {
1205 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1206 --src;
1207 c=U16_GET_SUPPLEMENTARY(c2, c);
1208 }
1209 }
1210 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1211 src+=U16_LENGTH(c);
1212 } else {
1213 break;
1214 }
1215 }
1216 }
1217 if(src!=prevSrc) {
1218 // Set prevBoundary to the last character in the quick check loop.
1219 prevBoundary=src-1;
1220 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1221 U16_IS_LEAD(*(prevBoundary-1))
1222 ) {
1223 --prevBoundary;
1224 }
1225 prevCC=0;
1226 // The start of the current character (c).
1227 prevSrc=src;
1228 }
1229
1230 src+=U16_LENGTH(c);
1231 /*
1232 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1233 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1234 * or has ccc!=0.
1235 */
1236 if(isMaybeOrNonZeroCC(norm16)) {
1237 uint8_t cc=getCCFromYesOrMaybe(norm16);
1238 if( onlyContiguous && // FCC
1239 cc!=0 &&
1240 prevCC==0 &&
1241 prevBoundary<prevSrc &&
1242 // prevCC==0 && prevBoundary<prevSrc tell us that
1243 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1244 // passed the quick check "yes && ccc==0" test.
1245 // Check whether the last character was a "yesYes" or a "yesNo".
1246 // If a "yesNo", then we get its trailing ccc from its
1247 // mapping and check for canonical order.
1248 // All other cases are ok.
1249 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1250 ) {
1251 // Fails FCD test.
1252 } else if(prevCC<=cc || cc==0) {
1253 prevCC=cc;
1254 if(norm16<MIN_YES_YES_WITH_CC) {
1255 if(pQCResult!=NULL) {
1256 *pQCResult=UNORM_MAYBE;
1257 } else {
1258 return prevBoundary;
1259 }
1260 }
1261 continue;
1262 }
1263 }
1264 if(pQCResult!=NULL) {
1265 *pQCResult=UNORM_NO;
1266 }
1267 return prevBoundary;
1268 }
1269}
1270
1271void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1272 UBool doCompose,
1273 UBool onlyContiguous,
1274 ReorderingBuffer &buffer,
1275 UErrorCode &errorCode) const {
1276 if(!buffer.isEmpty()) {
1277 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1278 if(src!=firstStarterInSrc) {
1279 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1280 buffer.getLimit());
1281 UnicodeString middle(lastStarterInDest,
1282 (int32_t)(buffer.getLimit()-lastStarterInDest));
1283 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
1284 middle.append(src, (int32_t)(firstStarterInSrc-src));
1285 const UChar *middleStart=middle.getBuffer();
1286 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1287 TRUE, buffer, errorCode);
1288 if(U_FAILURE(errorCode)) {
1289 return;
1290 }
1291 src=firstStarterInSrc;
1292 }
1293 }
1294 if(doCompose) {
1295 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1296 } else {
1297 buffer.appendZeroCC(src, limit, errorCode);
1298 }
1299}
1300
1301/**
1302 * Does c have a composition boundary before it?
1303 * True if its decomposition begins with a character that has
1304 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1305 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1306 * (isCompYesAndZeroCC()) so we need not decompose.
1307 */
1308UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1309 for(;;) {
1310 if(isCompYesAndZeroCC(norm16)) {
1311 return TRUE;
1312 } else if(isMaybeOrNonZeroCC(norm16)) {
1313 return FALSE;
1314 } else if(isDecompNoAlgorithmic(norm16)) {
1315 c=mapAlgorithmic(c, norm16);
1316 norm16=getNorm16(c);
1317 } else {
1318 // c decomposes, get everything from the variable-length extra data
1319 const uint16_t *mapping=getMapping(norm16);
1320 uint16_t firstUnit=*mapping++;
1321 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1322 return FALSE;
1323 }
1324 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
1325 return FALSE; // non-zero leadCC
1326 }
1327 int32_t i=0;
1328 UChar32 c;
1329 U16_NEXT_UNSAFE(mapping, i, c);
1330 return isCompYesAndZeroCC(getNorm16(c));
1331 }
1332 }
1333}
1334
1335UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1336 for(;;) {
1337 uint16_t norm16=getNorm16(c);
1338 if(isInert(norm16)) {
1339 return TRUE;
1340 } else if(norm16<=minYesNo) {
1341 // Hangul LVT (==minYesNo) has a boundary after it.
1342 // Hangul LV and non-inert yesYes characters combine forward.
1343 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1344 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1345 return FALSE;
1346 } else if(isDecompNoAlgorithmic(norm16)) {
1347 c=mapAlgorithmic(c, norm16);
1348 } else {
1349 // c decomposes, get everything from the variable-length extra data.
1350 // If testInert, then c must be a yesNo character which has lccc=0,
1351 // otherwise it could be a noNo.
1352 const uint16_t *mapping=getMapping(norm16);
1353 uint16_t firstUnit=*mapping;
1354 // TRUE if
1355 // c is not deleted, and
1356 // it and its decomposition do not combine forward, and it has a starter, and
1357 // if FCC then trailCC<=1
1358 return
1359 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
1360 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
1361 (!onlyContiguous || firstUnit<=0x1ff);
1362 }
1363 }
1364}
1365
1366const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1367 BackwardUTrie2StringIterator iter(normTrie, start, p);
1368 uint16_t norm16;
1369 do {
1370 norm16=iter.previous16();
1371 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1372 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1373 // but that's probably not worth the extra cost.
1374 return iter.codePointStart;
1375}
1376
1377const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1378 ForwardUTrie2StringIterator iter(normTrie, p, limit);
1379 uint16_t norm16;
1380 do {
1381 norm16=iter.next16();
1382 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1383 return iter.codePointStart;
1384}
1385
1386class FCDTrieSingleton : public UTrie2Singleton {
1387public:
1388 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1389 UTrie2Singleton(s), impl(ni), errorCode(ec) {}
1390 UTrie2 *getInstance(UErrorCode &errorCode) {
1391 return UTrie2Singleton::getInstance(createInstance, this, errorCode);
1392 }
1393 static void *createInstance(const void *context, UErrorCode &errorCode);
1394 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1395 if(value!=0) {
1396 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
1397 }
1398 return U_SUCCESS(errorCode);
1399 }
1400
1401 Normalizer2Impl &impl;
1402 UTrie2 *newFCDTrie;
1403 UErrorCode &errorCode;
1404};
1405
1406U_CDECL_BEGIN
1407
1408// Set the FCD value for a range of same-norm16 characters.
1409static UBool U_CALLCONV
1410enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1411 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
1412}
1413
1414// Collect (OR together) the FCD values for a range of supplementary characters,
1415// for their lead surrogate code unit.
1416static UBool U_CALLCONV
claireho27f65472011-06-09 11:11:49 -07001417enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
claireho50294ea2010-05-03 15:44:48 -07001418 *((uint32_t *)context)|=value;
1419 return TRUE;
1420}
1421
1422U_CDECL_END
1423
1424void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1425 FCDTrieSingleton *me=(FCDTrieSingleton *)context;
1426 me->newFCDTrie=utrie2_open(0, 0, &errorCode);
1427 if(U_SUCCESS(errorCode)) {
1428 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
1429 for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1430 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
1431 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
1432 if(oredValue!=0) {
1433 // Set a "bad" value for makeFCD() to break the quick check loop
1434 // and look up the value for the supplementary code point.
1435 // If there is any lccc, then set the worst-case lccc of 1.
1436 // The ORed-together value's tccc is already the worst case.
1437 if(oredValue>0xff) {
1438 oredValue=0x100|(oredValue&0xff);
1439 }
1440 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
1441 }
1442 }
1443 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
1444 if(U_SUCCESS(errorCode)) {
1445 return me->newFCDTrie;
1446 }
1447 }
1448 utrie2_close(me->newFCDTrie);
1449 return NULL;
1450}
1451
1452void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1453 UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
1454 // Only loops for 1:1 algorithmic mappings.
1455 for(;;) {
1456 if(norm16>=MIN_NORMAL_MAYBE_YES) {
1457 norm16&=0xff;
1458 norm16|=norm16<<8;
1459 } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
1460 // no decomposition or Hangul syllable, all zeros
1461 break;
1462 } else if(limitNoNo<=norm16) {
1463 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
1464 if(start==end) {
1465 start+=delta;
1466 norm16=getNorm16(start);
1467 } else {
1468 // the same delta leads from different original characters to different mappings
1469 do {
1470 UChar32 c=start+delta;
1471 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
1472 } while(++start<=end);
1473 break;
1474 }
1475 } else {
1476 // c decomposes, get everything from the variable-length extra data
1477 const uint16_t *mapping=getMapping(norm16);
1478 uint16_t firstUnit=*mapping;
1479 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1480 // A character that is deleted (maps to an empty string) must
1481 // get the worst-case lccc and tccc values because arbitrary
1482 // characters on both sides will become adjacent.
1483 norm16=0x1ff;
1484 } else {
1485 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1486 norm16=mapping[1]&0xff00; // lccc
1487 } else {
1488 norm16=0;
1489 }
1490 norm16|=firstUnit>>8; // tccc
1491 }
1492 }
1493 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
1494 break;
1495 }
1496}
1497
1498const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
1499 // Logically const: Synchronized instantiation.
1500 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1501 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
1502}
1503
1504// Dual functionality:
1505// buffer!=NULL: normalize
1506// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1507const UChar *
1508Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1509 ReorderingBuffer *buffer,
1510 UErrorCode &errorCode) const {
claireho27f65472011-06-09 11:11:49 -07001511 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1512 // Similar to the prevBoundary in the compose() implementation.
1513 const UChar *prevBoundary=src;
1514 int32_t prevFCD16=0;
claireho50294ea2010-05-03 15:44:48 -07001515 if(limit==NULL) {
1516 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1517 if(U_FAILURE(errorCode)) {
1518 return src;
1519 }
claireho27f65472011-06-09 11:11:49 -07001520 if(prevBoundary<src) {
1521 prevBoundary=src;
1522 // We know that the previous character's lccc==0.
1523 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1524 prevFCD16=getFCD16FromSingleLead(*(src-1));
1525 if(prevFCD16>1) {
1526 --prevBoundary;
1527 }
1528 }
claireho50294ea2010-05-03 15:44:48 -07001529 limit=u_strchr(src, 0);
1530 }
1531
1532 // Note: In this function we use buffer->appendZeroCC() because we track
1533 // the lead and trail combining classes here, rather than leaving it to
1534 // the ReorderingBuffer.
1535 // The exception is the call to decomposeShort() which uses the buffer
1536 // in the normal way.
1537
1538 const UTrie2 *trie=fcdTrie();
1539
claireho50294ea2010-05-03 15:44:48 -07001540 const UChar *prevSrc;
1541 UChar32 c=0;
claireho50294ea2010-05-03 15:44:48 -07001542 uint16_t fcd16=0;
1543
1544 for(;;) {
1545 // count code units with lccc==0
1546 for(prevSrc=src; src!=limit;) {
1547 if((c=*src)<MIN_CCC_LCCC_CP) {
1548 prevFCD16=~c;
1549 ++src;
1550 } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
1551 prevFCD16=fcd16;
1552 ++src;
1553 } else if(!U16_IS_SURROGATE(c)) {
1554 break;
1555 } else {
1556 UChar c2;
1557 if(U16_IS_SURROGATE_LEAD(c)) {
1558 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1559 c=U16_GET_SUPPLEMENTARY(c, c2);
1560 }
1561 } else /* trail surrogate */ {
1562 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1563 --src;
1564 c=U16_GET_SUPPLEMENTARY(c2, c);
1565 }
1566 }
1567 if((fcd16=getFCD16(c))<=0xff) {
1568 prevFCD16=fcd16;
1569 src+=U16_LENGTH(c);
1570 } else {
1571 break;
1572 }
1573 }
1574 }
1575 // copy these code units all at once
1576 if(src!=prevSrc) {
1577 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1578 break;
1579 }
1580 if(src==limit) {
1581 break;
1582 }
1583 prevBoundary=src;
1584 // We know that the previous character's lccc==0.
1585 if(prevFCD16<0) {
1586 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1587 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
1588 if(prevFCD16>1) {
1589 --prevBoundary;
1590 }
1591 } else {
1592 const UChar *p=src-1;
1593 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1594 --p;
1595 // Need to fetch the previous character's FCD value because
1596 // prevFCD16 was just for the trail surrogate code point.
1597 prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
1598 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1599 }
1600 if(prevFCD16>1) {
1601 prevBoundary=p;
1602 }
1603 }
1604 // The start of the current character (c).
1605 prevSrc=src;
1606 } else if(src==limit) {
1607 break;
1608 }
1609
1610 src+=U16_LENGTH(c);
1611 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1612 // Check for proper order, and decompose locally if necessary.
1613 if((prevFCD16&0xff)<=(fcd16>>8)) {
1614 // proper order: prev tccc <= current lccc
1615 if((fcd16&0xff)<=1) {
1616 prevBoundary=src;
1617 }
1618 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1619 break;
1620 }
1621 prevFCD16=fcd16;
1622 continue;
1623 } else if(buffer==NULL) {
1624 return prevBoundary; // quick check "no"
1625 } else {
1626 /*
1627 * Back out the part of the source that we copied or appended
1628 * already but is now going to be decomposed.
1629 * prevSrc is set to after what was copied/appended.
1630 */
1631 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1632 /*
1633 * Find the part of the source that needs to be decomposed,
1634 * up to the next safe boundary.
1635 */
1636 src=findNextFCDBoundary(src, limit);
1637 /*
1638 * The source text does not fulfill the conditions for FCD.
1639 * Decompose and reorder a limited piece of the text.
1640 */
1641 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1642 break;
1643 }
1644 prevBoundary=src;
1645 prevFCD16=0;
1646 }
1647 }
1648 return src;
1649}
1650
1651void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1652 UBool doMakeFCD,
1653 ReorderingBuffer &buffer,
1654 UErrorCode &errorCode) const {
1655 if(!buffer.isEmpty()) {
1656 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1657 if(src!=firstBoundaryInSrc) {
1658 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1659 buffer.getLimit());
1660 UnicodeString middle(lastBoundaryInDest,
1661 (int32_t)(buffer.getLimit()-lastBoundaryInDest));
1662 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
1663 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1664 const UChar *middleStart=middle.getBuffer();
1665 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1666 if(U_FAILURE(errorCode)) {
1667 return;
1668 }
1669 src=firstBoundaryInSrc;
1670 }
1671 }
1672 if(doMakeFCD) {
1673 makeFCD(src, limit, &buffer, errorCode);
1674 } else {
1675 buffer.appendZeroCC(src, limit, errorCode);
1676 }
1677}
1678
1679const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1680 BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
1681 uint16_t fcd16;
1682 do {
1683 fcd16=iter.previous16();
1684 } while(fcd16>0xff);
1685 return iter.codePointStart;
1686}
1687
1688const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1689 ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
1690 uint16_t fcd16;
1691 do {
1692 fcd16=iter.next16();
1693 } while(fcd16>0xff);
1694 return iter.codePointStart;
1695}
1696
claireho27f65472011-06-09 11:11:49 -07001697// CanonicalIterator data -------------------------------------------------- ***
1698
1699CanonIterData::CanonIterData(UErrorCode &errorCode) :
1700 trie(utrie2_open(0, 0, &errorCode)),
1701 canonStartSets(uhash_deleteUObject, NULL, errorCode) {}
1702
1703CanonIterData::~CanonIterData() {
1704 utrie2_close(trie);
1705}
1706
1707void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1708 uint32_t canonValue=utrie2_get32(trie, decompLead);
1709 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1710 // origin is the first character whose decomposition starts with
1711 // the character for which we are setting the value.
1712 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1713 } else {
1714 // origin is not the first character, or it is U+0000.
1715 UnicodeSet *set;
1716 if((canonValue&CANON_HAS_SET)==0) {
1717 set=new UnicodeSet;
1718 if(set==NULL) {
1719 errorCode=U_MEMORY_ALLOCATION_ERROR;
1720 return;
1721 }
1722 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1723 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1724 utrie2_set32(trie, decompLead, canonValue, &errorCode);
1725 canonStartSets.addElement(set, errorCode);
1726 if(firstOrigin!=0) {
1727 set->add(firstOrigin);
1728 }
1729 } else {
1730 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1731 }
1732 set->add(origin);
1733 }
1734}
1735
1736class CanonIterDataSingleton {
1737public:
1738 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1739 singleton(s), impl(ni), errorCode(ec) {}
1740 CanonIterData *getInstance(UErrorCode &errorCode) {
1741 void *duplicate;
1742 CanonIterData *instance=
1743 (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
1744 delete (CanonIterData *)duplicate;
1745 return instance;
1746 }
1747 static void *createInstance(const void *context, UErrorCode &errorCode);
1748 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1749 if(value!=0) {
1750 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
1751 }
1752 return U_SUCCESS(errorCode);
1753 }
1754
1755private:
1756 SimpleSingleton &singleton;
1757 Normalizer2Impl &impl;
1758 CanonIterData *newData;
1759 UErrorCode &errorCode;
1760};
1761
1762U_CDECL_BEGIN
1763
1764// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1765static UBool U_CALLCONV
1766enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1767 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
1768}
1769
1770U_CDECL_END
1771
1772void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1773 CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
1774 me->newData=new CanonIterData(errorCode);
1775 if(me->newData==NULL) {
1776 errorCode=U_MEMORY_ALLOCATION_ERROR;
1777 return NULL;
1778 }
1779 if(U_SUCCESS(errorCode)) {
1780 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
1781 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1782 if(U_SUCCESS(errorCode)) {
1783 return me->newData;
1784 }
1785 }
1786 delete me->newData;
1787 return NULL;
1788}
1789
1790void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1791 CanonIterData &newData,
1792 UErrorCode &errorCode) const {
1793 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1794 // Inert, or 2-way mapping (including Hangul syllable).
1795 // We do not write a canonStartSet for any yesNo character.
1796 // Composites from 2-way mappings are added at runtime from the
1797 // starter's compositions list, and the other characters in
1798 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1799 // "maybe" characters.
1800 return;
1801 }
1802 for(UChar32 c=start; c<=end; ++c) {
1803 uint32_t oldValue=utrie2_get32(newData.trie, c);
1804 uint32_t newValue=oldValue;
1805 if(norm16>=minMaybeYes) {
1806 // not a segment starter if it occurs in a decomposition or has cc!=0
1807 newValue|=CANON_NOT_SEGMENT_STARTER;
1808 if(norm16<MIN_NORMAL_MAYBE_YES) {
1809 newValue|=CANON_HAS_COMPOSITIONS;
1810 }
1811 } else if(norm16<minYesNo) {
1812 newValue|=CANON_HAS_COMPOSITIONS;
1813 } else {
1814 // c has a one-way decomposition
1815 UChar32 c2=c;
1816 uint16_t norm16_2=norm16;
1817 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1818 c2=mapAlgorithmic(c2, norm16_2);
1819 norm16_2=getNorm16(c2);
1820 }
1821 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1822 // c decomposes, get everything from the variable-length extra data
1823 const uint16_t *mapping=getMapping(norm16_2);
1824 uint16_t firstUnit=*mapping++;
1825 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1826 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1827 if(c==c2 && (*mapping&0xff)!=0) {
1828 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
1829 }
1830 ++mapping;
1831 }
1832 // Skip empty mappings (no characters in the decomposition).
1833 if(length!=0) {
1834 // add c to first code point's start set
1835 int32_t i=0;
1836 U16_NEXT_UNSAFE(mapping, i, c2);
1837 newData.addToStartSet(c, c2, errorCode);
1838 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1839 // one-way mapping. A 2-way mapping is possible here after
1840 // intermediate algorithmic mapping.
1841 if(norm16_2>=minNoNo) {
1842 while(i<length) {
1843 U16_NEXT_UNSAFE(mapping, i, c2);
1844 uint32_t c2Value=utrie2_get32(newData.trie, c2);
1845 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1846 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1847 &errorCode);
1848 }
1849 }
1850 }
1851 }
1852 } else {
1853 // c decomposed to c2 algorithmically; c has cc==0
1854 newData.addToStartSet(c, c2, errorCode);
1855 }
1856 }
1857 if(newValue!=oldValue) {
1858 utrie2_set32(newData.trie, c, newValue, &errorCode);
1859 }
1860 }
1861}
1862
1863UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1864 // Logically const: Synchronized instantiation.
1865 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1866 CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
1867 return U_SUCCESS(errorCode);
1868}
1869
1870int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1871 return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
1872}
1873
1874const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1875 return *(const UnicodeSet *)(
1876 ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
1877}
1878
1879UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1880 return getCanonValue(c)>=0;
1881}
1882
1883UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1884 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1885 if(canonValue==0) {
1886 return FALSE;
1887 }
1888 set.clear();
1889 int32_t value=canonValue&CANON_VALUE_MASK;
1890 if((canonValue&CANON_HAS_SET)!=0) {
1891 set.addAll(getCanonStartSet(value));
1892 } else if(value!=0) {
1893 set.add(value);
1894 }
1895 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1896 uint16_t norm16=getNorm16(c);
1897 if(norm16==JAMO_L) {
1898 UChar32 syllable=
1899 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1900 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1901 } else {
1902 addComposites(getCompositionsList(norm16), set);
1903 }
1904 }
1905 return TRUE;
1906}
1907
claireho50294ea2010-05-03 15:44:48 -07001908U_NAMESPACE_END
1909
1910// Normalizer2 data swapping ----------------------------------------------- ***
1911
1912U_NAMESPACE_USE
1913
1914U_CAPI int32_t U_EXPORT2
1915unorm2_swap(const UDataSwapper *ds,
1916 const void *inData, int32_t length, void *outData,
1917 UErrorCode *pErrorCode) {
1918 const UDataInfo *pInfo;
1919 int32_t headerSize;
1920
1921 const uint8_t *inBytes;
1922 uint8_t *outBytes;
1923
1924 const int32_t *inIndexes;
1925 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1926
1927 int32_t i, offset, nextOffset, size;
1928
1929 /* udata_swapDataHeader checks the arguments */
1930 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1931 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1932 return 0;
1933 }
1934
1935 /* check data format and format version */
1936 pInfo=(const UDataInfo *)((const char *)inData+4);
1937 if(!(
1938 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
1939 pInfo->dataFormat[1]==0x72 &&
1940 pInfo->dataFormat[2]==0x6d &&
1941 pInfo->dataFormat[3]==0x32 &&
1942 pInfo->formatVersion[0]==1
1943 )) {
1944 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1945 pInfo->dataFormat[0], pInfo->dataFormat[1],
1946 pInfo->dataFormat[2], pInfo->dataFormat[3],
1947 pInfo->formatVersion[0]);
1948 *pErrorCode=U_UNSUPPORTED_ERROR;
1949 return 0;
1950 }
1951
1952 inBytes=(const uint8_t *)inData+headerSize;
1953 outBytes=(uint8_t *)outData+headerSize;
1954
1955 inIndexes=(const int32_t *)inBytes;
1956
1957 if(length>=0) {
1958 length-=headerSize;
claireho27f65472011-06-09 11:11:49 -07001959 if(length<(int32_t)sizeof(indexes)) {
claireho50294ea2010-05-03 15:44:48 -07001960 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
1961 length);
1962 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1963 return 0;
1964 }
1965 }
1966
1967 /* read the first few indexes */
1968 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
1969 indexes[i]=udata_readInt32(ds, inIndexes[i]);
1970 }
1971
1972 /* get the total length of the data */
1973 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1974
1975 if(length>=0) {
1976 if(length<size) {
1977 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
1978 length);
1979 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1980 return 0;
1981 }
1982
1983 /* copy the data for inaccessible bytes */
1984 if(inBytes!=outBytes) {
1985 uprv_memcpy(outBytes, inBytes, size);
1986 }
1987
1988 offset=0;
1989
1990 /* swap the int32_t indexes[] */
1991 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
1992 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
1993 offset=nextOffset;
1994
1995 /* swap the UTrie2 */
1996 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
1997 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
1998 offset=nextOffset;
1999
2000 /* swap the uint16_t extraData[] */
2001 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
2002 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2003 offset=nextOffset;
2004
2005 U_ASSERT(offset==size);
2006 }
2007
2008 return headerSize+size;
2009}
2010
2011#endif // !UCONFIG_NO_NORMALIZATION