Blame - common/normalizer2impl.cpp - platform/external/icu

blob: 52459be71904892f35c1cb9d142facc84a915d3b [file] [log] [blame]

claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1	/*
				2	*******************************************************************************
				3	*
				4	* Copyright (C) 2009-2010, International Business Machines
				5	* Corporation and others. All Rights Reserved.
				6	*
				7	*******************************************************************************
				8	* file name: normalizer2impl.cpp
				9	* encoding: US-ASCII
				10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created on: 2009nov22
				14	* created by: Markus W. Scherer
				15	*/
				16
				17	#include "unicode/utypes.h"
				18
				19	#if !UCONFIG_NO_NORMALIZATION
				20
				21	#include "unicode/normalizer2.h"
				22	#include "unicode/udata.h"
				23	#include "unicode/ustring.h"
				24	#include "cmemory.h"
				25	#include "mutex.h"
				26	#include "normalizer2impl.h"
				27	#include "uassert.h"
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	28	#include "uhash.h"
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	29	#include "uset_imp.h"
				30	#include "utrie2.h"
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	31	#include "uvector.h"
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	32
				33	U_NAMESPACE_BEGIN
				34
				35	// ReorderingBuffer -------------------------------------------------------- ***
				36
				37	UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
				38	int32_t length=str.length();
				39	start=str.getBuffer(destCapacity);
				40	if(start==NULL) {
				41	// getBuffer() already did str.setToBogus()
				42	errorCode=U_MEMORY_ALLOCATION_ERROR;
				43	return FALSE;
				44	}
				45	limit=start+length;
				46	remainingCapacity=str.getCapacity()-length;
				47	reorderStart=start;
				48	if(start==limit) {
				49	lastCC=0;
				50	} else {
				51	setIterator();
				52	lastCC=previousCC();
				53	// Set reorderStart after the last code point with cc<=1 if there is one.
				54	if(lastCC>1) {
				55	while(previousCC()>1) {}
				56	}
				57	reorderStart=codePointLimit;
				58	}
				59	return TRUE;
				60	}
				61
				62	UBool ReorderingBuffer::equals(const UChar otherStart, const UChar otherLimit) const {
				63	int32_t length=(int32_t)(limit-start);
				64	return
				65	length==(int32_t)(otherLimit-otherStart) &&
				66	0==u_memcmp(start, otherStart, length);
				67	}
				68
				69	UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
				70	if(remainingCapacity<2 && !resize(2, errorCode)) {
				71	return FALSE;
				72	}
				73	if(lastCC<=cc \|\| cc==0) {
				74	limit[0]=U16_LEAD(c);
				75	limit[1]=U16_TRAIL(c);
				76	limit+=2;
				77	lastCC=cc;
				78	if(cc<=1) {
				79	reorderStart=limit;
				80	}
				81	} else {
				82	insert(c, cc);
				83	}
				84	remainingCapacity-=2;
				85	return TRUE;
				86	}
				87
				88	UBool ReorderingBuffer::append(const UChar *s, int32_t length,
				89	uint8_t leadCC, uint8_t trailCC,
				90	UErrorCode &errorCode) {
				91	if(length==0) {
				92	return TRUE;
				93	}
				94	if(remainingCapacity<length && !resize(length, errorCode)) {
				95	return FALSE;
				96	}
				97	remainingCapacity-=length;
				98	if(lastCC<=leadCC \|\| leadCC==0) {
				99	if(trailCC<=1) {
				100	reorderStart=limit+length;
				101	} else if(leadCC<=1) {
				102	reorderStart=limit+1; // Ok if not a code point boundary.
				103	}
				104	const UChar *sLimit=s+length;
				105	do { limit++=s++; } while(s!=sLimit);
				106	lastCC=trailCC;
				107	} else {
				108	int32_t i=0;
				109	UChar32 c;
				110	U16_NEXT(s, i, length, c);
				111	insert(c, leadCC); // insert first code point
				112	while(i<length) {
				113	U16_NEXT(s, i, length, c);
				114	if(i<length) {
				115	// s must be in NFD, otherwise we need to use getCC().
				116	leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
				117	} else {
				118	leadCC=trailCC;
				119	}
				120	append(c, leadCC, errorCode);
				121	}
				122	}
				123	return TRUE;
				124	}
				125
				126	UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
				127	int32_t cpLength=U16_LENGTH(c);
				128	if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
				129	return FALSE;
				130	}
				131	remainingCapacity-=cpLength;
				132	if(cpLength==1) {
				133	*limit++=(UChar)c;
				134	} else {
				135	limit[0]=U16_LEAD(c);
				136	limit[1]=U16_TRAIL(c);
				137	limit+=2;
				138	}
				139	lastCC=0;
				140	reorderStart=limit;
				141	return TRUE;
				142	}
				143
				144	UBool ReorderingBuffer::appendZeroCC(const UChar s, const UChar sLimit, UErrorCode &errorCode) {
				145	if(s==sLimit) {
				146	return TRUE;
				147	}
				148	int32_t length=(int32_t)(sLimit-s);
				149	if(remainingCapacity<length && !resize(length, errorCode)) {
				150	return FALSE;
				151	}
				152	u_memcpy(limit, s, length);
				153	limit+=length;
				154	remainingCapacity-=length;
				155	lastCC=0;
				156	reorderStart=limit;
				157	return TRUE;
				158	}
				159
				160	void ReorderingBuffer::remove() {
				161	reorderStart=limit=start;
				162	remainingCapacity=str.getCapacity();
				163	lastCC=0;
				164	}
				165
				166	void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
				167	if(suffixLength<(limit-start)) {
				168	limit-=suffixLength;
				169	remainingCapacity+=suffixLength;
				170	} else {
				171	limit=start;
				172	remainingCapacity=str.getCapacity();
				173	}
				174	lastCC=0;
				175	reorderStart=limit;
				176	}
				177
				178	UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
				179	int32_t reorderStartIndex=(int32_t)(reorderStart-start);
				180	int32_t length=(int32_t)(limit-start);
				181	str.releaseBuffer(length);
				182	int32_t newCapacity=length+appendLength;
				183	int32_t doubleCapacity=2*str.getCapacity();
				184	if(newCapacity<doubleCapacity) {
				185	newCapacity=doubleCapacity;
				186	}
				187	if(newCapacity<256) {
				188	newCapacity=256;
				189	}
				190	start=str.getBuffer(newCapacity);
				191	if(start==NULL) {
				192	// getBuffer() already did str.setToBogus()
				193	errorCode=U_MEMORY_ALLOCATION_ERROR;
				194	return FALSE;
				195	}
				196	reorderStart=start+reorderStartIndex;
				197	limit=start+length;
				198	remainingCapacity=str.getCapacity()-length;
				199	return TRUE;
				200	}
				201
				202	void ReorderingBuffer::skipPrevious() {
				203	codePointLimit=codePointStart;
				204	UChar c=*--codePointStart;
				205	if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
				206	--codePointStart;
				207	}
				208	}
				209
				210	uint8_t ReorderingBuffer::previousCC() {
				211	codePointLimit=codePointStart;
				212	if(reorderStart>=codePointStart) {
				213	return 0;
				214	}
				215	UChar32 c=*--codePointStart;
				216	if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
				217	return 0;
				218	}
				219
				220	UChar c2;
				221	if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
				222	--codePointStart;
				223	c=U16_GET_SUPPLEMENTARY(c2, c);
				224	}
				225	return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
				226	}
				227
				228	// Inserts c somewhere before the last character.
				229	// Requires 0<cc<lastCC which implies reorderStart<limit.
				230	void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
				231	for(setIterator(), skipPrevious(); previousCC()>cc;) {}
				232	// insert c at codePointLimit, after the character with prevCC<=cc
				233	UChar *q=limit;
				234	UChar *r=limit+=U16_LENGTH(c);
				235	do {
				236	--r=--q;
				237	} while(codePointLimit!=q);
				238	writeCodePoint(q, c);
				239	if(cc<=1) {
				240	reorderStart=r;
				241	}
				242	}
				243
				244	// Normalizer2Impl --------------------------------------------------------- ***
				245
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	246	struct CanonIterData : public UMemory {
				247	CanonIterData(UErrorCode &errorCode);
				248	~CanonIterData();
				249	void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
				250	UTrie2 *trie;
				251	UVector canonStartSets; // contains UnicodeSet *
				252	};
				253
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	254	Normalizer2Impl::~Normalizer2Impl() {
				255	udata_close(memory);
				256	utrie2_close(normTrie);
				257	UTrie2Singleton(fcdTrieSingleton).deleteInstance();
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	258	delete (CanonIterData *)canonIterDataSingleton.fInstance;
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	259	}
				260
				261	UBool U_CALLCONV
				262	Normalizer2Impl::isAcceptable(void *context,
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	263	const char * /* type /, const char /name/,
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	264	const UDataInfo *pInfo) {
				265	if(
				266	pInfo->size>=20 &&
				267	pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
				268	pInfo->charsetFamily==U_CHARSET_FAMILY &&
				269	pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
				270	pInfo->dataFormat[1]==0x72 &&
				271	pInfo->dataFormat[2]==0x6d &&
				272	pInfo->dataFormat[3]==0x32 &&
				273	pInfo->formatVersion[0]==1
				274	) {
				275	Normalizer2Impl me=(Normalizer2Impl )context;
				276	uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
				277	return TRUE;
				278	} else {
				279	return FALSE;
				280	}
				281	}
				282
				283	void
				284	Normalizer2Impl::load(const char packageName, const char name, UErrorCode &errorCode) {
				285	if(U_FAILURE(errorCode)) {
				286	return;
				287	}
				288	memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
				289	if(U_FAILURE(errorCode)) {
				290	return;
				291	}
				292	const uint8_t inBytes=(const uint8_t )udata_getMemory(memory);
				293	const int32_t inIndexes=(const int32_t )inBytes;
				294	int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
				295	if(indexesLength<=IX_MIN_MAYBE_YES) {
				296	errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
				297	return;
				298	}
				299
				300	minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
				301	minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
				302
				303	minYesNo=inIndexes[IX_MIN_YES_NO];
				304	minNoNo=inIndexes[IX_MIN_NO_NO];
				305	limitNoNo=inIndexes[IX_LIMIT_NO_NO];
				306	minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
				307
				308	int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
				309	int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
				310	normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
				311	inBytes+offset, nextOffset-offset, NULL,
				312	&errorCode);
				313	if(U_FAILURE(errorCode)) {
				314	return;
				315	}
				316
				317	offset=nextOffset;
				318	maybeYesCompositions=(const uint16_t *)(inBytes+offset);
				319	extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
				320	}
				321
				322	uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar cpStart, const UChar cpLimit) const {
				323	UChar32 c;
				324	if(cpStart==(cpLimit-1)) {
				325	c=*cpStart;
				326	} else {
				327	c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
				328	}
				329	uint16_t prevNorm16=getNorm16(c);
				330	if(prevNorm16<=minYesNo) {
				331	return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
				332	} else {
				333	return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
				334	}
				335	}
				336
				337	U_CDECL_BEGIN
				338
				339	static UBool U_CALLCONV
				340	enumPropertyStartsRange(const void context, UChar32 start, UChar32 /end/, uint32_t /value*/) {
				341	/* add the start code point to the USet */
				342	const USetAdder sa=(const USetAdder )context;
				343	sa->add(sa->set, start);
				344	return TRUE;
				345	}
				346
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	347	static uint32_t U_CALLCONV
				348	segmentStarterMapper(const void * /context/, uint32_t value) {
				349	return value&CANON_NOT_SEGMENT_STARTER;
				350	}
				351
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	352	U_CDECL_END
				353
				354	void
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	355	Normalizer2Impl::addPropertyStarts(const USetAdder sa, UErrorCode & /errorCode*/) const {
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	356	/* add the start code point of each same-value range of each trie */
				357	utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
				358
				359	/* add Hangul LV syllables and LV+1 because of skippables */
				360	for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
				361	sa->add(sa->set, c);
				362	sa->add(sa->set, c+1);
				363	}
				364	sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
				365	}
				366
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	367	void
				368	Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
				369	/* add the start code point of each same-value range of the canonical iterator data trie */
				370	if(ensureCanonIterData(errorCode)) {
				371	// currently only used for the SEGMENT_STARTER property
				372	utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
				373	segmentStarterMapper, enumPropertyStartsRange, sa);
				374	}
				375	}
				376
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	377	const UChar *
				378	Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
				379	UChar32 minNeedDataCP,
				380	ReorderingBuffer *buffer,
				381	UErrorCode &errorCode) const {
				382	// Make some effort to support NUL-terminated strings reasonably.
				383	// Take the part of the fast quick check loop that does not look up
				384	// data and check the first part of the string.
				385	// After this prefix, determine the string length to simplify the rest
				386	// of the code.
				387	const UChar *prevSrc=src;
				388	UChar c;
				389	while((c=*src++)<minNeedDataCP && c!=0) {}
				390	// Back out the last character for full processing.
				391	// Copy this prefix.
				392	if(--src!=prevSrc) {
				393	if(buffer!=NULL) {
				394	buffer->appendZeroCC(prevSrc, src, errorCode);
				395	}
				396	}
				397	return src;
				398	}
				399
				400	// Dual functionality:
				401	// buffer!=NULL: normalize
				402	// buffer==NULL: isNormalized/spanQuickCheckYes
				403	const UChar *
				404	Normalizer2Impl::decompose(const UChar src, const UChar limit,
				405	ReorderingBuffer *buffer,
				406	UErrorCode &errorCode) const {
				407	UChar32 minNoCP=minDecompNoCP;
				408	if(limit==NULL) {
				409	src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
				410	if(U_FAILURE(errorCode)) {
				411	return src;
				412	}
				413	limit=u_strchr(src, 0);
				414	}
				415
				416	const UChar *prevSrc;
				417	UChar32 c=0;
				418	uint16_t norm16=0;
				419
				420	// only for quick check
				421	const UChar *prevBoundary=src;
				422	uint8_t prevCC=0;
				423
				424	for(;;) {
				425	// count code units below the minimum or with irrelevant data for the quick check
				426	for(prevSrc=src; src!=limit;) {
				427	if( (c=*src)<minNoCP \|\|
				428	isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
				429	) {
				430	++src;
				431	} else if(!U16_IS_SURROGATE(c)) {
				432	break;
				433	} else {
				434	UChar c2;
				435	if(U16_IS_SURROGATE_LEAD(c)) {
				436	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				437	c=U16_GET_SUPPLEMENTARY(c, c2);
				438	}
				439	} else /* trail surrogate */ {
				440	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				441	--src;
				442	c=U16_GET_SUPPLEMENTARY(c2, c);
				443	}
				444	}
				445	if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
				446	src+=U16_LENGTH(c);
				447	} else {
				448	break;
				449	}
				450	}
				451	}
				452	// copy these code units all at once
				453	if(src!=prevSrc) {
				454	if(buffer!=NULL) {
				455	if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
				456	break;
				457	}
				458	} else {
				459	prevCC=0;
				460	prevBoundary=src;
				461	}
				462	}
				463	if(src==limit) {
				464	break;
				465	}
				466
				467	// Check one above-minimum, relevant code point.
				468	src+=U16_LENGTH(c);
				469	if(buffer!=NULL) {
				470	if(!decompose(c, norm16, *buffer, errorCode)) {
				471	break;
				472	}
				473	} else {
				474	if(isDecompYes(norm16)) {
				475	uint8_t cc=getCCFromYesOrMaybe(norm16);
				476	if(prevCC<=cc \|\| cc==0) {
				477	prevCC=cc;
				478	if(cc<=1) {
				479	prevBoundary=src;
				480	}
				481	continue;
				482	}
				483	}
				484	return prevBoundary; // "no" or cc out of order
				485	}
				486	}
				487	return src;
				488	}
				489
				490	// Decompose a short piece of text which is likely to contain characters that
				491	// fail the quick check loop and/or where the quick check loop's overhead
				492	// is unlikely to be amortized.
				493	// Called by the compose() and makeFCD() implementations.
				494	UBool Normalizer2Impl::decomposeShort(const UChar src, const UChar limit,
				495	ReorderingBuffer &buffer,
				496	UErrorCode &errorCode) const {
				497	while(src<limit) {
				498	UChar32 c;
				499	uint16_t norm16;
				500	UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
				501	if(!decompose(c, norm16, buffer, errorCode)) {
				502	return FALSE;
				503	}
				504	}
				505	return TRUE;
				506	}
				507
				508	UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
				509	ReorderingBuffer &buffer,
				510	UErrorCode &errorCode) const {
				511	// Only loops for 1:1 algorithmic mappings.
				512	for(;;) {
				513	// get the decomposition and the lead and trail cc's
				514	if(isDecompYes(norm16)) {
				515	// c does not decompose
				516	return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
				517	} else if(isHangul(norm16)) {
				518	// Hangul syllable: decompose algorithmically
				519	UChar jamos[3];
				520	return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
				521	} else if(isDecompNoAlgorithmic(norm16)) {
				522	c=mapAlgorithmic(c, norm16);
				523	norm16=getNorm16(c);
				524	} else {
				525	// c decomposes, get everything from the variable-length extra data
				526	const uint16_t *mapping=getMapping(norm16);
				527	uint16_t firstUnit=*mapping++;
				528	int32_t length=firstUnit&MAPPING_LENGTH_MASK;
				529	uint8_t leadCC, trailCC;
				530	trailCC=(uint8_t)(firstUnit>>8);
				531	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
				532	leadCC=(uint8_t)(*mapping++>>8);
				533	} else {
				534	leadCC=0;
				535	}
				536	return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
				537	}
				538	}
				539	}
				540
				541	const UChar *
				542	Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
				543	const UChar *decomp=NULL;
				544	uint16_t norm16;
				545	for(;;) {
				546	if(c<minDecompNoCP \|\| isDecompYes(norm16=getNorm16(c))) {
				547	// c does not decompose
				548	return decomp;
				549	} else if(isHangul(norm16)) {
				550	// Hangul syllable: decompose algorithmically
				551	length=Hangul::decompose(c, buffer);
				552	return buffer;
				553	} else if(isDecompNoAlgorithmic(norm16)) {
				554	c=mapAlgorithmic(c, norm16);
				555	decomp=buffer;
				556	length=0;
				557	U16_APPEND_UNSAFE(buffer, length, c);
				558	} else {
				559	// c decomposes, get everything from the variable-length extra data
				560	const uint16_t *mapping=getMapping(norm16);
				561	uint16_t firstUnit=*mapping++;
				562	length=firstUnit&MAPPING_LENGTH_MASK;
				563	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
				564	++mapping;
				565	}
				566	return (const UChar *)mapping;
				567	}
				568	}
				569	}
				570
				571	void Normalizer2Impl::decomposeAndAppend(const UChar src, const UChar limit,
				572	UBool doDecompose,
				573	ReorderingBuffer &buffer,
				574	UErrorCode &errorCode) const {
				575	if(doDecompose) {
				576	decompose(src, limit, &buffer, errorCode);
				577	return;
				578	}
				579	// Just merge the strings at the boundary.
				580	ForwardUTrie2StringIterator iter(normTrie, src, limit);
				581	uint8_t firstCC, prevCC, cc;
				582	firstCC=prevCC=cc=getCC(iter.next16());
				583	while(cc!=0) {
				584	prevCC=cc;
				585	cc=getCC(iter.next16());
				586	};
				587	buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
				588	buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
				589	}
				590
				591	// Note: hasDecompBoundary() could be implemented as aliases to
				592	// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
				593	// at the cost of building the FCD trie for a decomposition normalizer.
				594	UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
				595	for(;;) {
				596	if(c<minDecompNoCP) {
				597	return TRUE;
				598	}
				599	uint16_t norm16=getNorm16(c);
				600	if(isHangul(norm16) \|\| isDecompYesAndZeroCC(norm16)) {
				601	return TRUE;
				602	} else if(norm16>MIN_NORMAL_MAYBE_YES) {
				603	return FALSE; // ccc!=0
				604	} else if(isDecompNoAlgorithmic(norm16)) {
				605	c=mapAlgorithmic(c, norm16);
				606	} else {
				607	// c decomposes, get everything from the variable-length extra data
				608	const uint16_t *mapping=getMapping(norm16);
				609	uint16_t firstUnit=*mapping++;
				610	if((firstUnit&MAPPING_LENGTH_MASK)==0) {
				611	return FALSE;
				612	}
				613	if(!before) {
				614	// decomp after-boundary: same as hasFCDBoundaryAfter(),
				615	// fcd16<=1 \|\| trailCC==0
				616	if(firstUnit>0x1ff) {
				617	return FALSE; // trailCC>1
				618	}
				619	if(firstUnit<=0xff) {
				620	return TRUE; // trailCC==0
				621	}
				622	// if(trailCC==1) test leadCC==0, same as checking for before-boundary
				623	}
				624	// TRUE if leadCC==0 (hasFCDBoundaryBefore())
				625	return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 \|\| (*mapping&0xff00)==0;
				626	}
				627	}
				628	}
				629
				630	/*
				631	* Finds the recomposition result for
				632	* a forward-combining "lead" character,
				633	* specified with a pointer to its compositions list,
				634	* and a backward-combining "trail" character.
				635	*
				636	* If the lead and trail characters combine, then this function returns
				637	* the following "compositeAndFwd" value:
				638	* Bits 21..1 composite character
				639	* Bit 0 set if the composite is a forward-combining starter
				640	* otherwise it returns -1.
				641	*
				642	* The compositions list has (trail, compositeAndFwd) pair entries,
				643	* encoded as either pairs or triples of 16-bit units.
				644	* The last entry has the high bit of its first unit set.
				645	*
				646	* The list is sorted by ascending trail characters (there are no duplicates).
				647	* A linear search is used.
				648	*
				649	* See normalizer2impl.h for a more detailed description
				650	* of the compositions list format.
				651	*/
				652	int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
				653	uint16_t key1, firstUnit;
				654	if(trail<COMP_1_TRAIL_LIMIT) {
				655	// trail character is 0..33FF
				656	// result entry may have 2 or 3 units
				657	key1=(uint16_t)(trail<<1);
				658	while(key1>(firstUnit=*list)) {
				659	list+=2+(firstUnit&COMP_1_TRIPLE);
				660	}
				661	if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
				662	if(firstUnit&COMP_1_TRIPLE) {
				663	return ((int32_t)list[1]<<16)\|list[2];
				664	} else {
				665	return list[1];
				666	}
				667	}
				668	} else {
				669	// trail character is 3400..10FFFF
				670	// result entry has 3 units
				671	key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	672	(((trail>>COMP_1_TRAIL_SHIFT))&
				673	~COMP_1_TRIPLE));
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	674	uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
				675	uint16_t secondUnit;
				676	for(;;) {
				677	if(key1>(firstUnit=*list)) {
				678	list+=2+(firstUnit&COMP_1_TRIPLE);
				679	} else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
				680	if(key2>(secondUnit=list[1])) {
				681	if(firstUnit&COMP_1_LAST_TUPLE) {
				682	break;
				683	} else {
				684	list+=3;
				685	}
				686	} else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
				687	return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)\|list[2];
				688	} else {
				689	break;
				690	}
				691	} else {
				692	break;
				693	}
				694	}
				695	}
				696	return -1;
				697	}
				698
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	699	/**
				700	* @param list some character's compositions list
				701	* @param set recursively receives the composites from these compositions
				702	*/
				703	void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
				704	uint16_t firstUnit;
				705	int32_t compositeAndFwd;
				706	do {
				707	firstUnit=*list;
				708	if((firstUnit&COMP_1_TRIPLE)==0) {
				709	compositeAndFwd=list[1];
				710	list+=2;
				711	} else {
				712	compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)\|list[2];
				713	list+=3;
				714	}
				715	UChar32 composite=compositeAndFwd>>1;
				716	if((compositeAndFwd&1)!=0) {
				717	addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
				718	}
				719	set.add(composite);
				720	} while((firstUnit&COMP_1_LAST_TUPLE)==0);
				721	}
				722
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	723	/*
				724	* Recomposes the buffer text starting at recomposeStartIndex
				725	* (which is in NFD - decomposed and canonically ordered),
				726	* and truncates the buffer contents.
				727	*
				728	* Note that recomposition never lengthens the text:
				729	* Any character consists of either one or two code units;
				730	* a composition may contain at most one more code unit than the original starter,
				731	* while the combining mark that is removed has at least one code unit.
				732	*/
				733	void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
				734	UBool onlyContiguous) const {
				735	UChar *p=buffer.getStart()+recomposeStartIndex;
				736	UChar *limit=buffer.getLimit();
				737	if(p==limit) {
				738	return;
				739	}
				740
				741	UChar starter, pRemove, q, r;
				742	const uint16_t *compositionsList;
				743	UChar32 c, compositeAndFwd;
				744	uint16_t norm16;
				745	uint8_t cc, prevCC;
				746	UBool starterIsSupplementary;
				747
				748	// Some of the following variables are not used until we have a forward-combining starter
				749	// and are only initialized now to avoid compiler warnings.
				750	compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
				751	starter=NULL;
				752	starterIsSupplementary=FALSE;
				753	prevCC=0;
				754
				755	for(;;) {
				756	UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
				757	cc=getCCFromYesOrMaybe(norm16);
				758	if( // this character combines backward and
				759	isMaybe(norm16) &&
				760	// we have seen a starter that combines forward and
				761	compositionsList!=NULL &&
				762	// the backward-combining character is not blocked
				763	(prevCC<cc \|\| prevCC==0)
				764	) {
				765	if(isJamoVT(norm16)) {
				766	// c is a Jamo V/T, see if we can compose it with the previous character.
				767	if(c<Hangul::JAMO_T_BASE) {
				768	// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
				769	UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
				770	if(prev<Hangul::JAMO_L_COUNT) {
				771	pRemove=p-1;
				772	UChar syllable=(UChar)
				773	(Hangul::HANGUL_BASE+
				774	(prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))
				775	Hangul::JAMO_T_COUNT);
				776	UChar t;
				777	if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
				778	++p;
				779	syllable+=t; // The next character was a Jamo T.
				780	}
				781	*starter=syllable;
				782	// remove the Jamo V/T
				783	q=pRemove;
				784	r=p;
				785	while(r<limit) {
				786	q++=r++;
				787	}
				788	limit=q;
				789	p=pRemove;
				790	}
				791	}
				792	/*
				793	* No "else" for Jamo T:
				794	* Since the input is in NFD, there are no Hangul LV syllables that
				795	* a Jamo T could combine with.
				796	* All Jamo Ts are combined above when handling Jamo Vs.
				797	*/
				798	if(p==limit) {
				799	break;
				800	}
				801	compositionsList=NULL;
				802	continue;
				803	} else if((compositeAndFwd=combine(compositionsList, c))>=0) {
				804	// The starter and the combining mark (c) do combine.
				805	UChar32 composite=compositeAndFwd>>1;
				806
				807	// Replace the starter with the composite, remove the combining mark.
				808	pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
				809	if(starterIsSupplementary) {
				810	if(U_IS_SUPPLEMENTARY(composite)) {
				811	// both are supplementary
				812	starter[0]=U16_LEAD(composite);
				813	starter[1]=U16_TRAIL(composite);
				814	} else {
				815	*starter=(UChar)composite;
				816	// The composite is shorter than the starter,
				817	// move the intermediate characters forward one.
				818	starterIsSupplementary=FALSE;
				819	q=starter+1;
				820	r=q+1;
				821	while(r<pRemove) {
				822	q++=r++;
				823	}
				824	--pRemove;
				825	}
				826	} else if(U_IS_SUPPLEMENTARY(composite)) {
				827	// The composite is longer than the starter,
				828	// move the intermediate characters back one.
				829	starterIsSupplementary=TRUE;
				830	++starter; // temporarily increment for the loop boundary
				831	q=pRemove;
				832	r=++pRemove;
				833	while(starter<q) {
				834	--r=--q;
				835	}
				836	*starter=U16_TRAIL(composite);
				837	*--starter=U16_LEAD(composite); // undo the temporary increment
				838	} else {
				839	// both are on the BMP
				840	*starter=(UChar)composite;
				841	}
				842
				843	/* remove the combining mark by moving the following text over it */
				844	if(pRemove<p) {
				845	q=pRemove;
				846	r=p;
				847	while(r<limit) {
				848	q++=r++;
				849	}
				850	limit=q;
				851	p=pRemove;
				852	}
				853	// Keep prevCC because we removed the combining mark.
				854
				855	if(p==limit) {
				856	break;
				857	}
				858	// Is the composite a starter that combines forward?
				859	if(compositeAndFwd&1) {
				860	compositionsList=
				861	getCompositionsListForComposite(getNorm16(composite));
				862	} else {
				863	compositionsList=NULL;
				864	}
				865
				866	// We combined; continue with looking for compositions.
				867	continue;
				868	}
				869	}
				870
				871	// no combination this time
				872	prevCC=cc;
				873	if(p==limit) {
				874	break;
				875	}
				876
				877	// If c did not combine, then check if it is a starter.
				878	if(cc==0) {
				879	// Found a new starter.
				880	if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
				881	// It may combine with something, prepare for it.
				882	if(U_IS_BMP(c)) {
				883	starterIsSupplementary=FALSE;
				884	starter=p-1;
				885	} else {
				886	starterIsSupplementary=TRUE;
				887	starter=p-2;
				888	}
				889	}
				890	} else if(onlyContiguous) {
				891	// FCC: no discontiguous compositions; any intervening character blocks.
				892	compositionsList=NULL;
				893	}
				894	}
				895	buffer.setReorderingLimit(limit);
				896	}
				897
				898	// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
				899	// doCompose: normalize
				900	// !doCompose: isNormalized (buffer must be empty and initialized)
				901	UBool
				902	Normalizer2Impl::compose(const UChar src, const UChar limit,
				903	UBool onlyContiguous,
				904	UBool doCompose,
				905	ReorderingBuffer &buffer,
				906	UErrorCode &errorCode) const {
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	907	/*
				908	* prevBoundary points to the last character before the current one
				909	* that has a composition boundary before it with ccc==0 and quick check "yes".
				910	* Keeping track of prevBoundary saves us looking for a composition boundary
				911	* when we find a "no" or "maybe".
				912	*
				913	* When we back out from prevSrc back to prevBoundary,
				914	* then we also remove those same characters (which had been simply copied
				915	* or canonically-order-inserted) from the ReorderingBuffer.
				916	* Therefore, at all times, the [prevBoundary..prevSrc[ source units
				917	* must correspond 1:1 to destination units at the end of the destination buffer.
				918	*/
				919	const UChar *prevBoundary=src;
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	920	UChar32 minNoMaybeCP=minCompNoMaybeCP;
				921	if(limit==NULL) {
				922	src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
				923	doCompose ? &buffer : NULL,
				924	errorCode);
				925	if(U_FAILURE(errorCode)) {
				926	return FALSE;
				927	}
				928	if(prevBoundary<src) {
				929	// Set prevBoundary to the last character in the prefix.
				930	prevBoundary=src-1;
				931	}
				932	limit=u_strchr(src, 0);
				933	}
				934
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	935	const UChar *prevSrc;
				936	UChar32 c=0;
				937	uint16_t norm16=0;
				938
				939	// only for isNormalized
				940	uint8_t prevCC=0;
				941
				942	for(;;) {
				943	// count code units below the minimum or with irrelevant data for the quick check
				944	for(prevSrc=src; src!=limit;) {
				945	if( (c=*src)<minNoMaybeCP \|\|
				946	isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
				947	) {
				948	++src;
				949	} else if(!U16_IS_SURROGATE(c)) {
				950	break;
				951	} else {
				952	UChar c2;
				953	if(U16_IS_SURROGATE_LEAD(c)) {
				954	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				955	c=U16_GET_SUPPLEMENTARY(c, c2);
				956	}
				957	} else /* trail surrogate */ {
				958	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				959	--src;
				960	c=U16_GET_SUPPLEMENTARY(c2, c);
				961	}
				962	}
				963	if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
				964	src+=U16_LENGTH(c);
				965	} else {
				966	break;
				967	}
				968	}
				969	}
				970	// copy these code units all at once
				971	if(src!=prevSrc) {
				972	if(doCompose) {
				973	if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
				974	break;
				975	}
				976	} else {
				977	prevCC=0;
				978	}
				979	if(src==limit) {
				980	break;
				981	}
				982	// Set prevBoundary to the last character in the quick check loop.
				983	prevBoundary=src-1;
				984	if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
				985	U16_IS_LEAD(*(prevBoundary-1))
				986	) {
				987	--prevBoundary;
				988	}
				989	// The start of the current character (c).
				990	prevSrc=src;
				991	} else if(src==limit) {
				992	break;
				993	}
				994
				995	src+=U16_LENGTH(c);
				996	/*
				997	* isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
				998	* c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
				999	* or has ccc!=0.
				1000	* Check for Jamo V/T, then for regular characters.
				1001	* c is not a Hangul syllable or Jamo L because those have "yes" properties.
				1002	*/
				1003	if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
				1004	UChar prev=*(prevSrc-1);
				1005	UBool needToDecompose=FALSE;
				1006	if(c<Hangul::JAMO_T_BASE) {
				1007	// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
				1008	prev=(UChar)(prev-Hangul::JAMO_L_BASE);
				1009	if(prev<Hangul::JAMO_L_COUNT) {
				1010	if(!doCompose) {
				1011	return FALSE;
				1012	}
				1013	UChar syllable=(UChar)
				1014	(Hangul::HANGUL_BASE+
				1015	(prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))
				1016	Hangul::JAMO_T_COUNT);
				1017	UChar t;
				1018	if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
				1019	++src;
				1020	syllable+=t; // The next character was a Jamo T.
				1021	prevBoundary=src;
				1022	buffer.setLastChar(syllable);
				1023	continue;
				1024	}
				1025	// If we see L+V+x where x!=T then we drop to the slow path,
				1026	// decompose and recompose.
				1027	// This is to deal with NFKC finding normal L and V but a
				1028	// compatibility variant of a T. We need to either fully compose that
				1029	// combination here (which would complicate the code and may not work
				1030	// with strange custom data) or use the slow path -- or else our replacing
				1031	// two input characters (L+V) with one output character (LV syllable)
				1032	// would violate the invariant that [prevBoundary..prevSrc[ has the same
				1033	// length as what we appended to the buffer since prevBoundary.
				1034	needToDecompose=TRUE;
				1035	}
				1036	} else if(Hangul::isHangulWithoutJamoT(prev)) {
				1037	// c is a Jamo Trailing consonant,
				1038	// compose with previous Hangul LV that does not contain a Jamo T.
				1039	if(!doCompose) {
				1040	return FALSE;
				1041	}
				1042	buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
				1043	prevBoundary=src;
				1044	continue;
				1045	}
				1046	if(!needToDecompose) {
				1047	// The Jamo V/T did not compose into a Hangul syllable.
				1048	if(doCompose) {
				1049	if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
				1050	break;
				1051	}
				1052	} else {
				1053	prevCC=0;
				1054	}
				1055	continue;
				1056	}
				1057	}
				1058	/*
				1059	* Source buffer pointers:
				1060	*
				1061	* all done quick check current char not yet
				1062	* "yes" but (c) processed
				1063	* may combine
				1064	* forward
				1065	* [-------------[-------------[-------------[-------------[
				1066	* \| \| \| \| \|
				1067	* orig. src prevBoundary prevSrc src limit
				1068	*
				1069	*
				1070	* Destination buffer pointers inside the ReorderingBuffer:
				1071	*
				1072	* all done might take not filled yet
				1073	* characters for
				1074	* reordering
				1075	* [-------------[-------------[-------------[
				1076	* \| \| \| \|
				1077	* start reorderStart limit \|
				1078	* +remainingCap.+
				1079	*/
				1080	if(norm16>=MIN_YES_YES_WITH_CC) {
				1081	uint8_t cc=(uint8_t)norm16; // cc!=0
				1082	if( onlyContiguous && // FCC
				1083	(doCompose ? buffer.getLastCC() : prevCC)==0 &&
				1084	prevBoundary<prevSrc &&
				1085	// buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
				1086	// [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
				1087	// passed the quick check "yes && ccc==0" test.
				1088	// Check whether the last character was a "yesYes" or a "yesNo".
				1089	// If a "yesNo", then we get its trailing ccc from its
				1090	// mapping and check for canonical order.
				1091	// All other cases are ok.
				1092	getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
				1093	) {
				1094	// Fails FCD test, need to decompose and contiguously recompose.
				1095	if(!doCompose) {
				1096	return FALSE;
				1097	}
				1098	} else if(doCompose) {
				1099	if(!buffer.append(c, cc, errorCode)) {
				1100	break;
				1101	}
				1102	continue;
				1103	} else if(prevCC<=cc) {
				1104	prevCC=cc;
				1105	continue;
				1106	} else {
				1107	return FALSE;
				1108	}
				1109	} else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
				1110	return FALSE;
				1111	}
				1112
				1113	/*
				1114	* Find appropriate boundaries around this character,
				1115	* decompose the source text from between the boundaries,
				1116	* and recompose it.
				1117	*
				1118	* We may need to remove the last few characters from the ReorderingBuffer
				1119	* to account for source text that was copied or appended
				1120	* but needs to take part in the recomposition.
				1121	*/
				1122
				1123	/*
				1124	* Find the last composition boundary in [prevBoundary..src[.
				1125	* It is either the decomposition of the current character (at prevSrc),
				1126	* or prevBoundary.
				1127	*/
				1128	if(hasCompBoundaryBefore(c, norm16)) {
				1129	prevBoundary=prevSrc;
				1130	} else if(doCompose) {
				1131	buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
				1132	}
				1133
				1134	// Find the next composition boundary in [src..limit[ -
				1135	// modifies src to point to the next starter.
				1136	src=(UChar *)findNextCompBoundary(src, limit);
				1137
				1138	// Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
				1139	int32_t recomposeStartIndex=buffer.length();
				1140	if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
				1141	break;
				1142	}
				1143	recompose(buffer, recomposeStartIndex, onlyContiguous);
				1144	if(!doCompose) {
				1145	if(!buffer.equals(prevBoundary, src)) {
				1146	return FALSE;
				1147	}
				1148	buffer.remove();
				1149	prevCC=0;
				1150	}
				1151
				1152	// Move to the next starter. We never need to look back before this point again.
				1153	prevBoundary=src;
				1154	}
				1155	return TRUE;
				1156	}
				1157
				1158	// Very similar to compose(): Make the same changes in both places if relevant.
				1159	// pQCResult==NULL: spanQuickCheckYes
				1160	// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
				1161	const UChar *
				1162	Normalizer2Impl::composeQuickCheck(const UChar src, const UChar limit,
				1163	UBool onlyContiguous,
				1164	UNormalizationCheckResult *pQCResult) const {
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1165	/*
				1166	* prevBoundary points to the last character before the current one
				1167	* that has a composition boundary before it with ccc==0 and quick check "yes".
				1168	*/
				1169	const UChar *prevBoundary=src;
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	1170	UChar32 minNoMaybeCP=minCompNoMaybeCP;
				1171	if(limit==NULL) {
				1172	UErrorCode errorCode=U_ZERO_ERROR;
				1173	src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
				1174	if(prevBoundary<src) {
				1175	// Set prevBoundary to the last character in the prefix.
				1176	prevBoundary=src-1;
				1177	}
				1178	limit=u_strchr(src, 0);
				1179	}
				1180
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1181	const UChar *prevSrc;
				1182	UChar32 c=0;
				1183	uint16_t norm16=0;
				1184	uint8_t prevCC=0;
				1185
				1186	for(;;) {
				1187	// count code units below the minimum or with irrelevant data for the quick check
				1188	for(prevSrc=src;;) {
				1189	if(src==limit) {
				1190	return src;
				1191	}
				1192	if( (c=*src)<minNoMaybeCP \|\|
				1193	isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
				1194	) {
				1195	++src;
				1196	} else if(!U16_IS_SURROGATE(c)) {
				1197	break;
				1198	} else {
				1199	UChar c2;
				1200	if(U16_IS_SURROGATE_LEAD(c)) {
				1201	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				1202	c=U16_GET_SUPPLEMENTARY(c, c2);
				1203	}
				1204	} else /* trail surrogate */ {
				1205	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				1206	--src;
				1207	c=U16_GET_SUPPLEMENTARY(c2, c);
				1208	}
				1209	}
				1210	if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
				1211	src+=U16_LENGTH(c);
				1212	} else {
				1213	break;
				1214	}
				1215	}
				1216	}
				1217	if(src!=prevSrc) {
				1218	// Set prevBoundary to the last character in the quick check loop.
				1219	prevBoundary=src-1;
				1220	if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
				1221	U16_IS_LEAD(*(prevBoundary-1))
				1222	) {
				1223	--prevBoundary;
				1224	}
				1225	prevCC=0;
				1226	// The start of the current character (c).
				1227	prevSrc=src;
				1228	}
				1229
				1230	src+=U16_LENGTH(c);
				1231	/*
				1232	* isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
				1233	* c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
				1234	* or has ccc!=0.
				1235	*/
				1236	if(isMaybeOrNonZeroCC(norm16)) {
				1237	uint8_t cc=getCCFromYesOrMaybe(norm16);
				1238	if( onlyContiguous && // FCC
				1239	cc!=0 &&
				1240	prevCC==0 &&
				1241	prevBoundary<prevSrc &&
				1242	// prevCC==0 && prevBoundary<prevSrc tell us that
				1243	// [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
				1244	// passed the quick check "yes && ccc==0" test.
				1245	// Check whether the last character was a "yesYes" or a "yesNo".
				1246	// If a "yesNo", then we get its trailing ccc from its
				1247	// mapping and check for canonical order.
				1248	// All other cases are ok.
				1249	getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
				1250	) {
				1251	// Fails FCD test.
				1252	} else if(prevCC<=cc \|\| cc==0) {
				1253	prevCC=cc;
				1254	if(norm16<MIN_YES_YES_WITH_CC) {
				1255	if(pQCResult!=NULL) {
				1256	*pQCResult=UNORM_MAYBE;
				1257	} else {
				1258	return prevBoundary;
				1259	}
				1260	}
				1261	continue;
				1262	}
				1263	}
				1264	if(pQCResult!=NULL) {
				1265	*pQCResult=UNORM_NO;
				1266	}
				1267	return prevBoundary;
				1268	}
				1269	}
				1270
				1271	void Normalizer2Impl::composeAndAppend(const UChar src, const UChar limit,
				1272	UBool doCompose,
				1273	UBool onlyContiguous,
				1274	ReorderingBuffer &buffer,
				1275	UErrorCode &errorCode) const {
				1276	if(!buffer.isEmpty()) {
				1277	const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
				1278	if(src!=firstStarterInSrc) {
				1279	const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
				1280	buffer.getLimit());
				1281	UnicodeString middle(lastStarterInDest,
				1282	(int32_t)(buffer.getLimit()-lastStarterInDest));
				1283	buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
				1284	middle.append(src, (int32_t)(firstStarterInSrc-src));
				1285	const UChar *middleStart=middle.getBuffer();
				1286	compose(middleStart, middleStart+middle.length(), onlyContiguous,
				1287	TRUE, buffer, errorCode);
				1288	if(U_FAILURE(errorCode)) {
				1289	return;
				1290	}
				1291	src=firstStarterInSrc;
				1292	}
				1293	}
				1294	if(doCompose) {
				1295	compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
				1296	} else {
				1297	buffer.appendZeroCC(src, limit, errorCode);
				1298	}
				1299	}
				1300
				1301	/**
				1302	* Does c have a composition boundary before it?
				1303	* True if its decomposition begins with a character that has
				1304	* ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
				1305	* As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
				1306	* (isCompYesAndZeroCC()) so we need not decompose.
				1307	*/
				1308	UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
				1309	for(;;) {
				1310	if(isCompYesAndZeroCC(norm16)) {
				1311	return TRUE;
				1312	} else if(isMaybeOrNonZeroCC(norm16)) {
				1313	return FALSE;
				1314	} else if(isDecompNoAlgorithmic(norm16)) {
				1315	c=mapAlgorithmic(c, norm16);
				1316	norm16=getNorm16(c);
				1317	} else {
				1318	// c decomposes, get everything from the variable-length extra data
				1319	const uint16_t *mapping=getMapping(norm16);
				1320	uint16_t firstUnit=*mapping++;
				1321	if((firstUnit&MAPPING_LENGTH_MASK)==0) {
				1322	return FALSE;
				1323	}
				1324	if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
				1325	return FALSE; // non-zero leadCC
				1326	}
				1327	int32_t i=0;
				1328	UChar32 c;
				1329	U16_NEXT_UNSAFE(mapping, i, c);
				1330	return isCompYesAndZeroCC(getNorm16(c));
				1331	}
				1332	}
				1333	}
				1334
				1335	UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
				1336	for(;;) {
				1337	uint16_t norm16=getNorm16(c);
				1338	if(isInert(norm16)) {
				1339	return TRUE;
				1340	} else if(norm16<=minYesNo) {
				1341	// Hangul LVT (==minYesNo) has a boundary after it.
				1342	// Hangul LV and non-inert yesYes characters combine forward.
				1343	return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
				1344	} else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
				1345	return FALSE;
				1346	} else if(isDecompNoAlgorithmic(norm16)) {
				1347	c=mapAlgorithmic(c, norm16);
				1348	} else {
				1349	// c decomposes, get everything from the variable-length extra data.
				1350	// If testInert, then c must be a yesNo character which has lccc=0,
				1351	// otherwise it could be a noNo.
				1352	const uint16_t *mapping=getMapping(norm16);
				1353	uint16_t firstUnit=*mapping;
				1354	// TRUE if
				1355	// c is not deleted, and
				1356	// it and its decomposition do not combine forward, and it has a starter, and
				1357	// if FCC then trailCC<=1
				1358	return
				1359	(firstUnit&MAPPING_LENGTH_MASK)!=0 &&
				1360	(firstUnit&(MAPPING_PLUS_COMPOSITION_LIST\|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
				1361	(!onlyContiguous \|\| firstUnit<=0x1ff);
				1362	}
				1363	}
				1364	}
				1365
				1366	const UChar Normalizer2Impl::findPreviousCompBoundary(const UChar start, const UChar *p) const {
				1367	BackwardUTrie2StringIterator iter(normTrie, start, p);
				1368	uint16_t norm16;
				1369	do {
				1370	norm16=iter.previous16();
				1371	} while(!hasCompBoundaryBefore(iter.codePoint, norm16));
				1372	// We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
				1373	// but that's probably not worth the extra cost.
				1374	return iter.codePointStart;
				1375	}
				1376
				1377	const UChar Normalizer2Impl::findNextCompBoundary(const UChar p, const UChar *limit) const {
				1378	ForwardUTrie2StringIterator iter(normTrie, p, limit);
				1379	uint16_t norm16;
				1380	do {
				1381	norm16=iter.next16();
				1382	} while(!hasCompBoundaryBefore(iter.codePoint, norm16));
				1383	return iter.codePointStart;
				1384	}
				1385
				1386	class FCDTrieSingleton : public UTrie2Singleton {
				1387	public:
				1388	FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
				1389	UTrie2Singleton(s), impl(ni), errorCode(ec) {}
				1390	UTrie2 *getInstance(UErrorCode &errorCode) {
				1391	return UTrie2Singleton::getInstance(createInstance, this, errorCode);
				1392	}
				1393	static void createInstance(const void context, UErrorCode &errorCode);
				1394	UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
				1395	if(value!=0) {
				1396	impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
				1397	}
				1398	return U_SUCCESS(errorCode);
				1399	}
				1400
				1401	Normalizer2Impl &impl;
				1402	UTrie2 *newFCDTrie;
				1403	UErrorCode &errorCode;
				1404	};
				1405
				1406	U_CDECL_BEGIN
				1407
				1408	// Set the FCD value for a range of same-norm16 characters.
				1409	static UBool U_CALLCONV
				1410	enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
				1411	return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
				1412	}
				1413
				1414	// Collect (OR together) the FCD values for a range of supplementary characters,
				1415	// for their lead surrogate code unit.
				1416	static UBool U_CALLCONV
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	1417	enumRangeOrValue(const void context, UChar32 /start/, UChar32 /end*/, uint32_t value) {
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1418	((uint32_t )context)\|=value;
				1419	return TRUE;
				1420	}
				1421
				1422	U_CDECL_END
				1423
				1424	void FCDTrieSingleton::createInstance(const void context, UErrorCode &errorCode) {
				1425	FCDTrieSingleton me=(FCDTrieSingleton )context;
				1426	me->newFCDTrie=utrie2_open(0, 0, &errorCode);
				1427	if(U_SUCCESS(errorCode)) {
				1428	utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
				1429	for(UChar lead=0xd800; lead<0xdc00; ++lead) {
				1430	uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
				1431	utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
				1432	if(oredValue!=0) {
				1433	// Set a "bad" value for makeFCD() to break the quick check loop
				1434	// and look up the value for the supplementary code point.
				1435	// If there is any lccc, then set the worst-case lccc of 1.
				1436	// The ORed-together value's tccc is already the worst case.
				1437	if(oredValue>0xff) {
				1438	oredValue=0x100\|(oredValue&0xff);
				1439	}
				1440	utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
				1441	}
				1442	}
				1443	utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
				1444	if(U_SUCCESS(errorCode)) {
				1445	return me->newFCDTrie;
				1446	}
				1447	}
				1448	utrie2_close(me->newFCDTrie);
				1449	return NULL;
				1450	}
				1451
				1452	void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
				1453	UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
				1454	// Only loops for 1:1 algorithmic mappings.
				1455	for(;;) {
				1456	if(norm16>=MIN_NORMAL_MAYBE_YES) {
				1457	norm16&=0xff;
				1458	norm16\|=norm16<<8;
				1459	} else if(norm16<=minYesNo \|\| minMaybeYes<=norm16) {
				1460	// no decomposition or Hangul syllable, all zeros
				1461	break;
				1462	} else if(limitNoNo<=norm16) {
				1463	int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
				1464	if(start==end) {
				1465	start+=delta;
				1466	norm16=getNorm16(start);
				1467	} else {
				1468	// the same delta leads from different original characters to different mappings
				1469	do {
				1470	UChar32 c=start+delta;
				1471	setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
				1472	} while(++start<=end);
				1473	break;
				1474	}
				1475	} else {
				1476	// c decomposes, get everything from the variable-length extra data
				1477	const uint16_t *mapping=getMapping(norm16);
				1478	uint16_t firstUnit=*mapping;
				1479	if((firstUnit&MAPPING_LENGTH_MASK)==0) {
				1480	// A character that is deleted (maps to an empty string) must
				1481	// get the worst-case lccc and tccc values because arbitrary
				1482	// characters on both sides will become adjacent.
				1483	norm16=0x1ff;
				1484	} else {
				1485	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
				1486	norm16=mapping[1]&0xff00; // lccc
				1487	} else {
				1488	norm16=0;
				1489	}
				1490	norm16\|=firstUnit>>8; // tccc
				1491	}
				1492	}
				1493	utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
				1494	break;
				1495	}
				1496	}
				1497
				1498	const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
				1499	// Logically const: Synchronized instantiation.
				1500	Normalizer2Impl me=const_cast<Normalizer2Impl >(this);
				1501	return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
				1502	}
				1503
				1504	// Dual functionality:
				1505	// buffer!=NULL: normalize
				1506	// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
				1507	const UChar *
				1508	Normalizer2Impl::makeFCD(const UChar src, const UChar limit,
				1509	ReorderingBuffer *buffer,
				1510	UErrorCode &errorCode) const {
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	1511	// Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
				1512	// Similar to the prevBoundary in the compose() implementation.
				1513	const UChar *prevBoundary=src;
				1514	int32_t prevFCD16=0;
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1515	if(limit==NULL) {
				1516	src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
				1517	if(U_FAILURE(errorCode)) {
				1518	return src;
				1519	}
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	1520	if(prevBoundary<src) {
				1521	prevBoundary=src;
				1522	// We know that the previous character's lccc==0.
				1523	// Fetching the fcd16 value was deferred for this below-U+0300 code point.
				1524	prevFCD16=getFCD16FromSingleLead(*(src-1));
				1525	if(prevFCD16>1) {
				1526	--prevBoundary;
				1527	}
				1528	}
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1529	limit=u_strchr(src, 0);
				1530	}
				1531
				1532	// Note: In this function we use buffer->appendZeroCC() because we track
				1533	// the lead and trail combining classes here, rather than leaving it to
				1534	// the ReorderingBuffer.
				1535	// The exception is the call to decomposeShort() which uses the buffer
				1536	// in the normal way.
				1537
				1538	const UTrie2 *trie=fcdTrie();
				1539
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1540	const UChar *prevSrc;
				1541	UChar32 c=0;
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1542	uint16_t fcd16=0;
				1543
				1544	for(;;) {
				1545	// count code units with lccc==0
				1546	for(prevSrc=src; src!=limit;) {
				1547	if((c=*src)<MIN_CCC_LCCC_CP) {
				1548	prevFCD16=~c;
				1549	++src;
				1550	} else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
				1551	prevFCD16=fcd16;
				1552	++src;
				1553	} else if(!U16_IS_SURROGATE(c)) {
				1554	break;
				1555	} else {
				1556	UChar c2;
				1557	if(U16_IS_SURROGATE_LEAD(c)) {
				1558	if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
				1559	c=U16_GET_SUPPLEMENTARY(c, c2);
				1560	}
				1561	} else /* trail surrogate */ {
				1562	if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
				1563	--src;
				1564	c=U16_GET_SUPPLEMENTARY(c2, c);
				1565	}
				1566	}
				1567	if((fcd16=getFCD16(c))<=0xff) {
				1568	prevFCD16=fcd16;
				1569	src+=U16_LENGTH(c);
				1570	} else {
				1571	break;
				1572	}
				1573	}
				1574	}
				1575	// copy these code units all at once
				1576	if(src!=prevSrc) {
				1577	if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
				1578	break;
				1579	}
				1580	if(src==limit) {
				1581	break;
				1582	}
				1583	prevBoundary=src;
				1584	// We know that the previous character's lccc==0.
				1585	if(prevFCD16<0) {
				1586	// Fetching the fcd16 value was deferred for this below-U+0300 code point.
				1587	prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
				1588	if(prevFCD16>1) {
				1589	--prevBoundary;
				1590	}
				1591	} else {
				1592	const UChar *p=src-1;
				1593	if(U16_IS_TRAIL(p) && prevSrc<p && U16_IS_LEAD((p-1))) {
				1594	--p;
				1595	// Need to fetch the previous character's FCD value because
				1596	// prevFCD16 was just for the trail surrogate code point.
				1597	prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
				1598	// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
				1599	}
				1600	if(prevFCD16>1) {
				1601	prevBoundary=p;
				1602	}
				1603	}
				1604	// The start of the current character (c).
				1605	prevSrc=src;
				1606	} else if(src==limit) {
				1607	break;
				1608	}
				1609
				1610	src+=U16_LENGTH(c);
				1611	// The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
				1612	// Check for proper order, and decompose locally if necessary.
				1613	if((prevFCD16&0xff)<=(fcd16>>8)) {
				1614	// proper order: prev tccc <= current lccc
				1615	if((fcd16&0xff)<=1) {
				1616	prevBoundary=src;
				1617	}
				1618	if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
				1619	break;
				1620	}
				1621	prevFCD16=fcd16;
				1622	continue;
				1623	} else if(buffer==NULL) {
				1624	return prevBoundary; // quick check "no"
				1625	} else {
				1626	/*
				1627	* Back out the part of the source that we copied or appended
				1628	* already but is now going to be decomposed.
				1629	* prevSrc is set to after what was copied/appended.
				1630	*/
				1631	buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
				1632	/*
				1633	* Find the part of the source that needs to be decomposed,
				1634	* up to the next safe boundary.
				1635	*/
				1636	src=findNextFCDBoundary(src, limit);
				1637	/*
				1638	* The source text does not fulfill the conditions for FCD.
				1639	* Decompose and reorder a limited piece of the text.
				1640	*/
				1641	if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
				1642	break;
				1643	}
				1644	prevBoundary=src;
				1645	prevFCD16=0;
				1646	}
				1647	}
				1648	return src;
				1649	}
				1650
				1651	void Normalizer2Impl::makeFCDAndAppend(const UChar src, const UChar limit,
				1652	UBool doMakeFCD,
				1653	ReorderingBuffer &buffer,
				1654	UErrorCode &errorCode) const {
				1655	if(!buffer.isEmpty()) {
				1656	const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
				1657	if(src!=firstBoundaryInSrc) {
				1658	const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
				1659	buffer.getLimit());
				1660	UnicodeString middle(lastBoundaryInDest,
				1661	(int32_t)(buffer.getLimit()-lastBoundaryInDest));
				1662	buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
				1663	middle.append(src, (int32_t)(firstBoundaryInSrc-src));
				1664	const UChar *middleStart=middle.getBuffer();
				1665	makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
				1666	if(U_FAILURE(errorCode)) {
				1667	return;
				1668	}
				1669	src=firstBoundaryInSrc;
				1670	}
				1671	}
				1672	if(doMakeFCD) {
				1673	makeFCD(src, limit, &buffer, errorCode);
				1674	} else {
				1675	buffer.appendZeroCC(src, limit, errorCode);
				1676	}
				1677	}
				1678
				1679	const UChar Normalizer2Impl::findPreviousFCDBoundary(const UChar start, const UChar *p) const {
				1680	BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
				1681	uint16_t fcd16;
				1682	do {
				1683	fcd16=iter.previous16();
				1684	} while(fcd16>0xff);
				1685	return iter.codePointStart;
				1686	}
				1687
				1688	const UChar Normalizer2Impl::findNextFCDBoundary(const UChar p, const UChar *limit) const {
				1689	ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
				1690	uint16_t fcd16;
				1691	do {
				1692	fcd16=iter.next16();
				1693	} while(fcd16>0xff);
				1694	return iter.codePointStart;
				1695	}
				1696
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	1697	// CanonicalIterator data -------------------------------------------------- ***
				1698
				1699	CanonIterData::CanonIterData(UErrorCode &errorCode) :
				1700	trie(utrie2_open(0, 0, &errorCode)),
				1701	canonStartSets(uhash_deleteUObject, NULL, errorCode) {}
				1702
				1703	CanonIterData::~CanonIterData() {
				1704	utrie2_close(trie);
				1705	}
				1706
				1707	void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
				1708	uint32_t canonValue=utrie2_get32(trie, decompLead);
				1709	if((canonValue&(CANON_HAS_SET\|CANON_VALUE_MASK))==0 && origin!=0) {
				1710	// origin is the first character whose decomposition starts with
				1711	// the character for which we are setting the value.
				1712	utrie2_set32(trie, decompLead, canonValue\|origin, &errorCode);
				1713	} else {
				1714	// origin is not the first character, or it is U+0000.
				1715	UnicodeSet *set;
				1716	if((canonValue&CANON_HAS_SET)==0) {
				1717	set=new UnicodeSet;
				1718	if(set==NULL) {
				1719	errorCode=U_MEMORY_ALLOCATION_ERROR;
				1720	return;
				1721	}
				1722	UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
				1723	canonValue=(canonValue&~CANON_VALUE_MASK)\|CANON_HAS_SET\|(uint32_t)canonStartSets.size();
				1724	utrie2_set32(trie, decompLead, canonValue, &errorCode);
				1725	canonStartSets.addElement(set, errorCode);
				1726	if(firstOrigin!=0) {
				1727	set->add(firstOrigin);
				1728	}
				1729	} else {
				1730	set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
				1731	}
				1732	set->add(origin);
				1733	}
				1734	}
				1735
				1736	class CanonIterDataSingleton {
				1737	public:
				1738	CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
				1739	singleton(s), impl(ni), errorCode(ec) {}
				1740	CanonIterData *getInstance(UErrorCode &errorCode) {
				1741	void *duplicate;
				1742	CanonIterData *instance=
				1743	(CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
				1744	delete (CanonIterData *)duplicate;
				1745	return instance;
				1746	}
				1747	static void createInstance(const void context, UErrorCode &errorCode);
				1748	UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
				1749	if(value!=0) {
				1750	impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
				1751	}
				1752	return U_SUCCESS(errorCode);
				1753	}
				1754
				1755	private:
				1756	SimpleSingleton &singleton;
				1757	Normalizer2Impl &impl;
				1758	CanonIterData *newData;
				1759	UErrorCode &errorCode;
				1760	};
				1761
				1762	U_CDECL_BEGIN
				1763
				1764	// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
				1765	static UBool U_CALLCONV
				1766	enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
				1767	return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
				1768	}
				1769
				1770	U_CDECL_END
				1771
				1772	void CanonIterDataSingleton::createInstance(const void context, UErrorCode &errorCode) {
				1773	CanonIterDataSingleton me=(CanonIterDataSingleton )context;
				1774	me->newData=new CanonIterData(errorCode);
				1775	if(me->newData==NULL) {
				1776	errorCode=U_MEMORY_ALLOCATION_ERROR;
				1777	return NULL;
				1778	}
				1779	if(U_SUCCESS(errorCode)) {
				1780	utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
				1781	utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
				1782	if(U_SUCCESS(errorCode)) {
				1783	return me->newData;
				1784	}
				1785	}
				1786	delete me->newData;
				1787	return NULL;
				1788	}
				1789
				1790	void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
				1791	CanonIterData &newData,
				1792	UErrorCode &errorCode) const {
				1793	if(norm16==0 \|\| (minYesNo<=norm16 && norm16<minNoNo)) {
				1794	// Inert, or 2-way mapping (including Hangul syllable).
				1795	// We do not write a canonStartSet for any yesNo character.
				1796	// Composites from 2-way mappings are added at runtime from the
				1797	// starter's compositions list, and the other characters in
				1798	// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
				1799	// "maybe" characters.
				1800	return;
				1801	}
				1802	for(UChar32 c=start; c<=end; ++c) {
				1803	uint32_t oldValue=utrie2_get32(newData.trie, c);
				1804	uint32_t newValue=oldValue;
				1805	if(norm16>=minMaybeYes) {
				1806	// not a segment starter if it occurs in a decomposition or has cc!=0
				1807	newValue\|=CANON_NOT_SEGMENT_STARTER;
				1808	if(norm16<MIN_NORMAL_MAYBE_YES) {
				1809	newValue\|=CANON_HAS_COMPOSITIONS;
				1810	}
				1811	} else if(norm16<minYesNo) {
				1812	newValue\|=CANON_HAS_COMPOSITIONS;
				1813	} else {
				1814	// c has a one-way decomposition
				1815	UChar32 c2=c;
				1816	uint16_t norm16_2=norm16;
				1817	while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
				1818	c2=mapAlgorithmic(c2, norm16_2);
				1819	norm16_2=getNorm16(c2);
				1820	}
				1821	if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
				1822	// c decomposes, get everything from the variable-length extra data
				1823	const uint16_t *mapping=getMapping(norm16_2);
				1824	uint16_t firstUnit=*mapping++;
				1825	int32_t length=firstUnit&MAPPING_LENGTH_MASK;
				1826	if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
				1827	if(c==c2 && (*mapping&0xff)!=0) {
				1828	newValue\|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
				1829	}
				1830	++mapping;
				1831	}
				1832	// Skip empty mappings (no characters in the decomposition).
				1833	if(length!=0) {
				1834	// add c to first code point's start set
				1835	int32_t i=0;
				1836	U16_NEXT_UNSAFE(mapping, i, c2);
				1837	newData.addToStartSet(c, c2, errorCode);
				1838	// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
				1839	// one-way mapping. A 2-way mapping is possible here after
				1840	// intermediate algorithmic mapping.
				1841	if(norm16_2>=minNoNo) {
				1842	while(i<length) {
				1843	U16_NEXT_UNSAFE(mapping, i, c2);
				1844	uint32_t c2Value=utrie2_get32(newData.trie, c2);
				1845	if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
				1846	utrie2_set32(newData.trie, c2, c2Value\|CANON_NOT_SEGMENT_STARTER,
				1847	&errorCode);
				1848	}
				1849	}
				1850	}
				1851	}
				1852	} else {
				1853	// c decomposed to c2 algorithmically; c has cc==0
				1854	newData.addToStartSet(c, c2, errorCode);
				1855	}
				1856	}
				1857	if(newValue!=oldValue) {
				1858	utrie2_set32(newData.trie, c, newValue, &errorCode);
				1859	}
				1860	}
				1861	}
				1862
				1863	UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
				1864	// Logically const: Synchronized instantiation.
				1865	Normalizer2Impl me=const_cast<Normalizer2Impl >(this);
				1866	CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
				1867	return U_SUCCESS(errorCode);
				1868	}
				1869
				1870	int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
				1871	return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
				1872	}
				1873
				1874	const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
				1875	return (const UnicodeSet )(
				1876	((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
				1877	}
				1878
				1879	UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
				1880	return getCanonValue(c)>=0;
				1881	}
				1882
				1883	UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
				1884	int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
				1885	if(canonValue==0) {
				1886	return FALSE;
				1887	}
				1888	set.clear();
				1889	int32_t value=canonValue&CANON_VALUE_MASK;
				1890	if((canonValue&CANON_HAS_SET)!=0) {
				1891	set.addAll(getCanonStartSet(value));
				1892	} else if(value!=0) {
				1893	set.add(value);
				1894	}
				1895	if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
				1896	uint16_t norm16=getNorm16(c);
				1897	if(norm16==JAMO_L) {
				1898	UChar32 syllable=
				1899	(UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
				1900	set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
				1901	} else {
				1902	addComposites(getCompositionsList(norm16), set);
				1903	}
				1904	}
				1905	return TRUE;
				1906	}
				1907
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1908	U_NAMESPACE_END
				1909
				1910	// Normalizer2 data swapping ----------------------------------------------- ***
				1911
				1912	U_NAMESPACE_USE
				1913
				1914	U_CAPI int32_t U_EXPORT2
				1915	unorm2_swap(const UDataSwapper *ds,
				1916	const void inData, int32_t length, void outData,
				1917	UErrorCode *pErrorCode) {
				1918	const UDataInfo *pInfo;
				1919	int32_t headerSize;
				1920
				1921	const uint8_t *inBytes;
				1922	uint8_t *outBytes;
				1923
				1924	const int32_t *inIndexes;
				1925	int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
				1926
				1927	int32_t i, offset, nextOffset, size;
				1928
				1929	/* udata_swapDataHeader checks the arguments */
				1930	headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
				1931	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
				1932	return 0;
				1933	}
				1934
				1935	/* check data format and format version */
				1936	pInfo=(const UDataInfo )((const char )inData+4);
				1937	if(!(
				1938	pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
				1939	pInfo->dataFormat[1]==0x72 &&
				1940	pInfo->dataFormat[2]==0x6d &&
				1941	pInfo->dataFormat[3]==0x32 &&
				1942	pInfo->formatVersion[0]==1
				1943	)) {
				1944	udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
				1945	pInfo->dataFormat[0], pInfo->dataFormat[1],
				1946	pInfo->dataFormat[2], pInfo->dataFormat[3],
				1947	pInfo->formatVersion[0]);
				1948	*pErrorCode=U_UNSUPPORTED_ERROR;
				1949	return 0;
				1950	}
				1951
				1952	inBytes=(const uint8_t *)inData+headerSize;
				1953	outBytes=(uint8_t *)outData+headerSize;
				1954
				1955	inIndexes=(const int32_t *)inBytes;
				1956
				1957	if(length>=0) {
				1958	length-=headerSize;
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame^]	1959	if(length<(int32_t)sizeof(indexes)) {
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1960	udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
				1961	length);
				1962	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				1963	return 0;
				1964	}
				1965	}
				1966
				1967	/* read the first few indexes */
				1968	for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
				1969	indexes[i]=udata_readInt32(ds, inIndexes[i]);
				1970	}
				1971
				1972	/* get the total length of the data */
				1973	size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
				1974
				1975	if(length>=0) {
				1976	if(length<size) {
				1977	udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
				1978	length);
				1979	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
				1980	return 0;
				1981	}
				1982
				1983	/* copy the data for inaccessible bytes */
				1984	if(inBytes!=outBytes) {
				1985	uprv_memcpy(outBytes, inBytes, size);
				1986	}
				1987
				1988	offset=0;
				1989
				1990	/* swap the int32_t indexes[] */
				1991	nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
				1992	ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
				1993	offset=nextOffset;
				1994
				1995	/* swap the UTrie2 */
				1996	nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
				1997	utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
				1998	offset=nextOffset;
				1999
				2000	/* swap the uint16_t extraData[] */
				2001	nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
				2002	ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
				2003	offset=nextOffset;
				2004
				2005	U_ASSERT(offset==size);
				2006	}
				2007
				2008	return headerSize+size;
				2009	}
				2010
				2011	#endif // !UCONFIG_NO_NORMALIZATION