// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*************************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2012, International Business Machines Corporation and
* others. All Rights Reserved.
*************************************************************************
*/
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| #include "unicode/uniset.h" |
| #include "unicode/unistr.h" |
| #include "unicode/chariter.h" |
| #include "unicode/schriter.h" |
| #include "unicode/uchriter.h" |
| #include "unicode/normlzr.h" |
| #include "unicode/utf16.h" |
| #include "cmemory.h" |
| #include "normalizer2impl.h" |
| #include "uprops.h" // for uniset_getUnicode32Instance() |
| |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) |
| |
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
| |
| Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : |
| UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| text(new StringCharacterIterator(str)), |
| currentIndex(0), nextIndex(0), |
| buffer(), bufferPos(0) |
| { |
| init(); |
| } |
| |
| Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : |
| UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| text(new UCharCharacterIterator(str, length)), |
| currentIndex(0), nextIndex(0), |
| buffer(), bufferPos(0) |
| { |
| init(); |
| } |
| |
| Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : |
| UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| text(iter.clone()), |
| currentIndex(0), nextIndex(0), |
| buffer(), bufferPos(0) |
| { |
| init(); |
| } |
| |
| Normalizer::Normalizer(const Normalizer ©) : |
| UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), |
| text(copy.text->clone()), |
| currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), |
| buffer(copy.buffer), bufferPos(copy.bufferPos) |
| { |
| init(); |
| } |
| |
| void |
| Normalizer::init() { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); |
| if(fOptions&UNORM_UNICODE_3_2) { |
| delete fFilteredNorm2; |
| fNorm2=fFilteredNorm2= |
| new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); |
| } |
| if(U_FAILURE(errorCode)) { |
| errorCode=U_ZERO_ERROR; |
| fNorm2=Normalizer2Factory::getNoopInstance(errorCode); |
| } |
| } |
| |
| Normalizer::~Normalizer() |
| { |
| delete fFilteredNorm2; |
| delete text; |
| } |
| |
| Normalizer* |
| Normalizer::clone() const |
| { |
| return new Normalizer(*this); |
| } |
| |
| /** |
| * Generates a hash code for this iterator. |
| */ |
| int32_t Normalizer::hashCode() const |
| { |
| return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; |
| } |
| |
| UBool Normalizer::operator==(const Normalizer& that) const |
| { |
| return |
| this==&that || |
| (fUMode==that.fUMode && |
| fOptions==that.fOptions && |
| *text==*that.text && |
| buffer==that.buffer && |
| bufferPos==that.bufferPos && |
| nextIndex==that.nextIndex); |
| } |
| |
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
| |
| void U_EXPORT2 |
| Normalizer::normalize(const UnicodeString& source, |
| UNormalizationMode mode, int32_t options, |
| UnicodeString& result, |
| UErrorCode &status) { |
| if(source.isBogus() || U_FAILURE(status)) { |
| result.setToBogus(); |
| if(U_SUCCESS(status)) { |
| status=U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| } else { |
| UnicodeString localDest; |
| UnicodeString *dest; |
| |
| if(&source!=&result) { |
| dest=&result; |
| } else { |
| // the source and result strings are the same object, use a temporary one |
| dest=&localDest; |
| } |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| if(U_SUCCESS(status)) { |
| if(options&UNORM_UNICODE_3_2) { |
| FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| normalize(source, *dest, status); |
| } else { |
| n2->normalize(source, *dest, status); |
| } |
| } |
| if(dest==&localDest && U_SUCCESS(status)) { |
| result=*dest; |
| } |
| } |
| } |
| |
| void U_EXPORT2 |
| Normalizer::compose(const UnicodeString& source, |
| UBool compat, int32_t options, |
| UnicodeString& result, |
| UErrorCode &status) { |
| normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); |
| } |
| |
| void U_EXPORT2 |
| Normalizer::decompose(const UnicodeString& source, |
| UBool compat, int32_t options, |
| UnicodeString& result, |
| UErrorCode &status) { |
| normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); |
| } |
| |
| UNormalizationCheckResult |
| Normalizer::quickCheck(const UnicodeString& source, |
| UNormalizationMode mode, int32_t options, |
| UErrorCode &status) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| if(U_SUCCESS(status)) { |
| if(options&UNORM_UNICODE_3_2) { |
| return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| quickCheck(source, status); |
| } else { |
| return n2->quickCheck(source, status); |
| } |
| } else { |
| return UNORM_MAYBE; |
| } |
| } |
| |
| UBool |
| Normalizer::isNormalized(const UnicodeString& source, |
| UNormalizationMode mode, int32_t options, |
| UErrorCode &status) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| if(U_SUCCESS(status)) { |
| if(options&UNORM_UNICODE_3_2) { |
| return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| isNormalized(source, status); |
| } else { |
| return n2->isNormalized(source, status); |
| } |
| } else { |
| return FALSE; |
| } |
| } |
| |
| UnicodeString & U_EXPORT2 |
| Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, |
| UnicodeString &result, |
| UNormalizationMode mode, int32_t options, |
| UErrorCode &errorCode) { |
| if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { |
| result.setToBogus(); |
| if(U_SUCCESS(errorCode)) { |
| errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| } else { |
| UnicodeString localDest; |
| UnicodeString *dest; |
| |
| if(&right!=&result) { |
| dest=&result; |
| } else { |
| // the right and result strings are the same object, use a temporary one |
| dest=&localDest; |
| } |
| *dest=left; |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); |
| if(U_SUCCESS(errorCode)) { |
| if(options&UNORM_UNICODE_3_2) { |
| FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). |
| append(*dest, right, errorCode); |
| } else { |
| n2->append(*dest, right, errorCode); |
| } |
| } |
| if(dest==&localDest && U_SUCCESS(errorCode)) { |
| result=*dest; |
| } |
| } |
| return result; |
| } |
| |
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
| |
| /** |
| * Return the current character in the normalized text. |
| */ |
| UChar32 Normalizer::current() { |
| if(bufferPos<buffer.length() || nextNormalize()) { |
| return buffer.char32At(bufferPos); |
| } else { |
| return DONE; |
| } |
| } |
| |
| /** |
| * Return the next character in the normalized text and advance |
| * the iteration position by one. If the end |
| * of the text has already been reached, {@link #DONE} is returned. |
| */ |
| UChar32 Normalizer::next() { |
| if(bufferPos<buffer.length() || nextNormalize()) { |
| UChar32 c=buffer.char32At(bufferPos); |
| bufferPos+=U16_LENGTH(c); |
| return c; |
| } else { |
| return DONE; |
| } |
| } |
| |
| /** |
| * Return the previous character in the normalized text and decrement |
| * the iteration position by one. If the beginning |
| * of the text has already been reached, {@link #DONE} is returned. |
| */ |
| UChar32 Normalizer::previous() { |
| if(bufferPos>0 || previousNormalize()) { |
| UChar32 c=buffer.char32At(bufferPos-1); |
| bufferPos-=U16_LENGTH(c); |
| return c; |
| } else { |
| return DONE; |
| } |
| } |
| |
| void Normalizer::reset() { |
| currentIndex=nextIndex=text->setToStart(); |
| clearBuffer(); |
| } |
| |
| void |
| Normalizer::setIndexOnly(int32_t index) { |
| text->setIndex(index); // pins index |
| currentIndex=nextIndex=text->getIndex(); |
| clearBuffer(); |
| } |
| |
| /** |
| * Return the first character in the normalized text. This resets |
| * the <tt>Normalizer's</tt> position to the beginning of the text. |
| */ |
| UChar32 Normalizer::first() { |
| reset(); |
| return next(); |
| } |
| |
| /** |
| * Return the last character in the normalized text. This resets |
| * the <tt>Normalizer's</tt> position to be just before the |
| * the input text corresponding to that normalized character. |
| */ |
| UChar32 Normalizer::last() { |
| currentIndex=nextIndex=text->setToEnd(); |
| clearBuffer(); |
| return previous(); |
| } |
| |
| /** |
| * Retrieve the current iteration position in the input text that is |
| * being normalized. This method is useful in applications such as |
| * searching, where you need to be able to determine the position in |
| * the input text that corresponds to a given normalized output character. |
| * <p> |
| * <b>Note:</b> This method sets the position in the <em>input</em>, while |
| * {@link #next} and {@link #previous} iterate through characters in the |
| * <em>output</em>. This means that there is not necessarily a one-to-one |
| * correspondence between characters returned by <tt>next</tt> and |
| * <tt>previous</tt> and the indices passed to and returned from |
| * <tt>setIndex</tt> and {@link #getIndex}. |
| * |
| */ |
| int32_t Normalizer::getIndex() const { |
| if(bufferPos<buffer.length()) { |
| return currentIndex; |
| } else { |
| return nextIndex; |
| } |
| } |
| |
| /** |
| * Retrieve the index of the start of the input text. This is the begin index |
| * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> |
| * over which this <tt>Normalizer</tt> is iterating |
| */ |
| int32_t Normalizer::startIndex() const { |
| return text->startIndex(); |
| } |
| |
| /** |
| * Retrieve the index of the end of the input text. This is the end index |
| * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> |
| * over which this <tt>Normalizer</tt> is iterating |
| */ |
| int32_t Normalizer::endIndex() const { |
| return text->endIndex(); |
| } |
| |
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
| |
| void |
| Normalizer::setMode(UNormalizationMode newMode) |
| { |
| fUMode = newMode; |
| init(); |
| } |
| |
| UNormalizationMode |
| Normalizer::getUMode() const |
| { |
| return fUMode; |
| } |
| |
| void |
| Normalizer::setOption(int32_t option, |
| UBool value) |
| { |
| if (value) { |
| fOptions |= option; |
| } else { |
| fOptions &= (~option); |
| } |
| init(); |
| } |
| |
| UBool |
| Normalizer::getOption(int32_t option) const |
| { |
| return (fOptions & option) != 0; |
| } |
| |
| /** |
| * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| * The iteration position is set to the beginning of the input text. |
| */ |
| void |
| Normalizer::setText(const UnicodeString& newText, |
| UErrorCode &status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| CharacterIterator *newIter = new StringCharacterIterator(newText); |
| if (newIter == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| delete text; |
| text = newIter; |
| reset(); |
| } |
| |
| /** |
| * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| * The iteration position is set to the beginning of the string. |
| */ |
| void |
| Normalizer::setText(const CharacterIterator& newText, |
| UErrorCode &status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| CharacterIterator *newIter = newText.clone(); |
| if (newIter == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| delete text; |
| text = newIter; |
| reset(); |
| } |
| |
| void |
| Normalizer::setText(const UChar* newText, |
| int32_t length, |
| UErrorCode &status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| CharacterIterator *newIter = new UCharCharacterIterator(newText, length); |
| if (newIter == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| delete text; |
| text = newIter; |
| reset(); |
| } |
| |
| /** |
| * Copies the text under iteration into the UnicodeString referred to by "result". |
| * @param result Receives a copy of the text under iteration. |
| */ |
| void |
| Normalizer::getText(UnicodeString& result) |
| { |
| text->getText(result); |
| } |
| |
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
| |
| void Normalizer::clearBuffer() { |
| buffer.remove(); |
| bufferPos=0; |
| } |
| |
| UBool |
| Normalizer::nextNormalize() { |
| clearBuffer(); |
| currentIndex=nextIndex; |
| text->setIndex(nextIndex); |
| if(!text->hasNext()) { |
| return FALSE; |
| } |
| // Skip at least one character so we make progress. |
| UnicodeString segment(text->next32PostInc()); |
| while(text->hasNext()) { |
| UChar32 c; |
| if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { |
| text->move32(-1, CharacterIterator::kCurrent); |
| break; |
| } |
| segment.append(c); |
| } |
| nextIndex=text->getIndex(); |
| UErrorCode errorCode=U_ZERO_ERROR; |
| fNorm2->normalize(segment, buffer, errorCode); |
| return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
| } |
| |
| UBool |
| Normalizer::previousNormalize() { |
| clearBuffer(); |
| nextIndex=currentIndex; |
| text->setIndex(currentIndex); |
| if(!text->hasPrevious()) { |
| return FALSE; |
| } |
| UnicodeString segment; |
| while(text->hasPrevious()) { |
| UChar32 c=text->previous32(); |
| segment.insert(0, c); |
| if(fNorm2->hasBoundaryBefore(c)) { |
| break; |
| } |
| } |
| currentIndex=text->getIndex(); |
| UErrorCode errorCode=U_ZERO_ERROR; |
| fNorm2->normalize(segment, buffer, errorCode); |
| bufferPos=buffer.length(); |
| return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_NORMALIZATION */ |