| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2009-2014, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/alphaindex.h" |
| #include "unicode/coll.h" |
| #include "unicode/localpointer.h" |
| #include "unicode/normalizer2.h" |
| #include "unicode/tblcoll.h" |
| #include "unicode/uchar.h" |
| #include "unicode/ulocdata.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uobject.h" |
| #include "unicode/usetiter.h" |
| #include "unicode/utf16.h" |
| |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "uassert.h" |
| #include "uvector.h" |
| #include "uvectr64.h" |
| |
| //#include <string> |
| //#include <iostream> |
| |
| U_NAMESPACE_BEGIN |
| |
| namespace { |
| |
| /** |
| * Prefix string for Chinese index buckets. |
| * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes |
| */ |
| const UChar BASE[1] = { 0xFDD0 }; |
| const int32_t BASE_LENGTH = 1; |
| |
| UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, |
| const UnicodeString &one, const UnicodeString &other); |
| |
| } // namespace |
| |
| static int32_t U_CALLCONV |
| collatorComparator(const void *context, const void *left, const void *right); |
| |
| static int32_t U_CALLCONV |
| recordCompareFn(const void *context, const void *left, const void *right); |
| |
| // UVector<Record *> support function, delete a Record. |
| static void U_CALLCONV |
| alphaIndex_deleteRecord(void *obj) { |
| delete static_cast<AlphabeticIndex::Record *>(obj); |
| } |
| |
| namespace { |
| |
| UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, |
| UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return NULL; } |
| if (owned.isValid()) { |
| return owned.orphan(); |
| } |
| UnicodeString *p = new UnicodeString(s); |
| if (p == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| } |
| return p; |
| } |
| |
| inline UnicodeString *getString(const UVector &list, int32_t i) { |
| return static_cast<UnicodeString *>(list[i]); |
| } |
| |
| inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { |
| return static_cast<AlphabeticIndex::Bucket *>(list[i]); |
| } |
| |
| inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { |
| return static_cast<AlphabeticIndex::Record *>(list[i]); |
| } |
| |
| /** |
| * Like Java Collections.binarySearch(List, String, Comparator). |
| * |
| * @return the index>=0 where the item was found, |
| * or the index<0 for inserting the string at ~index in sorted order |
| */ |
| int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { |
| if (list.size() == 0) { return ~0; } |
| int32_t start = 0; |
| int32_t limit = list.size(); |
| for (;;) { |
| int32_t i = (start + limit) / 2; |
| const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); |
| UErrorCode errorCode = U_ZERO_ERROR; |
| UCollationResult cmp = coll.compare(s, *si, errorCode); |
| if (cmp == UCOL_EQUAL) { |
| return i; |
| } else if (cmp < 0) { |
| if (i == start) { |
| return ~start; // insert s before *si |
| } |
| limit = i; |
| } else { |
| if (i == start) { |
| return ~(start + 1); // insert s after *si |
| } |
| start = i; |
| } |
| } |
| } |
| |
| } // namespace |
| |
| // The BucketList is not in the anonymous namespace because only Clang |
| // seems to support its use in other classes from there. |
| // However, we also don't need U_I18N_API because it is not used from outside the i18n library. |
| class BucketList : public UObject { |
| public: |
| BucketList(UVector *bucketList, UVector *publicBucketList) |
| : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { |
| int32_t displayIndex = 0; |
| for (int32_t i = 0; i < publicBucketList->size(); ++i) { |
| getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; |
| } |
| } |
| |
| // The virtual destructor must not be inline. |
| // See ticket #8454 for details. |
| virtual ~BucketList(); |
| |
| int32_t getBucketCount() const { |
| return immutableVisibleList_->size(); |
| } |
| |
| int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, |
| UErrorCode &errorCode) { |
| // binary search |
| int32_t start = 0; |
| int32_t limit = bucketList_->size(); |
| while ((start + 1) < limit) { |
| int32_t i = (start + limit) / 2; |
| const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); |
| UCollationResult nameVsBucket = |
| collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); |
| if (nameVsBucket < 0) { |
| limit = i; |
| } else { |
| start = i; |
| } |
| } |
| const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); |
| if (bucket->displayBucket_ != NULL) { |
| bucket = bucket->displayBucket_; |
| } |
| return bucket->displayIndex_; |
| } |
| |
| /** All of the buckets, visible and invisible. */ |
| UVector *bucketList_; |
| /** Just the visible buckets. */ |
| UVector *immutableVisibleList_; |
| }; |
| |
| BucketList::~BucketList() { |
| delete bucketList_; |
| if (immutableVisibleList_ != bucketList_) { |
| delete immutableVisibleList_; |
| } |
| } |
| |
| AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { |
| delete buckets_; |
| delete collatorPrimaryOnly_; |
| } |
| |
| int32_t |
| AlphabeticIndex::ImmutableIndex::getBucketCount() const { |
| return buckets_->getBucketCount(); |
| } |
| |
| int32_t |
| AlphabeticIndex::ImmutableIndex::getBucketIndex( |
| const UnicodeString &name, UErrorCode &errorCode) const { |
| return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); |
| } |
| |
| const AlphabeticIndex::Bucket * |
| AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { |
| if (0 <= index && index < buckets_->getBucketCount()) { |
| return icu::getBucket(*buckets_->immutableVisibleList_, index); |
| } else { |
| return NULL; |
| } |
| } |
| |
| AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) |
| : inputList_(NULL), |
| labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), |
| maxLabelCount_(99), |
| initialLabels_(NULL), firstCharsInScripts_(NULL), |
| collator_(NULL), collatorPrimaryOnly_(NULL), |
| buckets_(NULL) { |
| init(&locale, status); |
| } |
| |
| |
| AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) |
| : inputList_(NULL), |
| labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), |
| maxLabelCount_(99), |
| initialLabels_(NULL), firstCharsInScripts_(NULL), |
| collator_(collator), collatorPrimaryOnly_(NULL), |
| buckets_(NULL) { |
| init(NULL, status); |
| } |
| |
| |
| |
| AlphabeticIndex::~AlphabeticIndex() { |
| delete collator_; |
| delete collatorPrimaryOnly_; |
| delete firstCharsInScripts_; |
| delete buckets_; |
| delete inputList_; |
| delete initialLabels_; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| initialLabels_->addAll(additions); |
| clearBuckets(); |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { |
| addIndexExemplars(locale, status); |
| clearBuckets(); |
| return *this; |
| } |
| |
| |
| AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return NULL; } |
| // In C++, the ImmutableIndex must own its copy of the BucketList, |
| // even if it contains no records, for proper memory management. |
| // We could clone the buckets_ if they are not NULL, |
| // but that would be worth it only if this method is called multiple times, |
| // or called after using the old-style bucket iterator API. |
| LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); |
| LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone()); |
| if (immutableBucketList.isNull() || coll.isNull()) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); |
| if (immIndex == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| // The ImmutableIndex adopted its parameter objects. |
| immutableBucketList.orphan(); |
| coll.orphan(); |
| return immIndex; |
| } |
| |
| int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { |
| initBuckets(status); |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| return buckets_->getBucketCount(); |
| } |
| |
| |
| int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { |
| if (U_FAILURE(status) || inputList_ == NULL) { |
| return 0; |
| } |
| return inputList_->size(); |
| } |
| |
| void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { |
| const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); |
| if (U_FAILURE(errorCode)) { return; } |
| |
| const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); |
| const UnicodeString &overflowBoundary = |
| *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); |
| |
| // We make a sorted array of elements. |
| // Some of the input may be redundant. |
| // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". |
| // We filter out those cases. |
| UnicodeSetIterator iter(*initialLabels_); |
| while (iter.next()) { |
| const UnicodeString *item = &iter.getString(); |
| LocalPointer<UnicodeString> ownedItem; |
| UBool checkDistinct; |
| int32_t itemLength = item->length(); |
| if (!item->hasMoreChar32Than(0, itemLength, 1)) { |
| checkDistinct = FALSE; |
| } else if(item->charAt(itemLength - 1) == 0x2a && // '*' |
| item->charAt(itemLength - 2) != 0x2a) { |
| // Use a label if it is marked with one trailing star, |
| // even if the label string sorts the same when all contractions are suppressed. |
| ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); |
| item = ownedItem.getAlias(); |
| if (item == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| checkDistinct = FALSE; |
| } else { |
| checkDistinct = TRUE; |
| } |
| if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { |
| // Ignore a primary-ignorable or non-alphabetic index character. |
| } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { |
| // Ignore an index character that will land in the overflow bucket. |
| } else if (checkDistinct && |
| collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { |
| // Ignore a multi-code point index character that does not sort distinctly |
| // from the sequence of its separate characters. |
| } else { |
| int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); |
| if (insertionPoint < 0) { |
| indexCharacters.insertElementAt( |
| ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); |
| } else { |
| const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); |
| if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { |
| indexCharacters.setElementAt( |
| ownedString(*item, ownedItem, errorCode), insertionPoint); |
| } |
| } |
| } |
| } |
| if (U_FAILURE(errorCode)) { return; } |
| |
| // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element |
| |
| int32_t size = indexCharacters.size() - 1; |
| if (size > maxLabelCount_) { |
| int32_t count = 0; |
| int32_t old = -1; |
| for (int32_t i = 0; i < indexCharacters.size();) { |
| ++count; |
| int32_t bump = count * maxLabelCount_ / size; |
| if (bump == old) { |
| indexCharacters.removeElementAt(i); |
| } else { |
| old = bump; |
| ++i; |
| } |
| } |
| } |
| } |
| |
| namespace { |
| |
| const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { |
| if (!current.startsWith(BASE, BASE_LENGTH)) { |
| return current; |
| } |
| UChar rest = current.charAt(BASE_LENGTH); |
| if (0x2800 < rest && rest <= 0x28FF) { // stroke count |
| int32_t count = rest-0x2800; |
| temp.setTo((UChar)(0x30 + count % 10)); |
| if (count >= 10) { |
| count /= 10; |
| temp.insert(0, (UChar)(0x30 + count % 10)); |
| if (count >= 10) { |
| count /= 10; |
| temp.insert(0, (UChar)(0x30 + count)); |
| } |
| } |
| return temp.append((UChar)0x5283); |
| } |
| return temp.setTo(current, BASE_LENGTH); |
| } |
| |
| UBool hasMultiplePrimaryWeights( |
| const RuleBasedCollator &coll, uint32_t variableTop, |
| const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { |
| ces.removeAllElements(); |
| coll.internalGetCEs(s, ces, errorCode); |
| if (U_FAILURE(errorCode)) { return FALSE; } |
| UBool seenPrimary = FALSE; |
| for (int32_t i = 0; i < ces.size(); ++i) { |
| int64_t ce = ces.elementAti(i); |
| uint32_t p = (uint32_t)(ce >> 32); |
| if (p > variableTop) { |
| // not primary ignorable |
| if (seenPrimary) { |
| return TRUE; |
| } |
| seenPrimary = TRUE; |
| } |
| } |
| return FALSE; |
| } |
| |
| } // namespace |
| |
| BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
| // Initialize indexCharacters. |
| UVector indexCharacters(errorCode); |
| indexCharacters.setDeleter(uprv_deleteUObject); |
| initLabels(indexCharacters, errorCode); |
| if (U_FAILURE(errorCode)) { return NULL; } |
| |
| // Variables for hasMultiplePrimaryWeights(). |
| UVector64 ces(errorCode); |
| uint32_t variableTop; |
| if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { |
| variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); |
| } else { |
| variableTop = 0; |
| } |
| UBool hasInvisibleBuckets = FALSE; |
| |
| // Helper arrays for Chinese Pinyin collation. |
| Bucket *asciiBuckets[26] = { |
| NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| }; |
| Bucket *pinyinBuckets[26] = { |
| NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| }; |
| UBool hasPinyin = FALSE; |
| |
| LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode); |
| if (U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| bucketList->setDeleter(uprv_deleteUObject); |
| |
| // underflow bucket |
| Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); |
| if (bucket == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList->addElement(bucket, errorCode); |
| if (U_FAILURE(errorCode)) { return NULL; } |
| |
| UnicodeString temp; |
| |
| // fix up the list, adding underflow, additions, overflow |
| // Insert inflow labels as needed. |
| int32_t scriptIndex = -1; |
| const UnicodeString *scriptUpperBoundary = &emptyString_; |
| for (int32_t i = 0; i < indexCharacters.size(); ++i) { |
| UnicodeString ¤t = *getString(indexCharacters, i); |
| if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { |
| // We crossed the script boundary into a new script. |
| const UnicodeString &inflowBoundary = *scriptUpperBoundary; |
| UBool skippedScript = FALSE; |
| for (;;) { |
| scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); |
| if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { |
| break; |
| } |
| skippedScript = TRUE; |
| } |
| if (skippedScript && bucketList->size() > 1) { |
| // We are skipping one or more scripts, |
| // and we are not just getting out of the underflow label. |
| bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); |
| if (bucket == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList->addElement(bucket, errorCode); |
| } |
| } |
| // Add a bucket with the current label. |
| bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); |
| if (bucket == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList->addElement(bucket, errorCode); |
| // Remember ASCII and Pinyin buckets for Pinyin redirects. |
| UChar c; |
| if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z |
| asciiBuckets[c - 0x41] = bucket; |
| } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && |
| 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { |
| pinyinBuckets[c - 0x41] = bucket; |
| hasPinyin = TRUE; |
| } |
| // Check for multiple primary weights. |
| if (!current.startsWith(BASE, BASE_LENGTH) && |
| hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, |
| ces, errorCode) && |
| current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { |
| // "AE-ligature" or "Sch" etc. |
| for (int32_t j = bucketList->size() - 2;; --j) { |
| Bucket *singleBucket = getBucket(*bucketList, j); |
| if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
| // There is no single-character bucket since the last |
| // underflow or inflow label. |
| break; |
| } |
| if (singleBucket->displayBucket_ == NULL && |
| !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, |
| singleBucket->lowerBoundary_, |
| ces, errorCode)) { |
| // Add an invisible bucket that redirects strings greater than the expansion |
| // to the previous single-character bucket. |
| // For example, after ... Q R S Sch we add Sch\uFFFF->S |
| // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. |
| bucket = new Bucket(emptyString_, |
| UnicodeString(current).append((UChar)0xFFFF), |
| U_ALPHAINDEX_NORMAL); |
| if (bucket == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucket->displayBucket_ = singleBucket; |
| bucketList->addElement(bucket, errorCode); |
| hasInvisibleBuckets = TRUE; |
| break; |
| } |
| } |
| } |
| } |
| if (U_FAILURE(errorCode)) { return NULL; } |
| if (bucketList->size() == 1) { |
| // No real labels, show only the underflow label. |
| BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); |
| if (bl == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList.orphan(); |
| return bl; |
| } |
| // overflow bucket |
| bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); |
| if (bucket == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList->addElement(bucket, errorCode); // final |
| |
| if (hasPinyin) { |
| // Redirect Pinyin buckets. |
| Bucket *asciiBucket = NULL; |
| for (int32_t i = 0; i < 26; ++i) { |
| if (asciiBuckets[i] != NULL) { |
| asciiBucket = asciiBuckets[i]; |
| } |
| if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { |
| pinyinBuckets[i]->displayBucket_ = asciiBucket; |
| hasInvisibleBuckets = TRUE; |
| } |
| } |
| } |
| |
| if (U_FAILURE(errorCode)) { return NULL; } |
| if (!hasInvisibleBuckets) { |
| BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); |
| if (bl == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList.orphan(); |
| return bl; |
| } |
| // Merge inflow buckets that are visually adjacent. |
| // Iterate backwards: Merge inflow into overflow rather than the other way around. |
| int32_t i = bucketList->size() - 1; |
| Bucket *nextBucket = getBucket(*bucketList, i); |
| while (--i > 0) { |
| bucket = getBucket(*bucketList, i); |
| if (bucket->displayBucket_ != NULL) { |
| continue; // skip invisible buckets |
| } |
| if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { |
| if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
| bucket->displayBucket_ = nextBucket; |
| continue; |
| } |
| } |
| nextBucket = bucket; |
| } |
| |
| LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode); |
| if (U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| // Do not call publicBucketList->setDeleter(): |
| // This vector shares its objects with the bucketList. |
| for (int32_t j = 0; j < bucketList->size(); ++j) { |
| bucket = getBucket(*bucketList, j); |
| if (bucket->displayBucket_ == NULL) { |
| publicBucketList->addElement(bucket, errorCode); |
| } |
| } |
| if (U_FAILURE(errorCode)) { return NULL; } |
| BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); |
| if (bl == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| bucketList.orphan(); |
| publicBucketList.orphan(); |
| return bl; |
| } |
| |
| /** |
| * Creates an index, and buckets and sorts the list of records into the index. |
| */ |
| void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode) || buckets_ != NULL) { |
| return; |
| } |
| buckets_ = createBucketList(errorCode); |
| if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { |
| return; |
| } |
| |
| // Sort the records by name. |
| // Stable sort preserves input order of collation duplicates. |
| inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); |
| |
| // Now, we traverse all of the input, which is now sorted. |
| // If the item doesn't go in the current bucket, we find the next bucket that contains it. |
| // This makes the process order n*log(n), since we just sort the list and then do a linear process. |
| // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so |
| // we need to improve it for that case. |
| |
| Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); |
| int32_t bucketIndex = 1; |
| Bucket *nextBucket; |
| const UnicodeString *upperBoundary; |
| if (bucketIndex < buckets_->bucketList_->size()) { |
| nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); |
| upperBoundary = &nextBucket->lowerBoundary_; |
| } else { |
| nextBucket = NULL; |
| upperBoundary = NULL; |
| } |
| for (int32_t i = 0; i < inputList_->size(); ++i) { |
| Record *r = getRecord(*inputList_, i); |
| // if the current bucket isn't the right one, find the one that is |
| // We have a special flag for the last bucket so that we don't look any further |
| while (upperBoundary != NULL && |
| collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { |
| currentBucket = nextBucket; |
| // now reset the boundary that we compare against |
| if (bucketIndex < buckets_->bucketList_->size()) { |
| nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); |
| upperBoundary = &nextBucket->lowerBoundary_; |
| } else { |
| upperBoundary = NULL; |
| } |
| } |
| // now put the record into the bucket. |
| Bucket *bucket = currentBucket; |
| if (bucket->displayBucket_ != NULL) { |
| bucket = bucket->displayBucket_; |
| } |
| if (bucket->records_ == NULL) { |
| bucket->records_ = new UVector(errorCode); |
| if (bucket->records_ == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| } |
| bucket->records_->addElement(r, errorCode); |
| } |
| } |
| |
| void AlphabeticIndex::clearBuckets() { |
| if (buckets_ != NULL) { |
| delete buckets_; |
| buckets_ = NULL; |
| internalResetBucketIterator(); |
| } |
| } |
| |
| void AlphabeticIndex::internalResetBucketIterator() { |
| labelsIterIndex_ = -1; |
| currentBucket_ = NULL; |
| } |
| |
| |
| void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { |
| LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| UnicodeSet exemplars; |
| ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); |
| if (U_SUCCESS(status)) { |
| initialLabels_->addAll(exemplars); |
| return; |
| } |
| status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR |
| |
| // The locale data did not include explicit Index characters. |
| // Synthesize a set of them from the locale's standard exemplar characters. |
| ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // question: should we add auxiliary exemplars? |
| if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) { |
| exemplars.add(0x61, 0x7A); |
| } |
| if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables |
| // cut down to small list |
| exemplars.remove(0xAC00, 0xD7A3). |
| add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). |
| add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). |
| add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). |
| add(0xD30C).add(0xD558); |
| } |
| if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block |
| // cut down to small list |
| // make use of the fact that Ethiopic is allocated in 8's, where |
| // the base is 0 mod 8. |
| UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status); |
| ethiopic.retainAll(exemplars); |
| exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic); |
| } |
| |
| // Upper-case any that aren't already so. |
| // (We only do this for synthesized index characters.) |
| UnicodeSetIterator it(exemplars); |
| UnicodeString upperC; |
| while (it.next()) { |
| const UnicodeString &exemplarC = it.getString(); |
| upperC = exemplarC; |
| upperC.toUpper(locale); |
| initialLabels_->add(upperC); |
| } |
| } |
| |
| UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { |
| UnicodeSet contractions; |
| collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); |
| if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; } |
| initialLabels_->addAll(contractions); |
| UnicodeSetIterator iter(contractions); |
| while (iter.next()) { |
| const UnicodeString &s = iter.getString(); |
| U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); |
| UChar c = s.charAt(s.length() - 1); |
| if (0x41 <= c && c <= 0x5A) { // A-Z |
| // There are Pinyin labels, add ASCII A-Z labels as well. |
| initialLabels_->add(0x41, 0x5A); // A-Z |
| break; |
| } |
| } |
| return TRUE; |
| } |
| |
| |
| /* |
| * Return the string with interspersed CGJs. Input must have more than 2 codepoints. |
| */ |
| static const UChar CGJ = 0x034F; |
| UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { |
| UnicodeString result; |
| if (item.length() == 0) { |
| return result; |
| } |
| int32_t i = 0; |
| for (;;) { |
| UChar32 cp = item.char32At(i); |
| result.append(cp); |
| i = item.moveIndex32(i, 1); |
| if (i >= item.length()) { |
| break; |
| } |
| result.append(CGJ); |
| } |
| return result; |
| } |
| |
| |
| UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { |
| return FALSE; |
| } |
| |
| |
| UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { |
| return FALSE; |
| } |
| |
| |
| const RuleBasedCollator &AlphabeticIndex::getCollator() const { |
| return *collator_; |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getInflowLabel() const { |
| return inflowLabel_; |
| } |
| |
| const UnicodeString &AlphabeticIndex::getOverflowLabel() const { |
| return overflowLabel_; |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { |
| return underflowLabel_; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
| inflowLabel_ = label; |
| clearBuckets(); |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
| overflowLabel_ = label; |
| clearBuckets(); |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
| underflowLabel_ = label; |
| clearBuckets(); |
| return *this; |
| } |
| |
| |
| int32_t AlphabeticIndex::getMaxLabelCount() const { |
| return maxLabelCount_; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (maxLabelCount <= 0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return *this; |
| } |
| maxLabelCount_ = maxLabelCount; |
| clearBuckets(); |
| return *this; |
| } |
| |
| |
| // |
| // init() - Common code for constructors. |
| // |
| |
| void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { |
| if (U_FAILURE(status)) { return; } |
| if (locale == NULL && collator_ == NULL) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| initialLabels_ = new UnicodeSet(); |
| if (initialLabels_ == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| inflowLabel_.setTo((UChar)0x2026); // Ellipsis |
| overflowLabel_ = inflowLabel_; |
| underflowLabel_ = inflowLabel_; |
| |
| if (collator_ == NULL) { |
| Collator *coll = Collator::createInstance(*locale, status); |
| if (U_FAILURE(status)) { |
| delete coll; |
| return; |
| } |
| if (coll == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| collator_ = dynamic_cast<RuleBasedCollator *>(coll); |
| if (collator_ == NULL) { |
| delete coll; |
| status = U_UNSUPPORTED_ERROR; |
| return; |
| } |
| } |
| collatorPrimaryOnly_ = collator_->clone(); |
| if (collatorPrimaryOnly_ == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); |
| firstCharsInScripts_ = firstStringsInScript(status); |
| if (U_FAILURE(status)) { return; } |
| firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); |
| // Guard against a degenerate collator where |
| // some script boundary strings are primary ignorable. |
| for (;;) { |
| if (U_FAILURE(status)) { return; } |
| if (firstCharsInScripts_->isEmpty()) { |
| // AlphabeticIndex requires some non-ignorable script boundary strings. |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| if (collatorPrimaryOnly_->compare( |
| *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), |
| emptyString_, status) == UCOL_EQUAL) { |
| firstCharsInScripts_->removeElementAt(0); |
| } else { |
| break; |
| } |
| } |
| |
| // Chinese index characters, which are specific to each of the several Chinese tailorings, |
| // take precedence over the single locale data exemplar set per language. |
| if (!addChineseIndexCharacters(status) && locale != NULL) { |
| addIndexExemplars(*locale, status); |
| } |
| } |
| |
| |
| // |
| // Comparison function for UVector<UnicodeString *> sorting with a collator. |
| // |
| static int32_t U_CALLCONV |
| collatorComparator(const void *context, const void *left, const void *right) { |
| const UElement *leftElement = static_cast<const UElement *>(left); |
| const UElement *rightElement = static_cast<const UElement *>(right); |
| const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); |
| const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); |
| |
| if (leftString == rightString) { |
| // Catches case where both are NULL |
| return 0; |
| } |
| if (leftString == NULL) { |
| return 1; |
| } |
| if (rightString == NULL) { |
| return -1; |
| } |
| const Collator *col = static_cast<const Collator *>(context); |
| UErrorCode errorCode = U_ZERO_ERROR; |
| return col->compare(*leftString, *rightString, errorCode); |
| } |
| |
| // |
| // Comparison function for UVector<Record *> sorting with a collator. |
| // |
| static int32_t U_CALLCONV |
| recordCompareFn(const void *context, const void *left, const void *right) { |
| const UElement *leftElement = static_cast<const UElement *>(left); |
| const UElement *rightElement = static_cast<const UElement *>(right); |
| const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); |
| const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); |
| const Collator *col = static_cast<const Collator *>(context); |
| UErrorCode errorCode = U_ZERO_ERROR; |
| return col->compare(leftRec->name_, rightRec->name_, errorCode); |
| } |
| |
| UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| LocalPointer<UVector> dest(new UVector(status), status); |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| dest->setDeleter(uprv_deleteUObject); |
| // Fetch the script-first-primary contractions which are defined in the root collator. |
| // They all start with U+FDD1. |
| UnicodeSet set; |
| collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| if (set.isEmpty()) { |
| status = U_UNSUPPORTED_ERROR; |
| return NULL; |
| } |
| UnicodeSetIterator iter(set); |
| while (iter.next()) { |
| const UnicodeString &boundary = iter.getString(); |
| uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); |
| if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { |
| // Ignore boundaries for the special reordering groups. |
| // Take only those for "real scripts" (where the sample character is a Letter, |
| // and the one for unassigned implicit weights (Cn). |
| continue; |
| } |
| UnicodeString *s = new UnicodeString(boundary); |
| if (s == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| dest->addElement(s, status); |
| } |
| return dest.orphan(); |
| } |
| |
| |
| namespace { |
| |
| /** |
| * Returns true if one index character string is "better" than the other. |
| * Shorter NFKD is better, and otherwise NFKD-binary-less-than is |
| * better, and otherwise binary-less-than is better. |
| */ |
| UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, |
| const UnicodeString &one, const UnicodeString &other) { |
| // This is called with primary-equal strings, but never with one.equals(other). |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString n1 = nfkdNormalizer.normalize(one, status); |
| UnicodeString n2 = nfkdNormalizer.normalize(other, status); |
| if (U_FAILURE(status)) { return FALSE; } |
| int32_t result = n1.countChar32() - n2.countChar32(); |
| if (result != 0) { |
| return result < 0; |
| } |
| result = n1.compareCodePointOrder(n2); |
| if (result != 0) { |
| return result < 0; |
| } |
| return one.compareCodePointOrder(other) < 0; |
| } |
| |
| } // namespace |
| |
| // |
| // Constructor & Destructor for AlphabeticIndex::Record |
| // |
| // Records are internal only, instances are not directly surfaced in the public API. |
| // This class is mostly struct-like, with all public fields. |
| |
| AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) |
| : name_(name), data_(data) {} |
| |
| AlphabeticIndex::Record::~Record() { |
| } |
| |
| |
| AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (inputList_ == NULL) { |
| inputList_ = new UVector(status); |
| if (inputList_ == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return *this; |
| } |
| inputList_->setDeleter(alphaIndex_deleteRecord); |
| } |
| Record *r = new Record(name, data); |
| if (r == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return *this; |
| } |
| inputList_->addElement(r, status); |
| clearBuckets(); |
| //std::string ss; |
| //std::string ss2; |
| //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << |
| // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { |
| if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { |
| inputList_->removeAllElements(); |
| clearBuckets(); |
| } |
| return *this; |
| } |
| |
| int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { |
| initBuckets(status); |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); |
| } |
| |
| |
| int32_t AlphabeticIndex::getBucketIndex() const { |
| return labelsIterIndex_; |
| } |
| |
| |
| UBool AlphabeticIndex::nextBucket(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (buckets_ == NULL && currentBucket_ != NULL) { |
| status = U_ENUM_OUT_OF_SYNC_ERROR; |
| return FALSE; |
| } |
| initBuckets(status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| ++labelsIterIndex_; |
| if (labelsIterIndex_ >= buckets_->getBucketCount()) { |
| labelsIterIndex_ = buckets_->getBucketCount(); |
| return FALSE; |
| } |
| currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); |
| resetRecordIterator(); |
| return TRUE; |
| } |
| |
| const UnicodeString &AlphabeticIndex::getBucketLabel() const { |
| if (currentBucket_ != NULL) { |
| return currentBucket_->label_; |
| } else { |
| return emptyString_; |
| } |
| } |
| |
| |
| UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { |
| if (currentBucket_ != NULL) { |
| return currentBucket_->labelType_; |
| } else { |
| return U_ALPHAINDEX_NORMAL; |
| } |
| } |
| |
| |
| int32_t AlphabeticIndex::getBucketRecordCount() const { |
| if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { |
| return currentBucket_->records_->size(); |
| } else { |
| return 0; |
| } |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| internalResetBucketIterator(); |
| return *this; |
| } |
| |
| |
| UBool AlphabeticIndex::nextRecord(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (currentBucket_ == NULL) { |
| // We are trying to iterate over the items in a bucket, but there is no |
| // current bucket from the enumeration of buckets. |
| status = U_INVALID_STATE_ERROR; |
| return FALSE; |
| } |
| if (buckets_ == NULL) { |
| status = U_ENUM_OUT_OF_SYNC_ERROR; |
| return FALSE; |
| } |
| if (currentBucket_->records_ == NULL) { |
| return FALSE; |
| } |
| ++itemsIterIndex_; |
| if (itemsIterIndex_ >= currentBucket_->records_->size()) { |
| itemsIterIndex_ = currentBucket_->records_->size(); |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getRecordName() const { |
| const UnicodeString *retStr = &emptyString_; |
| if (currentBucket_ != NULL && currentBucket_->records_ != NULL && |
| itemsIterIndex_ >= 0 && |
| itemsIterIndex_ < currentBucket_->records_->size()) { |
| Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
| retStr = &item->name_; |
| } |
| return *retStr; |
| } |
| |
| const void *AlphabeticIndex::getRecordData() const { |
| const void *retPtr = NULL; |
| if (currentBucket_ != NULL && currentBucket_->records_ != NULL && |
| itemsIterIndex_ >= 0 && |
| itemsIterIndex_ < currentBucket_->records_->size()) { |
| Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
| retPtr = item->data_; |
| } |
| return retPtr; |
| } |
| |
| |
| AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { |
| itemsIterIndex_ = -1; |
| return *this; |
| } |
| |
| |
| |
| AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, |
| const UnicodeString &lowerBoundary, |
| UAlphabeticIndexLabelType type) |
| : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), |
| displayBucket_(NULL), displayIndex_(-1), |
| records_(NULL) { |
| } |
| |
| |
| AlphabeticIndex::Bucket::~Bucket() { |
| delete records_; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_COLLATION |