| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 2001-2015 IBM and others. All rights reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 08/13/2001 synwee Creation. |
| ********************************************************************** |
| */ |
| #ifndef USRCHIMP_H |
| #define USRCHIMP_H |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/normalizer2.h" |
| #include "unicode/ucol.h" |
| #include "unicode/ucoleitr.h" |
| #include "unicode/ubrk.h" |
| |
| /* mask off anything but primary order (bits 31..16 of a 32-bit CE) */ |
| #define UCOL_PRIMARYORDERMASK 0xffff0000 |
| /* mask off anything but secondary order (bits 15..8) */ |
| #define UCOL_SECONDARYORDERMASK 0x0000ff00 |
| /* mask off anything but tertiary order (bits 7..0) */ |
| #define UCOL_TERTIARYORDERMASK 0x000000ff |
| /* primary order shift: right-shift count that brings the primary bits down to bit 0 */ |
| #define UCOL_PRIMARYORDERSHIFT 16 |
| /* secondary order shift: right-shift count that brings the secondary bits down to bit 0 */ |
| #define UCOL_SECONDARYORDERSHIFT 8 |
| |
| #define UCOL_IGNORABLE 0 /* NOTE(review): presumably the CE/weight value that means "ignorable" -- confirm against the users of this macro */ |
| |
| /* get weights from a CE: each macro yields the weight as a small integer starting at bit 0 */ |
| #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff) |
| #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT) |
| #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK) |
| |
| #define UCOL_CONTINUATION_MARKER 0xC0 /* bit pattern tested by isContinuation() */ |
| |
| #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) /* true only when both bits of the 0xC0 marker are set in CE */ |
| |
| /** |
| * Value returned by nextProcessed() and previousProcessed() to indicate that an |
| * error has occurred during processing or that there are no more CEs to be |
| * returned. |
| */ |
| #define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX) |
| |
| U_NAMESPACE_BEGIN |
| |
| class CollationElementIterator; /* forward declaration: this header only uses pointers to it */ |
| class Collator; /* forward declaration: this header only uses a const reference to it */ |
| |
| struct PCEI /* element type of PCEBuffer: one 64-bit CE value plus two int32_t indices */ |
| { |
| uint64_t ce; /* 64-bit CE value (same type as the ce argument of PCEBuffer::put) */ |
| int32_t low; /* NOTE(review): presumably the ixLow passed to PCEBuffer::put -- confirm in the implementation */ |
| int32_t high; /* NOTE(review): presumably the ixHigh passed to PCEBuffer::put -- confirm in the implementation */ |
| }; |
| |
| struct PCEBuffer /* buffer of PCEI entries that embeds storage for 16 entries */ |
| { |
| PCEI defaultBuffer[16]; /* storage embedded in the object itself */ |
| PCEI *buffer; /* NOTE(review): presumably points at defaultBuffer or at a larger separately allocated array -- confirm in the implementation */ |
| int32_t bufferIndex; /* NOTE(review): presumably the number of entries currently stored -- confirm */ |
| int32_t bufferSize; /* NOTE(review): presumably the capacity of buffer, counted in entries -- confirm */ |
| |
| PCEBuffer(); |
| ~PCEBuffer(); /* NOTE(review): no copy operations are declared although the struct holds a raw pointer and declares a destructor -- confirm that instances are never copied */ |
| |
| void reset(); |
| UBool isEmpty() const; |
| void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); /* takes the three values of one PCEI plus a UErrorCode by non-const reference */ |
| const PCEI *get(); /* hands out a read-only pointer to an entry; NOTE(review): which entry, and whether it is removed, is not visible in this header */ |
| }; |
| |
| class UCollationPCE : public UMemory { /* yields "processed" collation elements as int64_t values, forwards or backwards; see nextProcessed() and previousProcessed() */ |
| private: |
| PCEBuffer pceBuffer; /* NOTE(review): presumably the "internal stack buffer" mentioned under previousProcessed() -- confirm */ |
| CollationElementIterator *cei; /* iterator this object works on; NOTE(review): ownership is not visible in this header */ |
| UCollationStrength strength; |
| UBool toShift; |
| UBool isShifted; |
| uint32_t variableTop; |
| |
| public: |
| UCollationPCE(UCollationElements *elems); /* NOTE(review): not declared explicit, so a UCollationElements pointer converts implicitly -- confirm that this is intended */ |
| UCollationPCE(CollationElementIterator *iter); /* NOTE(review): not declared explicit either -- same question as above */ |
| ~UCollationPCE(); |
| |
| void init(UCollationElements *elems); /* the two public init() overloads take the same argument types as the two constructors */ |
| void init(CollationElementIterator *iter); |
| |
| /** |
| * Get the processed ordering priority of the next collation element in the text. |
| * A single character may contain more than one collation element. |
| * |
| * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE. |
| * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE. |
| * @param status A pointer to a UErrorCode to receive any errors. |
| * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER |
| * if an error has occurred or if the end of string has been reached. |
| */ |
| int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); |
| /** |
| * Get the processed ordering priority of the previous collation element in the text. |
| * A single character may contain more than one collation element. |
| * |
| * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE. |
| * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE. |
| * @param status A pointer to a UErrorCode to receive any errors. Notably |
| * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack |
| * buffer has been exhausted. |
| * @return The previous collation element's ordering, otherwise returns |
| * UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of |
| * string has been reached. |
| */ |
| int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); |
| |
| private: |
| void init(const Collator &coll); /* private overload taking the collator itself */ |
| uint64_t processCE(uint32_t ce); /* takes a 32-bit CE and returns a 64-bit value */ |
| }; |
| |
| U_NAMESPACE_END |
| |
| #define INITIAL_ARRAY_SIZE_ 256 /* element count of the fixed-size arrays in UPattern and UStringSearch */ |
| #define MAX_TABLE_SIZE_ 257 /* element count of the shift and backShift tables in UPattern */ |
| |
| struct USearch { /* per-search state: the text, option flags, break iterators and the current match index/length */ |
| // kept here because the collation element iterator does not have a getText API |
| const UChar *text; |
| int32_t textLength; // exact length |
| UBool isOverlap; |
| UBool isCanonicalMatch; |
| int16_t elementComparisonType; |
| UBreakIterator *internalBreakIter; // internal character break iterator |
| UBreakIterator *breakIter; |
| // USEARCH_DONE is the default value of matchedIndex. |
| // If matchedIndex is USEARCH_DONE and we are not at the start or the end of |
| // the text (depending on the iteration direction), it means that we can't |
| // find any more matches in that particular direction. |
| int32_t matchedIndex; |
| int32_t matchedLength; |
| UBool isForwardSearching; |
| UBool reset; |
| }; |
| |
| struct UPattern { /* the pattern text together with its 32-bit (ces) and 64-bit (pces) CE arrays and two shift tables */ |
| const UChar *text; |
| int32_t textLength; // exact length |
| // length required for backwards CE comparison |
| int32_t cesLength; |
| int32_t *ces; /* NOTE(review): presumably points at cesBuffer unless a larger array had to be allocated -- confirm in the implementation */ |
| int32_t cesBuffer[INITIAL_ARRAY_SIZE_]; |
| int32_t pcesLength; |
| int64_t *pces; /* NOTE(review): presumably points at pcesBuffer unless a larger array had to be allocated -- confirm in the implementation */ |
| int64_t pcesBuffer[INITIAL_ARRAY_SIZE_]; |
| UBool hasPrefixAccents; |
| UBool hasSuffixAccents; |
| int16_t defaultShiftSize; |
| int16_t shift[MAX_TABLE_SIZE_]; |
| int16_t backShift[MAX_TABLE_SIZE_]; |
| }; |
| |
| struct UStringSearch { /* the complete string-search object: search state, pattern, collator, normalizer and iterators */ |
| struct USearch *search; /* held by pointer */ |
| struct UPattern pattern; /* embedded by value */ |
| const UCollator *collator; |
| const icu::Normalizer2 *nfd; |
| // positions within the collation element iterator are used to determine |
| // if we are at the start of the text. |
| UCollationElements *textIter; |
| icu::UCollationPCE *textProcessedIter; |
| // utility collation element iterator, used throughout the program for |
| // temporary iteration. |
| UCollationElements *utilIter; |
| UBool ownCollator; /* NOTE(review): presumably set when this object is responsible for closing collator -- confirm in the implementation */ |
| UCollationStrength strength; |
| uint32_t ceMask; |
| uint32_t variableTop; |
| UBool toShift; |
| UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; |
| UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; |
| }; |
| |
| /** |
| * Exact matches without checking the ends for extra accents. |
| * The match after the position within the collation element iterator is to be |
| * found. |
| * After a match is found the offset in the collation element iterator will be |
| * shifted to the start of the match. |
| * Implementation note: |
| * For tertiary we can't use the collator->tertiaryMask, that is a |
| * preprocessed mask that takes into account case options. Since we are only |
| * concerned with exact matches, we don't need that. |
| * Alternate handling - since only the 16 most significant bits are used, |
| * we can safely do a compare without masking if the ce is a variable, we mask |
| * and get only the primary values no shifting to quaternary is required since |
| * all primary values less than variabletop will need to be masked off anyway. |
| * If the end character is composite and the pattern ce does not match the text |
| * ce, we skip it until we find a match in the end composite character or when |
| * it has passed the character. This is so that we can match pattern "a" with |
| * the text "\u00e6". |
| * @param strsrch string search data |
| * @param status error status if any |
| * @return true if an exact match is found, false otherwise |
| */ |
| U_CFUNC |
| UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); |
| |
| /** |
| * Canonical matches. |
| * According to the definition, matches found here will include the whole span |
| * of beginning and ending accents if they overlap that region. |
| * @param strsrch string search data |
| * @param status error status if any |
| * @return true if a canonical match is found, false otherwise |
| */ |
| U_CFUNC |
| UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); |
| |
| /** |
| * Gets the previous match. |
| * Comments follow from usearch_handleNextExact. |
| * @param strsrch string search data |
| * @param status error status if any |
| * @return true if an exact match is found, false otherwise |
| */ |
| U_CFUNC |
| UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); |
| |
| /** |
| * Canonical matches (the "previous" counterpart of usearch_handleNextCanonical). |
| * According to the definition, matches found here will include the whole span |
| * of beginning and ending accents if they overlap that region. |
| * @param strsrch string search data |
| * @param status error status if any |
| * @return true if a canonical match is found, false otherwise |
| */ |
| U_CFUNC |
| UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, |
| UErrorCode *status); |
| |
| #endif /* #if !UCONFIG_NO_COLLATION */ |
| |
| #endif |