// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
| 11 | #ifndef USRCHIMP_H |
| 12 | #define USRCHIMP_H |
| 13 | |
| 14 | #include "unicode/utypes.h" |
| 15 | |
| 16 | #if !UCONFIG_NO_COLLATION |
| 17 | |
| 18 | #include "unicode/normalizer2.h" |
| 19 | #include "unicode/ucol.h" |
| 20 | #include "unicode/ucoleitr.h" |
| 21 | #include "unicode/ubrk.h" |
| 22 | |
/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_IGNORABLE 0

/*
 * Get weights from a CE.
 * Each macro evaluates to the weight shifted down to bit 0:
 * primary = bits 31..16, secondary = bits 15..8, tertiary = bits 7..0.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

#define UCOL_CONTINUATION_MARKER 0xC0

/* true only when both bits of UCOL_CONTINUATION_MARKER are set in CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
| 50 | |
| 51 | U_NAMESPACE_BEGIN |
| 52 | |
| 53 | class CollationElementIterator; |
| 54 | class Collator; |
| 55 | |
| 56 | struct PCEI |
| 57 | { |
| 58 | uint64_t ce; |
| 59 | int32_t low; |
| 60 | int32_t high; |
| 61 | }; |
| 62 | |
| 63 | struct PCEBuffer |
| 64 | { |
| 65 | PCEI defaultBuffer[16]; |
| 66 | PCEI *buffer; |
| 67 | int32_t bufferIndex; |
| 68 | int32_t bufferSize; |
| 69 | |
| 70 | PCEBuffer(); |
| 71 | ~PCEBuffer(); |
| 72 | |
| 73 | void reset(); |
| 74 | UBool isEmpty() const; |
| 75 | void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); |
| 76 | const PCEI *get(); |
| 77 | }; |
| 78 | |
| 79 | class UCollationPCE : public UMemory { |
| 80 | private: |
| 81 | PCEBuffer pceBuffer; |
| 82 | CollationElementIterator *cei; |
| 83 | UCollationStrength strength; |
| 84 | UBool toShift; |
| 85 | UBool isShifted; |
| 86 | uint32_t variableTop; |
| 87 | |
| 88 | public: |
| 89 | UCollationPCE(UCollationElements *elems); |
| 90 | UCollationPCE(CollationElementIterator *iter); |
| 91 | ~UCollationPCE(); |
| 92 | |
| 93 | void init(UCollationElements *elems); |
| 94 | void init(CollationElementIterator *iter); |
| 95 | |
| 96 | /** |
| 97 | * Get the processed ordering priority of the next collation element in the text. |
| 98 | * A single character may contain more than one collation element. |
| 99 | * |
| 100 | * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE. |
| 101 | * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE. |
| 102 | * @param status A pointer to an UErrorCode to receive any errors. |
| 103 | * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER |
| 104 | * if an error has occured or if the end of string has been reached |
| 105 | */ |
| 106 | int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); |
| 107 | /** |
| 108 | * Get the processed ordering priority of the previous collation element in the text. |
| 109 | * A single character may contain more than one collation element. |
| 110 | * |
| 111 | * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE |
| 112 | * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE |
| 113 | * @param status A pointer to an UErrorCode to receive any errors. Noteably |
| 114 | * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack |
| 115 | * buffer has been exhausted. |
| 116 | * @return The previous collation elements ordering, otherwise returns |
| 117 | * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of |
| 118 | * string has been reached. |
| 119 | */ |
| 120 | int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); |
| 121 | |
| 122 | private: |
| 123 | void init(const Collator &coll); |
| 124 | uint64_t processCE(uint32_t ce); |
| 125 | }; |
| 126 | |
| 127 | U_NAMESPACE_END |
| 128 | |
/* Element count of the fixed-size inline arrays declared in this header. */
#define INITIAL_ARRAY_SIZE_ 256
/* Element count of the shift / backShift tables in UPattern. */
#define MAX_TABLE_SIZE_ 257
| 131 | |
| 132 | struct USearch { |
| 133 | // required since collation element iterator does not have a getText API |
| 134 | const UChar *text; |
| 135 | int32_t textLength; // exact length |
| 136 | UBool isOverlap; |
| 137 | UBool isCanonicalMatch; |
| 138 | int16_t elementComparisonType; |
| 139 | UBreakIterator *internalBreakIter; //internal character breakiterator |
| 140 | UBreakIterator *breakIter; |
| 141 | // value USEARCH_DONE is the default value |
| 142 | // if we are not at the start of the text or the end of the text, |
| 143 | // depending on the iteration direction and matchedIndex is USEARCH_DONE |
| 144 | // it means that we can't find any more matches in that particular direction |
| 145 | int32_t matchedIndex; |
| 146 | int32_t matchedLength; |
| 147 | UBool isForwardSearching; |
| 148 | UBool reset; |
| 149 | }; |
| 150 | |
| 151 | struct UPattern { |
| 152 | const UChar *text; |
| 153 | int32_t textLength; // exact length |
| 154 | // length required for backwards ce comparison |
| 155 | int32_t cesLength; |
| 156 | int32_t *ces; |
| 157 | int32_t cesBuffer[INITIAL_ARRAY_SIZE_]; |
| 158 | int32_t pcesLength; |
| 159 | int64_t *pces; |
| 160 | int64_t pcesBuffer[INITIAL_ARRAY_SIZE_]; |
| 161 | UBool hasPrefixAccents; |
| 162 | UBool hasSuffixAccents; |
| 163 | int16_t defaultShiftSize; |
| 164 | int16_t shift[MAX_TABLE_SIZE_]; |
| 165 | int16_t backShift[MAX_TABLE_SIZE_]; |
| 166 | }; |
| 167 | |
| 168 | struct UStringSearch { |
| 169 | struct USearch *search; |
| 170 | struct UPattern pattern; |
| 171 | const UCollator *collator; |
| 172 | const icu::Normalizer2 *nfd; |
| 173 | // positions within the collation element iterator is used to determine |
| 174 | // if we are at the start of the text. |
| 175 | UCollationElements *textIter; |
| 176 | icu::UCollationPCE *textProcessedIter; |
| 177 | // utility collation element, used throughout program for temporary |
| 178 | // iteration. |
| 179 | UCollationElements *utilIter; |
| 180 | UBool ownCollator; |
| 181 | UCollationStrength strength; |
| 182 | uint32_t ceMask; |
| 183 | uint32_t variableTop; |
| 184 | UBool toShift; |
| 185 | UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; |
| 186 | UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; |
| 187 | }; |
| 188 | |
| 189 | /** |
| 190 | * Exact matches without checking for the ends for extra accents. |
| 191 | * The match after the position within the collation element iterator is to be |
| 192 | * found. |
| 193 | * After a match is found the offset in the collation element iterator will be |
| 194 | * shifted to the start of the match. |
| 195 | * Implementation note: |
| 196 | * For tertiary we can't use the collator->tertiaryMask, that is a |
| 197 | * preprocessed mask that takes into account case options. since we are only |
| 198 | * concerned with exact matches, we don't need that. |
| 199 | * Alternate handling - since only the 16 most significant digits is only used, |
| 200 | * we can safely do a compare without masking if the ce is a variable, we mask |
| 201 | * and get only the primary values no shifting to quartenary is required since |
| 202 | * all primary values less than variabletop will need to be masked off anyway. |
| 203 | * If the end character is composite and the pattern ce does not match the text |
| 204 | * ce, we skip it until we find a match in the end composite character or when |
| 205 | * it has passed the character. This is so that we can match pattern "a" with |
| 206 | * the text "\u00e6" |
| 207 | * @param strsrch string search data |
| 208 | * @param status error status if any |
Victor Chang | ce4bf3c | 2021-01-19 16:34:24 +0000 | [diff] [blame] | 209 | * @return true if an exact match is found, false otherwise |
Victor Chang | 7322950 | 2020-09-17 13:39:19 +0100 | [diff] [blame] | 210 | */ |
| 211 | U_CFUNC |
| 212 | UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); |
| 213 | |
| 214 | /** |
| 215 | * Canonical matches. |
| 216 | * According to the definition, matches found here will include the whole span |
| 217 | * of beginning and ending accents if it overlaps that region. |
| 218 | * @param strsrch string search data |
| 219 | * @param status error status if any |
Victor Chang | ce4bf3c | 2021-01-19 16:34:24 +0000 | [diff] [blame] | 220 | * @return true if a canonical match is found, false otherwise |
Victor Chang | 7322950 | 2020-09-17 13:39:19 +0100 | [diff] [blame] | 221 | */ |
| 222 | U_CFUNC |
| 223 | UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); |
| 224 | |
| 225 | /** |
| 226 | * Gets the previous match. |
| 227 | * Comments follows from handleNextExact |
| 228 | * @param strsrch string search data |
| 229 | * @param status error status if any |
Victor Chang | ce4bf3c | 2021-01-19 16:34:24 +0000 | [diff] [blame] | 230 | * @return True if a exact math is found, false otherwise. |
Victor Chang | 7322950 | 2020-09-17 13:39:19 +0100 | [diff] [blame] | 231 | */ |
| 232 | U_CFUNC |
| 233 | UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); |
| 234 | |
| 235 | /** |
| 236 | * Canonical matches. |
| 237 | * According to the definition, matches found here will include the whole span |
| 238 | * of beginning and ending accents if it overlaps that region. |
| 239 | * @param strsrch string search data |
| 240 | * @param status error status if any |
Victor Chang | ce4bf3c | 2021-01-19 16:34:24 +0000 | [diff] [blame] | 241 | * @return true if a canonical match is found, false otherwise |
Victor Chang | 7322950 | 2020-09-17 13:39:19 +0100 | [diff] [blame] | 242 | */ |
| 243 | U_CFUNC |
| 244 | UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, |
| 245 | UErrorCode *status); |
| 246 | |
| 247 | #endif /* #if !UCONFIG_NO_COLLATION */ |
| 248 | |
| 249 | #endif |