// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
11#ifndef USRCHIMP_H
12#define USRCHIMP_H
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/normalizer2.h"
19#include "unicode/ucol.h"
20#include "unicode/ucoleitr.h"
21#include "unicode/ubrk.h"
22
/*
 * Layout implied by the masks below for a 32-bit collation element (CE):
 * bits 31..16 primary weight, bits 15..8 secondary weight, bits 7..0 tertiary weight.
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_IGNORABLE 0

/*
 * Get weights from a CE.
 * UCOL_PRIMARYORDER shifts first and masks afterwards, so a signed argument
 * whose shift sign-extends still yields only the 16 primary bits.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

#define UCOL_CONTINUATION_MARKER 0xC0

/* True when both bits of UCOL_CONTINUATION_MARKER are set in CE. */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
50
51U_NAMESPACE_BEGIN
52
53class CollationElementIterator;
54class Collator;
55
/**
 * One buffered entry: a 64-bit collation element value plus a low/high pair
 * of int32_t indices. PCEBuffer::put() takes the same three values
 * (ce, ixLow, ixHigh).
 */
struct PCEI
{
    uint64_t ce;
    int32_t  low;
    int32_t  high;
};
62
63struct PCEBuffer
64{
65 PCEI defaultBuffer[16];
66 PCEI *buffer;
67 int32_t bufferIndex;
68 int32_t bufferSize;
69
70 PCEBuffer();
71 ~PCEBuffer();
72
73 void reset();
74 UBool isEmpty() const;
75 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
76 const PCEI *get();
77};
78
79class UCollationPCE : public UMemory {
80private:
81 PCEBuffer pceBuffer;
82 CollationElementIterator *cei;
83 UCollationStrength strength;
84 UBool toShift;
85 UBool isShifted;
86 uint32_t variableTop;
87
88public:
89 UCollationPCE(UCollationElements *elems);
90 UCollationPCE(CollationElementIterator *iter);
91 ~UCollationPCE();
92
93 void init(UCollationElements *elems);
94 void init(CollationElementIterator *iter);
95
96 /**
97 * Get the processed ordering priority of the next collation element in the text.
98 * A single character may contain more than one collation element.
99 *
100 * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
101 * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
102 * @param status A pointer to an UErrorCode to receive any errors.
103 * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
104 * if an error has occured or if the end of string has been reached
105 */
106 int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
107 /**
108 * Get the processed ordering priority of the previous collation element in the text.
109 * A single character may contain more than one collation element.
110 *
111 * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
112 * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
113 * @param status A pointer to an UErrorCode to receive any errors. Noteably
114 * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
115 * buffer has been exhausted.
116 * @return The previous collation elements ordering, otherwise returns
117 * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
118 * string has been reached.
119 */
120 int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
121
122private:
123 void init(const Collator &coll);
124 uint64_t processCE(uint32_t ce);
125};
126
127U_NAMESPACE_END
128
/* Capacity of the fixed-size CE/PCE buffers and accent buffers declared in this header. */
#define INITIAL_ARRAY_SIZE_ 256
/* Number of entries in the UPattern shift and backShift tables. */
#define MAX_TABLE_SIZE_ 257
131
132struct USearch {
133 // required since collation element iterator does not have a getText API
134 const UChar *text;
135 int32_t textLength; // exact length
136 UBool isOverlap;
137 UBool isCanonicalMatch;
138 int16_t elementComparisonType;
139 UBreakIterator *internalBreakIter; //internal character breakiterator
140 UBreakIterator *breakIter;
141 // value USEARCH_DONE is the default value
142 // if we are not at the start of the text or the end of the text,
143 // depending on the iteration direction and matchedIndex is USEARCH_DONE
144 // it means that we can't find any more matches in that particular direction
145 int32_t matchedIndex;
146 int32_t matchedLength;
147 UBool isForwardSearching;
148 UBool reset;
149};
150
151struct UPattern {
152 const UChar *text;
153 int32_t textLength; // exact length
154 // length required for backwards ce comparison
155 int32_t cesLength;
156 int32_t *ces;
157 int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
158 int32_t pcesLength;
159 int64_t *pces;
160 int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
161 UBool hasPrefixAccents;
162 UBool hasSuffixAccents;
163 int16_t defaultShiftSize;
164 int16_t shift[MAX_TABLE_SIZE_];
165 int16_t backShift[MAX_TABLE_SIZE_];
166};
167
168struct UStringSearch {
169 struct USearch *search;
170 struct UPattern pattern;
171 const UCollator *collator;
172 const icu::Normalizer2 *nfd;
173 // positions within the collation element iterator is used to determine
174 // if we are at the start of the text.
175 UCollationElements *textIter;
176 icu::UCollationPCE *textProcessedIter;
177 // utility collation element, used throughout program for temporary
178 // iteration.
179 UCollationElements *utilIter;
180 UBool ownCollator;
181 UCollationStrength strength;
182 uint32_t ceMask;
183 uint32_t variableTop;
184 UBool toShift;
185 UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
186 UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
187};
188
189/**
190* Exact matches without checking for the ends for extra accents.
191* The match after the position within the collation element iterator is to be
192* found.
193* After a match is found the offset in the collation element iterator will be
194* shifted to the start of the match.
195* Implementation note:
196* For tertiary we can't use the collator->tertiaryMask, that is a
197* preprocessed mask that takes into account case options. since we are only
198* concerned with exact matches, we don't need that.
199* Alternate handling - since only the 16 most significant digits is only used,
200* we can safely do a compare without masking if the ce is a variable, we mask
201* and get only the primary values no shifting to quartenary is required since
202* all primary values less than variabletop will need to be masked off anyway.
203* If the end character is composite and the pattern ce does not match the text
204* ce, we skip it until we find a match in the end composite character or when
205* it has passed the character. This is so that we can match pattern "a" with
206* the text "\u00e6"
207* @param strsrch string search data
208* @param status error status if any
Victor Changce4bf3c2021-01-19 16:34:24 +0000209* @return true if an exact match is found, false otherwise
Victor Chang73229502020-09-17 13:39:19 +0100210*/
211U_CFUNC
212UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
213
214/**
215* Canonical matches.
216* According to the definition, matches found here will include the whole span
217* of beginning and ending accents if it overlaps that region.
218* @param strsrch string search data
219* @param status error status if any
Victor Changce4bf3c2021-01-19 16:34:24 +0000220* @return true if a canonical match is found, false otherwise
Victor Chang73229502020-09-17 13:39:19 +0100221*/
222U_CFUNC
223UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
224
225/**
226* Gets the previous match.
227* Comments follows from handleNextExact
228* @param strsrch string search data
229* @param status error status if any
Victor Changce4bf3c2021-01-19 16:34:24 +0000230* @return True if a exact math is found, false otherwise.
Victor Chang73229502020-09-17 13:39:19 +0100231*/
232U_CFUNC
233UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
234
235/**
236* Canonical matches.
237* According to the definition, matches found here will include the whole span
238* of beginning and ending accents if it overlaps that region.
239* @param strsrch string search data
240* @param status error status if any
Victor Changce4bf3c2021-01-19 16:34:24 +0000241* @return true if a canonical match is found, false otherwise
Victor Chang73229502020-09-17 13:39:19 +0100242*/
243U_CFUNC
244UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
245 UErrorCode *status);
246
247#endif /* #if !UCONFIG_NO_COLLATION */
248
249#endif