// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
11#ifndef STRMATCH_H
12#define STRMATCH_H
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_TRANSLITERATION
17
18#include "unicode/unistr.h"
19#include "unicode/unifunct.h"
20#include "unicode/unimatch.h"
21#include "unicode/unirepl.h"
22
23U_NAMESPACE_BEGIN
24
25class TransliterationRuleData;
26
27/**
28 * An object that matches a fixed input string, implementing the
29 * UnicodeMatcher API. This object also implements the
30 * UnicodeReplacer API, allowing it to emit the matched text as
31 * output. Since the match text may contain flexible match elements,
32 * such as UnicodeSets, the emitted text is not the match pattern, but
33 * instead a substring of the actual matched text. Following
34 * convention, the output text is the leftmost match seen up to this
35 * point.
36 *
37 * A StringMatcher may represent a segment, in which case it has a
38 * positive segment number. This affects how the matcher converts
39 * itself to a pattern but does not otherwise affect its function.
40 *
41 * A StringMatcher that is not a segment should not be used as a
42 * UnicodeReplacer.
43 */
44class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
45
46 public:
47
48 /**
49 * Construct a matcher that matches the given pattern string.
50 * @param string the pattern to be matched, possibly containing
51 * stand-ins that represent nested UnicodeMatcher objects.
52 * @param start inclusive start index of text to be replaced
53 * @param limit exclusive end index of text to be replaced;
54 * must be greater than or equal to start
55 * @param segmentNum the segment number from 1..n, or 0 if this is
56 * not a segment.
57 * @param data context object mapping stand-ins to
58 * UnicodeMatcher objects.
59 */
60 StringMatcher(const UnicodeString& string,
61 int32_t start,
62 int32_t limit,
63 int32_t segmentNum,
64 const TransliterationRuleData& data);
65
66 /**
67 * Copy constructor
68 * @param o the object to be copied.
69 */
70 StringMatcher(const StringMatcher& o);
71
72 /**
73 * Destructor
74 */
75 virtual ~StringMatcher();
76
77 /**
78 * Implement UnicodeFunctor
79 * @return a copy of the object.
80 */
81 virtual StringMatcher* clone() const;
82
83 /**
84 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
85 * and return the pointer.
86 * @return the UnicodeMatcher point.
87 */
88 virtual UnicodeMatcher* toMatcher() const;
89
90 /**
91 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
92 * and return the pointer.
93 * @return the UnicodeReplacer pointer.
94 */
95 virtual UnicodeReplacer* toReplacer() const;
96
97 /**
98 * Implement UnicodeMatcher
99 * @param text the text to be matched
100 * @param offset on input, the index into text at which to begin
101 * matching. On output, the limit of the matched text. The
102 * number of matched characters is the output value of offset
103 * minus the input value. Offset should always point to the
104 * HIGH SURROGATE (leading code unit) of a pair of surrogates,
105 * both on entry and upon return.
106 * @param limit the limit index of text to be matched. Greater
107 * than offset for a forward direction match, less than offset for
108 * a backward direction match. The last character to be
109 * considered for matching will be text.charAt(limit-1) in the
110 * forward direction or text.charAt(limit+1) in the backward
111 * direction.
Victor Changce4bf3c2021-01-19 16:34:24 +0000112 * @param incremental if true, then assume further characters may
Victor Chang73229502020-09-17 13:39:19 +0100113 * be inserted at limit and check for partial matching. Otherwise
114 * assume the text as given is complete.
115 * @return a match degree value indicating a full match, a partial
Victor Changce4bf3c2021-01-19 16:34:24 +0000116 * match, or a mismatch. If incremental is false then
Victor Chang73229502020-09-17 13:39:19 +0100117 * U_PARTIAL_MATCH should never be returned.
118 */
119 virtual UMatchDegree matches(const Replaceable& text,
120 int32_t& offset,
121 int32_t limit,
122 UBool incremental);
123
124 /**
125 * Implement UnicodeMatcher
126 * @param result Output param to receive the pattern.
127 * @param escapeUnprintable if True then escape the unprintable characters.
128 * @return A reference to 'result'.
129 */
130 virtual UnicodeString& toPattern(UnicodeString& result,
Victor Changce4bf3c2021-01-19 16:34:24 +0000131 UBool escapeUnprintable = false) const;
Victor Chang73229502020-09-17 13:39:19 +0100132
133 /**
134 * Implement UnicodeMatcher
Victor Changce4bf3c2021-01-19 16:34:24 +0000135 * Returns true if this matcher will match a character c, where c
Victor Chang73229502020-09-17 13:39:19 +0100136 * & 0xFF == v, at offset, in the forward direction (with limit >
137 * offset). This is used by <tt>RuleBasedTransliterator</tt> for
138 * indexing.
139 * @param v the given value
Victor Changce4bf3c2021-01-19 16:34:24 +0000140 * @return true if this matcher will match a character c,
Victor Chang73229502020-09-17 13:39:19 +0100141 * where c & 0xFF == v
142 */
143 virtual UBool matchesIndexValue(uint8_t v) const;
144
145 /**
146 * Implement UnicodeMatcher
147 */
148 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
149
150 /**
151 * Implement UnicodeFunctor
152 */
153 virtual void setData(const TransliterationRuleData*);
154
155 /**
156 * Replace characters in 'text' from 'start' to 'limit' with the
157 * output text of this object. Update the 'cursor' parameter to
158 * give the cursor position and return the length of the
159 * replacement text.
160 *
161 * @param text the text to be matched
162 * @param start inclusive start index of text to be replaced
163 * @param limit exclusive end index of text to be replaced;
164 * must be greater than or equal to start
165 * @param cursor output parameter for the cursor position.
166 * Not all replacer objects will update this, but in a complete
167 * tree of replacer objects, representing the entire output side
168 * of a transliteration rule, at least one must update it.
169 * @return the number of 16-bit code units in the text replacing
170 * the characters at offsets start..(limit-1) in text
171 */
172 virtual int32_t replace(Replaceable& text,
173 int32_t start,
174 int32_t limit,
175 int32_t& cursor);
176
177 /**
178 * Returns a string representation of this replacer. If the
179 * result of calling this function is passed to the appropriate
180 * parser, typically TransliteratorParser, it will produce another
181 * replacer that is equal to this one.
182 * @param result the string to receive the pattern. Previous
183 * contents will be deleted.
Victor Changce4bf3c2021-01-19 16:34:24 +0000184 * @param escapeUnprintable if true then convert unprintable
Victor Chang73229502020-09-17 13:39:19 +0100185 * character to their hex escape representations, \\uxxxx or
186 * \\Uxxxxxxxx. Unprintable characters are defined by
187 * Utility.isUnprintable().
188 * @return a reference to 'result'.
189 */
190 virtual UnicodeString& toReplacerPattern(UnicodeString& result,
191 UBool escapeUnprintable) const;
192
193 /**
194 * Remove any match data. This must be called before performing a
195 * set of matches with this segment.
196 */
197 void resetMatch();
198
199 /**
200 * ICU "poor man's RTTI", returns a UClassID for the actual class.
201 */
202 virtual UClassID getDynamicClassID() const;
203
204 /**
205 * ICU "poor man's RTTI", returns a UClassID for this class.
206 */
207 static UClassID U_EXPORT2 getStaticClassID();
208
209 /**
210 * Union the set of all characters that may output by this object
211 * into the given set.
212 * @param toUnionTo the set into which to union the output characters
213 */
214 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
215
216 private:
217
218 /**
219 * The text to be matched.
220 */
221 UnicodeString pattern;
222
223 /**
224 * Context object that maps stand-ins to matcher and replacer
225 * objects.
226 */
227 const TransliterationRuleData* data;
228
229 /**
230 * The segment number, 1-based, or 0 if not a segment.
231 */
232 int32_t segmentNumber;
233
234 /**
235 * Start offset, in the match text, of the <em>rightmost</em>
236 * match.
237 */
238 int32_t matchStart;
239
240 /**
241 * Limit offset, in the match text, of the <em>rightmost</em>
242 * match.
243 */
244 int32_t matchLimit;
245
246};
247
248U_NAMESPACE_END
249
250#endif /* #if !UCONFIG_NO_TRANSLITERATION */
251
252#endif