blob: 2a972e1eaa377468dafdcc903d9e31879312e578 [file] [log] [blame]
Victor Chang73229502020-09-17 13:39:19 +01001// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 1999-2011, International Business Machines Corporation
6* and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 11/17/99 aliu Creation.
10**********************************************************************
11*/
12#ifndef RBT_PARS_H
13#define RBT_PARS_H
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_TRANSLITERATION
18#ifdef __cplusplus
19
20#include "unicode/uobject.h"
21#include "unicode/parseerr.h"
22#include "unicode/unorm.h"
23#include "rbt.h"
24#include "hash.h"
25#include "uvector.h"
26
27U_NAMESPACE_BEGIN
28
29class TransliterationRuleData;
30class UnicodeFunctor;
31class ParseData;
32class RuleHalf;
33class ParsePosition;
34class StringMatcher;
35
36class TransliteratorParser : public UMemory {
37
38 public:
39
40 /**
41 * A Vector of TransliterationRuleData objects, one for each discrete group
42 * of rules in the rule set
43 */
44 UVector dataVector;
45
46 /**
47 * PUBLIC data member.
48 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
49 */
50 UVector idBlockVector;
51
52 /**
53 * PUBLIC data member containing the parsed compound filter, if any.
54 */
55 UnicodeSet* compoundFilter;
56
57 private:
58
59 /**
60 * The current data object for which we are parsing rules
61 */
62 TransliterationRuleData* curData;
63
64 UTransDirection direction;
65
66 /**
67 * Parse error information.
68 */
69 UParseError parseError;
70
71 /**
72 * Temporary symbol table used during parsing.
73 */
74 ParseData* parseData;
75
76 /**
77 * Temporary vector of matcher variables. When parsing is complete, this
78 * is copied into the array data.variables. As with data.variables,
79 * element 0 corresponds to character data.variablesBase.
80 */
81 UVector variablesVector;
82
83 /**
84 * Temporary table of variable names. When parsing is complete, this is
85 * copied into data.variableNames.
86 */
87 Hashtable variableNames;
88
89 /**
90 * String of standins for segments. Used during the parsing of a single
91 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
92 * to StringMatcher object segmentObjects.elementAt(0), etc.
93 */
94 UnicodeString segmentStandins;
95
96 /**
97 * Vector of StringMatcher objects for segments. Used during the
98 * parsing of a single rule.
99 * segmentStandins.charAt(0) is the standin for "$1" and corresponds
100 * to StringMatcher object segmentObjects.elementAt(0), etc.
101 */
102 UVector segmentObjects;
103
104 /**
105 * The next available stand-in for variables. This starts at some point in
106 * the private use area (discovered dynamically) and increments up toward
107 * <code>variableLimit</code>. At any point during parsing, available
108 * variables are <code>variableNext..variableLimit-1</code>.
109 */
110 UChar variableNext;
111
112 /**
113 * The last available stand-in for variables. This is discovered
114 * dynamically. At any point during parsing, available variables are
115 * <code>variableNext..variableLimit-1</code>.
116 */
117 UChar variableLimit;
118
119 /**
120 * When we encounter an undefined variable, we do not immediately signal
121 * an error, in case we are defining this variable, e.g., "$a = [a-z];".
122 * Instead, we save the name of the undefined variable, and substitute
123 * in the placeholder char variableLimit - 1, and decrement
124 * variableLimit.
125 */
126 UnicodeString undefinedVariableName;
127
128 /**
129 * The stand-in character for the 'dot' set, represented by '.' in
130 * patterns. This is allocated the first time it is needed, and
131 * reused thereafter.
132 */
133 UChar dotStandIn;
134
135public:
136
137 /**
138 * Constructor.
139 */
140 TransliteratorParser(UErrorCode &statusReturn);
141
142 /**
143 * Destructor.
144 */
145 ~TransliteratorParser();
146
147 /**
148 * Parse the given string as a sequence of rules, separated by newline
149 * characters ('\n'), and cause this object to implement those rules. Any
150 * previous rules are discarded. Typically this method is called exactly
151 * once after construction.
152 *
153 * Parse the given rules, in the given direction. After this call
154 * returns, query the public data members for results. The caller
155 * owns the 'data' and 'compoundFilter' data members after this
156 * call returns.
157 * @param rules rules, separated by ';'
158 * @param direction either FORWARD or REVERSE.
159 * @param pe Struct to recieve information on position
160 * of error if an error is encountered
161 * @param ec Output param set to success/failure code.
162 */
163 void parse(const UnicodeString& rules,
164 UTransDirection direction,
165 UParseError& pe,
166 UErrorCode& ec);
167
168 /**
169 * Return the compound filter parsed by parse(). Caller owns result.
170 * @return the compound filter parsed by parse().
171 */
172 UnicodeSet* orphanCompoundFilter();
173
174private:
175
176 /**
177 * Return a representation of this transliterator as source rules.
178 * @param rules Output param to receive the rules.
179 * @param direction either FORWARD or REVERSE.
180 */
181 void parseRules(const UnicodeString& rules,
182 UTransDirection direction,
183 UErrorCode& status);
184
185 /**
186 * MAIN PARSER. Parse the next rule in the given rule string, starting
187 * at pos. Return the index after the last character parsed. Do not
188 * parse characters at or after limit.
189 *
190 * Important: The character at pos must be a non-whitespace character
191 * that is not the comment character.
192 *
193 * This method handles quoting, escaping, and whitespace removal. It
194 * parses the end-of-rule character. It recognizes context and cursor
195 * indicators. Once it does a lexical breakdown of the rule at pos, it
196 * creates a rule object and adds it to our rule list.
197 * @param rules Output param to receive the rules.
198 * @param pos the starting position.
199 * @param limit pointer past the last character of the rule.
200 * @return the index after the last character parsed.
201 */
202 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
203
204 /**
205 * Set the variable range to [start, end] (inclusive).
206 * @param start the start value of the range.
207 * @param end the end value of the range.
208 */
209 void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
210
211 /**
212 * Assert that the given character is NOT within the variable range.
Victor Changce4bf3c2021-01-19 16:34:24 +0000213 * If it is, return false. This is neccesary to ensure that the
Victor Chang73229502020-09-17 13:39:19 +0100214 * variable range does not overlap characters used in a rule.
215 * @param ch the given character.
216 * @return True, if the given character is NOT within the variable range.
217 */
218 UBool checkVariableRange(UChar32 ch) const;
219
220 /**
221 * Set the maximum backup to 'backup', in response to a pragma
222 * statement.
223 * @param backup the new value to be set.
224 */
225 void pragmaMaximumBackup(int32_t backup);
226
227 /**
228 * Begin normalizing all rules using the given mode, in response
229 * to a pragma statement.
230 * @param mode the given mode.
231 */
232 void pragmaNormalizeRules(UNormalizationMode mode);
233
234 /**
235 * Return true if the given rule looks like a pragma.
236 * @param pos offset to the first non-whitespace character
237 * of the rule.
238 * @param limit pointer past the last character of the rule.
239 * @return true if the given rule looks like a pragma.
240 */
241 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
242
243 /**
244 * Parse a pragma. This method assumes resemblesPragma() has
245 * already returned true.
246 * @param pos offset to the first non-whitespace character
247 * of the rule.
248 * @param limit pointer past the last character of the rule.
249 * @return the position index after the final ';' of the pragma,
250 * or -1 on failure.
251 */
252 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
253
254 /**
255 * Called by main parser upon syntax error. Search the rule string
256 * for the probable end of the rule. Of course, if the error is that
257 * the end of rule marker is missing, then the rule end will not be found.
258 * In any case the rule start will be correctly reported.
259 * @param parseErrorCode error code.
260 * @param msg error description.
261 * @param start position of first character of current rule.
262 * @return start position of first character of current rule.
263 */
264 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
265 UErrorCode& status);
266
267 /**
268 * Parse a UnicodeSet out, store it, and return the stand-in character
269 * used to represent it.
270 *
271 * @param rule the rule for UnicodeSet.
272 * @param pos the position in pattern at which to start parsing.
273 * @return the stand-in character used to represent it.
274 */
275 UChar parseSet(const UnicodeString& rule,
276 ParsePosition& pos,
277 UErrorCode& status);
278
279 /**
280 * Generate and return a stand-in for a new UnicodeFunctor. Store
281 * the matcher (adopt it).
282 * @param adopted the UnicodeFunctor to be adopted.
283 * @return a stand-in for a new UnicodeFunctor.
284 */
285 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
286
287 /**
288 * Return the standin for segment seg (1-based).
289 * @param seg the given segment.
290 * @return the standIn character for the given segment.
291 */
292 UChar getSegmentStandin(int32_t seg, UErrorCode& status);
293
294 /**
295 * Set the object for segment seg (1-based).
296 * @param seg the given segment.
297 * @param adopted the StringMatcher to be adopted.
298 */
299 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
300
301 /**
302 * Return the stand-in for the dot set. It is allocated the first
303 * time and reused thereafter.
304 * @return the stand-in for the dot set.
305 */
306 UChar getDotStandIn(UErrorCode& status);
307
308 /**
309 * Append the value of the given variable name to the given
310 * UnicodeString.
311 * @param name the variable name to be appended.
312 * @param buf the given UnicodeString to append to.
313 */
314 void appendVariableDef(const UnicodeString& name,
315 UnicodeString& buf,
316 UErrorCode& status);
317
318 /**
319 * Glue method to get around access restrictions in C++.
320 */
321 /*static Transliterator* createBasicInstance(const UnicodeString& id,
322 const UnicodeString* canonID);*/
323
324 friend class RuleHalf;
325
326 // Disallowed methods; no impl.
327 /**
328 * Copy constructor
329 */
330 TransliteratorParser(const TransliteratorParser&);
331
332 /**
333 * Assignment operator
334 */
335 TransliteratorParser& operator=(const TransliteratorParser&);
336};
337
338U_NAMESPACE_END
339
340#endif /* #ifdef __cplusplus */
341
342/**
343 * Strip/convert the following from the transliterator rules:
344 * comments
345 * newlines
346 * white space at the beginning and end of a line
347 * unescape \u notation
348 *
349 * The target must be equal in size as the source.
350 * @internal
351 */
352U_CAPI int32_t
353utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
354
355#endif /* #if !UCONFIG_NO_TRANSLITERATION */
356
357#endif