Blame - libicu/cts_headers/rbt_rule.h - platform/external/icu

blob: b927f5d6c05c110d281719bdf73fbc942dabd7c9 [file] [log] [blame]

Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
				5	**********************************************************************
				6	* Date Name Description
				7	* 11/17/99 aliu Creation.
				8	**********************************************************************
				9	*/
				10	#ifndef RBT_RULE_H
				11	#define RBT_RULE_H
				12
				13	#include "unicode/utypes.h"
				14
				15	#if !UCONFIG_NO_TRANSLITERATION
				16
				17	#include "unicode/uobject.h"
				18	#include "unicode/unistr.h"
				19	#include "unicode/utrans.h"
				20	#include "unicode/unimatch.h"
				21
				22	U_NAMESPACE_BEGIN
				23
				24	class Replaceable;
				25	class TransliterationRuleData;
				26	class StringMatcher;
				27	class UnicodeFunctor;
				28
				29	/**
				30	* A transliteration rule used by
				31	* <code>RuleBasedTransliterator</code>.
				32	* <code>TransliterationRule</code> is an immutable object.
				33	*
				34	* <p>A rule consists of an input pattern and an output string. When
				35	* the input pattern is matched, the output string is emitted. The
				36	* input pattern consists of zero or more characters which are matched
				37	* exactly (the key) and optional context. Context must match if it
				38	* is specified. Context may be specified before the key, after the
				39	* key, or both. The key, preceding context, and following context
				40	* may contain variables. Variables represent a set of Unicode
				41	* characters, such as the letters <i>a</i> through <i>z</i>.
				42	* Variables are detected by looking up each character in a supplied
				43	* variable list to see if it has been so defined.
				44	*
				45	* <p>A rule may contain segments in its input string and segment
				46	* references in its output string. A segment is a substring of the
				47	* input pattern, indicated by an offset and limit. The segment may
				48	* be in the preceding or following context. It may not span a
				49	* context boundary. A segment reference is a special character in
				50	* the output string that causes a segment of the input string (not
				51	* the input pattern) to be copied to the output string. The range of
				52	* special characters that represent segment references is defined by
				53	* RuleBasedTransliterator.Data.
				54	*
				55	* @author Alan Liu
				56	*/
				57	class TransliterationRule : public UMemory {
				58
				59	private:
				60
				61	// TODO Eliminate the pattern and keyLength data members. They
				62	// are used only by masks() and getIndexValue() which are called
				63	// only during build time, not during run-time. Perhaps these
				64	// methods and pattern/keyLength can be isolated into a separate
				65	// object.
				66
				67	/**
				68	* The match that must occur before the key, or null if there is no
				69	* preceding context.
				70	*/
				71	StringMatcher *anteContext;
				72
				73	/**
				74	* The matcher object for the key. If null, then the key is empty.
				75	*/
				76	StringMatcher *key;
				77
				78	/**
				79	* The match that must occur after the key, or null if there is no
				80	* following context.
				81	*/
				82	StringMatcher *postContext;
				83
				84	/**
				85	* The object that performs the replacement if the key,
				86	* anteContext, and postContext are matched. Never null.
				87	*/
				88	UnicodeFunctor* output;
				89
				90	/**
				91	* The string that must be matched, consisting of the anteContext, key,
				92	* and postContext, concatenated together, in that order. Some components
				93	* may be empty (zero length).
				94	* @see anteContextLength
				95	* @see keyLength
				96	*/
				97	UnicodeString pattern;
				98
				99	/**
				100	* An array of matcher objects corresponding to the input pattern
				101	* segments. If there are no segments this is null. N.B. This is
				102	* a UnicodeMatcher for generality, but in practice it is always a
				103	* StringMatcher. In the future we may generalize this, but for
				104	* now we sometimes cast down to StringMatcher.
				105	*
				106	* The array is owned, but the pointers within it are not.
				107	*/
				108	UnicodeFunctor** segments;
				109
				110	/**
				111	* The number of elements in segments[] or zero if segments is NULL.
				112	*/
				113	int32_t segmentsCount;
				114
				115	/**
				116	* The length of the string that must match before the key. If
				117	* zero, then there is no matching requirement before the key.
				118	* Substring [0,anteContextLength) of pattern is the anteContext.
				119	*/
				120	int32_t anteContextLength;
				121
				122	/**
				123	* The length of the key. Substring [anteContextLength,
				124	* anteContextLength + keyLength) is the key.
				125
				126	*/
				127	int32_t keyLength;
				128
				129	/**
				130	* Miscellaneous attributes.
				131	*/
				132	int8_t flags;
				133
				134	/**
				135	* Flag attributes.
				136	*/
				137	enum {
				138	ANCHOR_START = 1,
				139	ANCHOR_END = 2
				140	};
				141
				142	/**
				143	* An alias pointer to the data for this rule. The data provides
				144	* lookup services for matchers and segments.
				145	*/
				146	const TransliterationRuleData* data;
				147
				148	public:
				149
				150	/**
				151	* Construct a new rule with the given input, output text, and other
				152	* attributes. A cursor position may be specified for the output text.
				153	* @param input input string, including key and optional ante and
				154	* post context.
				155	* @param anteContextPos offset into input to end of ante context, or -1 if
				156	* none. Must be <= input.length() if not -1.
				157	* @param postContextPos offset into input to start of post context, or -1
				158	* if none. Must be <= input.length() if not -1, and must be >=
				159	* anteContextPos.
				160	* @param outputStr output string.
				161	* @param cursorPosition offset into output at which cursor is located, or -1 if
				162	* none. If less than zero, then the cursor is placed after the
				163	* <code>outputStr</code>; that is, -1 is equivalent to
				164	* <code>outputStr.length()</code>. If greater than
				165	* <code>outputStr.length()</code> then an exception is thrown (NOTE(review): presumably reported via <code>status</code> in C++ -- confirm).
				166	* @param cursorOffset an offset to be added to cursorPosition to position the
				167	* cursor either in the ante context, if < 0, or in the post context, if >
				168	* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
				169	* "xyz" and moves the cursor to before "a". It would have a cursorOffset
				170	* of -3.
				171	* @param segs array of UnicodeFunctor pointers corresponding to input pattern
				172	* segments, or null if there are none. The array itself is adopted,
				173	* but the pointers within it are not.
				174	* @param segsCount number of elements in segs[].
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	175	* @param anchorStart true if the rule is anchored on the left to
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	176	* the context start.
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	177	* @param anchorEnd true if the rule is anchored on the right to the
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	178	* context limit.
				179	* @param data the rule data.
				180	* @param status Output parameter filled in with success or failure status.
				181	*/
				182	TransliterationRule(const UnicodeString& input,
				183	int32_t anteContextPos, int32_t postContextPos,
				184	const UnicodeString& outputStr,
				185	int32_t cursorPosition, int32_t cursorOffset,
				186	UnicodeFunctor** segs,
				187	int32_t segsCount,
				188	UBool anchorStart, UBool anchorEnd,
				189	const TransliterationRuleData* data,
				190	UErrorCode& status);
				191
				192	/**
				193	* Copy constructor. Note that the source is taken by non-const reference.
				194	* @param other the object to be copied.
				195	*/
				196	TransliterationRule(TransliterationRule& other);
				197
				198	/**
				199	* Destructor.
				200	*/
				201	virtual ~TransliterationRule();
				202
				203	/**
				204	* Change the data object that this rule belongs to. Used
				205	* internally by the TransliterationRuleData copy constructor.
				206	* @param data the new data value to be set.
				207	*/
				208	void setData(const TransliterationRuleData* data);
				209
				210	/**
				211	* Return the preceding context length. This method is needed to
				212	* support the <code>Transliterator</code> method
				213	* <code>getMaximumContextLength()</code>. Internally, this is
				214	* implemented as the anteContextLength, optionally plus one if
				215	* there is a start anchor. The one character anchor gap is
				216	* needed to make repeated incremental transliteration with
				217	* anchors work.
				218	* @return the preceding context length.
				219	*/
				220	virtual int32_t getContextLength(void) const;
				221
				222	/**
				223	* Internal method. Returns 8-bit index value for this rule.
				224	* This is the low byte of the first character of the key,
				225	* unless the first character of the key is a set. If it's a
				226	* set, or otherwise can match multiple keys, the index value is -1.
				227	* @return 8-bit index value for this rule, or -1 as described above.
				228	*/
				229	int16_t getIndexValue() const;
				230
				231	/**
				232	* Internal method. Returns true if this rule matches the given
				233	* index value. The index value is an 8-bit integer, 0..255,
				234	* representing the low byte of the first character of the key.
				235	* It matches this rule if it matches the first character of the
				236	* key, or if the first character of the key is a set, and the set
				237	* contains any character with a low byte equal to the index
				238	* value. If the rule contains only ante context, as in foo)>bar,
				239	* then it will match any key.
				240	* @param v the given index value.
				241	* @return true if this rule matches the given index value.
				242	*/
				243	UBool matchesIndexValue(uint8_t v) const;
				244
				245	/**
				246	* Return true if this rule masks another rule. If r1 masks r2 then
				247	* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
				248	* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
				249	* "[c]a>x" masks "[dc]a>y".
				250	* @param r2 the given rule to be compared with.
				251	* @return true if this rule masks 'r2'
				252	*/
				253	virtual UBool masks(const TransliterationRule& r2) const;
				254
				255	/**
				256	* Attempt a match and replacement at the given position. Return
				257	* the degree of match between this rule and the given text. The
				258	* degree of match may be mismatch, a partial match, or a full
				259	* match. A mismatch means at least one character of the text
				260	* does not match the context or key. A partial match means some
				261	* context and key characters match, but the text is not long
				262	* enough to match all of them. A full match means all context
				263	* and key characters match.
				264	*
				265	* If a full match is obtained, perform a replacement, update pos,
				266	* and return U_MATCH. Otherwise both text and pos are unchanged.
				267	*
				268	* @param text the text
				269	* @param pos the position indices
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	270	* @param incremental if true, test for partial matches that may
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	271	* be completed by additional text inserted at pos.limit.
				272	* @return one of <code>U_MISMATCH</code>,
				273	* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	274	* incremental is false then U_PARTIAL_MATCH will not be returned.
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	275	*/
				276	UMatchDegree matchAndReplace(Replaceable& text,
				277	UTransPosition& pos,
				278	UBool incremental) const;
				279
				280	/**
				281	* Create a rule string that represents this rule object. Append
				282	* it to the given string, <code>pat</code>.
				283	*/
				284	virtual UnicodeString& toRule(UnicodeString& pat,
				285	UBool escapeUnprintable) const;
				286
				287	/**
				288	* Union the set of all characters that may be modified by this rule
				289	* into the given set, <code>toUnionTo</code>.
				290	*/
				291	void addSourceSetTo(UnicodeSet& toUnionTo) const;
				292
				293	/**
				294	* Union the set of all characters that may be emitted by this rule
				295	* into the given set, <code>toUnionTo</code>.
				296	*/
				297	void addTargetSetTo(UnicodeSet& toUnionTo) const;
				298
				299	private:
				300
				301	friend class StringMatcher;
				302
				303	TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid copy assignment of this class
				304	};
				305
				306	U_NAMESPACE_END
				307
				308	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
				309
				310	#endif