Blame - libicu/cts_headers/strmatch.h - platform/external/icu

blob: 4ee5cbd08874b8fada0d0acea74eed1457266ed5 [file] [log] [blame]

Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	* Copyright (C) 2001-2011, International Business Machines Corporation
				5	* and others. All Rights Reserved.
				6	**********************************************************************
				7	* Date Name Description
				8	* 07/23/01 aliu Creation.
				9	**********************************************************************
				10	*/
				11	#ifndef STRMATCH_H
				12	#define STRMATCH_H
				13
				14	#include "unicode/utypes.h"
				15
				16	#if !UCONFIG_NO_TRANSLITERATION
				17
				18	#include "unicode/unistr.h"
				19	#include "unicode/unifunct.h"
				20	#include "unicode/unimatch.h"
				21	#include "unicode/unirepl.h"
				22
				23	U_NAMESPACE_BEGIN
				24
				25	class TransliterationRuleData;
				26
				27	/**
				28	* An object that matches a fixed input string, implementing the
				29	* UnicodeMatcher API. This object also implements the
				30	* UnicodeReplacer API, allowing it to emit the matched text as
				31	* output. Since the match text may contain flexible match elements,
				32	* such as UnicodeSets, the emitted text is not the match pattern, but
				33	* instead a substring of the actual matched text. Following
				34	* convention, the output text is the leftmost match seen up to this
				35	* point.
				36	*
				37	* A StringMatcher may represent a segment, in which case it has a
				38	* positive segment number. This affects how the matcher converts
				39	* itself to a pattern but does not otherwise affect its function.
				40	*
				41	* A StringMatcher that is not a segment should not be used as a
				42	* UnicodeReplacer.
				43	*/
				44	class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
				45
				46	public:
				47
				48	/**
				49	* Construct a matcher that matches the given pattern string.
				50	* @param string the pattern to be matched, possibly containing
				51	* stand-ins that represent nested UnicodeMatcher objects.
				52	* @param start inclusive start index of text to be replaced
				53	* @param limit exclusive end index of text to be replaced;
				54	* must be greater than or equal to start
				55	* @param segmentNum the segment number from 1..n, or 0 if this is
				56	* not a segment.
				57	* @param data context object mapping stand-ins to
				58	* UnicodeMatcher objects.
				59	*/
				60	StringMatcher(const UnicodeString& string,
				61	int32_t start,
				62	int32_t limit,
				63	int32_t segmentNum,
				64	const TransliterationRuleData& data);
				65
				66	/**
				67	* Copy constructor
				68	* @param o the object to be copied.
				69	*/
				70	StringMatcher(const StringMatcher& o);
				71
				72	/**
				73	* Destructor
				74	*/
				75	virtual ~StringMatcher();
				76
				77	/**
				78	* Implement UnicodeFunctor
				79	* @return a copy of the object.
				80	*/
				81	virtual StringMatcher* clone() const;
				82
				83	/**
				84	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
				85	* and return the pointer.
				86	* @return the UnicodeMatcher pointer.
				87	*/
				88	virtual UnicodeMatcher* toMatcher() const;
				89
				90	/**
				91	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
				92	* and return the pointer.
				93	* @return the UnicodeReplacer pointer.
				94	*/
				95	virtual UnicodeReplacer* toReplacer() const;
				96
				97	/**
				98	* Implement UnicodeMatcher
				99	* @param text the text to be matched
				100	* @param offset on input, the index into text at which to begin
				101	* matching. On output, the limit of the matched text. The
				102	* number of matched characters is the output value of offset
				103	* minus the input value. Offset should always point to the
				104	* HIGH SURROGATE (leading code unit) of a pair of surrogates,
				105	* both on entry and upon return.
				106	* @param limit the limit index of text to be matched. Greater
				107	* than offset for a forward direction match, less than offset for
				108	* a backward direction match. The last character to be
				109	* considered for matching will be text.charAt(limit-1) in the
				110	* forward direction or text.charAt(limit+1) in the backward
				111	* direction.
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	112	* @param incremental if true, then assume further characters may
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	113	* be inserted at limit and check for partial matching. Otherwise
				114	* assume the text as given is complete.
				115	* @return a match degree value indicating a full match, a partial
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	116	* match, or a mismatch. If incremental is false then
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	117	* U_PARTIAL_MATCH should never be returned.
				118	*/
				119	virtual UMatchDegree matches(const Replaceable& text,
				120	int32_t& offset,
				121	int32_t limit,
				122	UBool incremental);
				123
				124	/**
				125	* Implement UnicodeMatcher
				126	* @param result Output param to receive the pattern.
				127	* @param escapeUnprintable if true then escape the unprintable characters.
				128	* @return A reference to 'result'.
				129	*/
				130	virtual UnicodeString& toPattern(UnicodeString& result,
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	131	UBool escapeUnprintable = false) const;
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	132
				133	/**
				134	* Implement UnicodeMatcher
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	135	* Returns true if this matcher will match a character c, where c
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	136	* & 0xFF == v, at offset, in the forward direction (with limit >
				137	* offset). This is used by <tt>RuleBasedTransliterator</tt> for
				138	* indexing.
				139	* @param v the given value
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	140	* @return true if this matcher will match a character c,
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	141	* where c & 0xFF == v
				142	*/
				143	virtual UBool matchesIndexValue(uint8_t v) const;
				144
				145	/**
				146	* Implement UnicodeMatcher
				147	*/
				148	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
				149
				150	/**
				151	* Implement UnicodeFunctor
				152	*/
				153	virtual void setData(const TransliterationRuleData*);
				154
				155	/**
				156	* Replace characters in 'text' from 'start' to 'limit' with the
				157	* output text of this object. Update the 'cursor' parameter to
				158	* give the cursor position and return the length of the
				159	* replacement text.
				160	*
				161	* @param text the text to be matched
				162	* @param start inclusive start index of text to be replaced
				163	* @param limit exclusive end index of text to be replaced;
				164	* must be greater than or equal to start
				165	* @param cursor output parameter for the cursor position.
				166	* Not all replacer objects will update this, but in a complete
				167	* tree of replacer objects, representing the entire output side
				168	* of a transliteration rule, at least one must update it.
				169	* @return the number of 16-bit code units in the text replacing
				170	* the characters at offsets start..(limit-1) in text
				171	*/
				172	virtual int32_t replace(Replaceable& text,
				173	int32_t start,
				174	int32_t limit,
				175	int32_t& cursor);
				176
				177	/**
				178	* Returns a string representation of this replacer. If the
				179	* result of calling this function is passed to the appropriate
				180	* parser, typically TransliteratorParser, it will produce another
				181	* replacer that is equal to this one.
				182	* @param result the string to receive the pattern. Previous
				183	* contents will be deleted.
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	184	* @param escapeUnprintable if true then convert unprintable
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	185	* characters to their hex escape representations, \\uxxxx or
				186	* \\Uxxxxxxxx. Unprintable characters are defined by
				187	* Utility.isUnprintable().
				188	* @return a reference to 'result'.
				189	*/
				190	virtual UnicodeString& toReplacerPattern(UnicodeString& result,
				191	UBool escapeUnprintable) const;
				192
				193	/**
				194	* Remove any match data. This must be called before performing a
				195	* set of matches with this segment.
				196	*/
				197	void resetMatch();
				198
				199	/**
				200	* ICU "poor man's RTTI", returns a UClassID for the actual class.
				201	*/
				202	virtual UClassID getDynamicClassID() const;
				203
				204	/**
				205	* ICU "poor man's RTTI", returns a UClassID for this class.
				206	*/
				207	static UClassID U_EXPORT2 getStaticClassID();
				208
				209	/**
				210	* Union the set of all characters that may be output by this object
				211	* into the given set.
				212	* @param toUnionTo the set into which to union the output characters
				213	*/
				214	virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
				215
				216	private:
				217
				218	/**
				219	* The text to be matched.
				220	*/
				221	UnicodeString pattern;
				222
				223	/**
				224	* Context object that maps stand-ins to matcher and replacer
				225	* objects.
				226	*/
				227	const TransliterationRuleData* data;
				228
				229	/**
				230	* The segment number, 1-based, or 0 if not a segment.
				231	*/
				232	int32_t segmentNumber;
				233
				234	/**
				235	* Start offset, in the match text, of the <em>rightmost</em>
				236	* match.
				237	*/
				238	int32_t matchStart;
				239
				240	/**
				241	* Limit offset, in the match text, of the <em>rightmost</em>
				242	* match.
				243	*/
				244	int32_t matchLimit;
				245
				246	};
				247
				248	U_NAMESPACE_END
				249
				250	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
				251
				252	#endif