Blame - libicu/cts_headers/rbt_pars.h - platform/external/icu

blob: 2a972e1eaa377468dafdcc903d9e31879312e578 [file] [log] [blame]

Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	**********************************************************************
				5	* Copyright (C) 1999-2011, International Business Machines Corporation
				6	* and others. All Rights Reserved.
				7	**********************************************************************
				8	* Date Name Description
				9	* 11/17/99 aliu Creation.
				10	**********************************************************************
				11	*/
				12	#ifndef RBT_PARS_H
				13	#define RBT_PARS_H
				14
				15	#include "unicode/utypes.h"
				16
				17	#if !UCONFIG_NO_TRANSLITERATION
				18	#ifdef __cplusplus
				19
				20	#include "unicode/uobject.h"
				21	#include "unicode/parseerr.h"
				22	#include "unicode/unorm.h"
				23	#include "rbt.h"
				24	#include "hash.h"
				25	#include "uvector.h"
				26
				27	U_NAMESPACE_BEGIN
				28
				29	class TransliterationRuleData;
				30	class UnicodeFunctor;
				31	class ParseData;
				32	class RuleHalf;
				33	class ParsePosition;
				34	class StringMatcher;
				35
				36	class TransliteratorParser : public UMemory {
				37
				38	public:
				39
				40	/**
				41	* A Vector of TransliterationRuleData objects, one for each discrete group
				42	* of rules in the rule set
				43	*/
				44	UVector dataVector;
				45
				46	/**
				47	* PUBLIC data member.
				48	* A Vector of UnicodeStrings containing all of the ID blocks in the rule set
				49	*/
				50	UVector idBlockVector;
				51
				52	/**
				53	* PUBLIC data member containing the parsed compound filter, if any.
				54	*/
				55	UnicodeSet* compoundFilter;
				56
				57	private:
				58
				59	/**
				60	* The current data object for which we are parsing rules
				61	*/
				62	TransliterationRuleData* curData;
				63
				64	UTransDirection direction;
				65
				66	/**
				67	* Parse error information.
				68	*/
				69	UParseError parseError;
				70
				71	/**
				72	* Temporary symbol table used during parsing.
				73	*/
				74	ParseData* parseData;
				75
				76	/**
				77	* Temporary vector of matcher variables. When parsing is complete, this
				78	* is copied into the array data.variables. As with data.variables,
				79	* element 0 corresponds to character data.variablesBase.
				80	*/
				81	UVector variablesVector;
				82
				83	/**
				84	* Temporary table of variable names. When parsing is complete, this is
				85	* copied into data.variableNames.
				86	*/
				87	Hashtable variableNames;
				88
				89	/**
				90	* String of standins for segments. Used during the parsing of a single
				91	* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
				92	* to StringMatcher object segmentObjects.elementAt(0), etc.
				93	*/
				94	UnicodeString segmentStandins;
				95
				96	/**
				97	* Vector of StringMatcher objects for segments. Used during the
				98	* parsing of a single rule.
				99	* segmentStandins.charAt(0) is the standin for "$1" and corresponds
				100	* to StringMatcher object segmentObjects.elementAt(0), etc.
				101	*/
				102	UVector segmentObjects;
				103
				104	/**
				105	* The next available stand-in for variables. This starts at some point in
				106	* the private use area (discovered dynamically) and increments up toward
				107	* <code>variableLimit</code>. At any point during parsing, available
				108	* variables are <code>variableNext..variableLimit-1</code>.
				109	*/
				110	UChar variableNext;
				111
				112	/**
				113	* The last available stand-in for variables. This is discovered
				114	* dynamically. At any point during parsing, available variables are
				115	* <code>variableNext..variableLimit-1</code>.
				116	*/
				117	UChar variableLimit;
				118
				119	/**
				120	* When we encounter an undefined variable, we do not immediately signal
				121	* an error, in case we are defining this variable, e.g., "$a = [a-z];".
				122	* Instead, we save the name of the undefined variable, and substitute
				123	* in the placeholder char variableLimit - 1, and decrement
				124	* variableLimit.
				125	*/
				126	UnicodeString undefinedVariableName;
				127
				128	/**
				129	* The stand-in character for the 'dot' set, represented by '.' in
				130	* patterns. This is allocated the first time it is needed, and
				131	* reused thereafter.
				132	*/
				133	UChar dotStandIn;
				134
				135	public:
				136
				137	/**
				138	* Constructor.
				139	*/
				140	TransliteratorParser(UErrorCode &statusReturn);
				141
				142	/**
				143	* Destructor.
				144	*/
				145	~TransliteratorParser();
				146
				147	/**
				148	* Parse the given string as a sequence of rules, separated by newline
				149	* characters ('\n'), and cause this object to implement those rules. Any
				150	* previous rules are discarded. Typically this method is called exactly
				151	* once after construction.
				152	*
				153	* Parse the given rules, in the given direction. After this call
				154	* returns, query the public data members for results. The caller
				155	* owns the 'data' and 'compoundFilter' data members after this
				156	* call returns.
				157	* @param rules rules, separated by ';'
				158	* @param direction either FORWARD or REVERSE.
				159	* @param pe Struct to recieve information on position
				160	* of error if an error is encountered
				161	* @param ec Output param set to success/failure code.
				162	*/
				163	void parse(const UnicodeString& rules,
				164	UTransDirection direction,
				165	UParseError& pe,
				166	UErrorCode& ec);
				167
				168	/**
				169	* Return the compound filter parsed by parse(). Caller owns result.
				170	* @return the compound filter parsed by parse().
				171	*/
				172	UnicodeSet* orphanCompoundFilter();
				173
				174	private:
				175
				176	/**
				177	* Return a representation of this transliterator as source rules.
				178	* @param rules Output param to receive the rules.
				179	* @param direction either FORWARD or REVERSE.
				180	*/
				181	void parseRules(const UnicodeString& rules,
				182	UTransDirection direction,
				183	UErrorCode& status);
				184
				185	/**
				186	* MAIN PARSER. Parse the next rule in the given rule string, starting
				187	* at pos. Return the index after the last character parsed. Do not
				188	* parse characters at or after limit.
				189	*
				190	* Important: The character at pos must be a non-whitespace character
				191	* that is not the comment character.
				192	*
				193	* This method handles quoting, escaping, and whitespace removal. It
				194	* parses the end-of-rule character. It recognizes context and cursor
				195	* indicators. Once it does a lexical breakdown of the rule at pos, it
				196	* creates a rule object and adds it to our rule list.
				197	* @param rules Output param to receive the rules.
				198	* @param pos the starting position.
				199	* @param limit pointer past the last character of the rule.
				200	* @return the index after the last character parsed.
				201	*/
				202	int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
				203
				204	/**
				205	* Set the variable range to [start, end] (inclusive).
				206	* @param start the start value of the range.
				207	* @param end the end value of the range.
				208	*/
				209	void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
				210
				211	/**
				212	* Assert that the given character is NOT within the variable range.
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	213	* If it is, return false. This is neccesary to ensure that the
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	214	* variable range does not overlap characters used in a rule.
				215	* @param ch the given character.
				216	* @return True, if the given character is NOT within the variable range.
				217	*/
				218	UBool checkVariableRange(UChar32 ch) const;
				219
				220	/**
				221	* Set the maximum backup to 'backup', in response to a pragma
				222	* statement.
				223	* @param backup the new value to be set.
				224	*/
				225	void pragmaMaximumBackup(int32_t backup);
				226
				227	/**
				228	* Begin normalizing all rules using the given mode, in response
				229	* to a pragma statement.
				230	* @param mode the given mode.
				231	*/
				232	void pragmaNormalizeRules(UNormalizationMode mode);
				233
				234	/**
				235	* Return true if the given rule looks like a pragma.
				236	* @param pos offset to the first non-whitespace character
				237	* of the rule.
				238	* @param limit pointer past the last character of the rule.
				239	* @return true if the given rule looks like a pragma.
				240	*/
				241	static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
				242
				243	/**
				244	* Parse a pragma. This method assumes resemblesPragma() has
				245	* already returned true.
				246	* @param pos offset to the first non-whitespace character
				247	* of the rule.
				248	* @param limit pointer past the last character of the rule.
				249	* @return the position index after the final ';' of the pragma,
				250	* or -1 on failure.
				251	*/
				252	int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
				253
				254	/**
				255	* Called by main parser upon syntax error. Search the rule string
				256	* for the probable end of the rule. Of course, if the error is that
				257	* the end of rule marker is missing, then the rule end will not be found.
				258	* In any case the rule start will be correctly reported.
				259	* @param parseErrorCode error code.
				260	* @param msg error description.
				261	* @param start position of first character of current rule.
				262	* @return start position of first character of current rule.
				263	*/
				264	int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
				265	UErrorCode& status);
				266
				267	/**
				268	* Parse a UnicodeSet out, store it, and return the stand-in character
				269	* used to represent it.
				270	*
				271	* @param rule the rule for UnicodeSet.
				272	* @param pos the position in pattern at which to start parsing.
				273	* @return the stand-in character used to represent it.
				274	*/
				275	UChar parseSet(const UnicodeString& rule,
				276	ParsePosition& pos,
				277	UErrorCode& status);
				278
				279	/**
				280	* Generate and return a stand-in for a new UnicodeFunctor. Store
				281	* the matcher (adopt it).
				282	* @param adopted the UnicodeFunctor to be adopted.
				283	* @return a stand-in for a new UnicodeFunctor.
				284	*/
				285	UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
				286
				287	/**
				288	* Return the standin for segment seg (1-based).
				289	* @param seg the given segment.
				290	* @return the standIn character for the given segment.
				291	*/
				292	UChar getSegmentStandin(int32_t seg, UErrorCode& status);
				293
				294	/**
				295	* Set the object for segment seg (1-based).
				296	* @param seg the given segment.
				297	* @param adopted the StringMatcher to be adopted.
				298	*/
				299	void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
				300
				301	/**
				302	* Return the stand-in for the dot set. It is allocated the first
				303	* time and reused thereafter.
				304	* @return the stand-in for the dot set.
				305	*/
				306	UChar getDotStandIn(UErrorCode& status);
				307
				308	/**
				309	* Append the value of the given variable name to the given
				310	* UnicodeString.
				311	* @param name the variable name to be appended.
				312	* @param buf the given UnicodeString to append to.
				313	*/
				314	void appendVariableDef(const UnicodeString& name,
				315	UnicodeString& buf,
				316	UErrorCode& status);
				317
				318	/**
				319	* Glue method to get around access restrictions in C++.
				320	*/
				321	/static Transliterator createBasicInstance(const UnicodeString& id,
				322	const UnicodeString* canonID);*/
				323
				324	friend class RuleHalf;
				325
				326	// Disallowed methods; no impl.
				327	/**
				328	* Copy constructor
				329	*/
				330	TransliteratorParser(const TransliteratorParser&);
				331
				332	/**
				333	* Assignment operator
				334	*/
				335	TransliteratorParser& operator=(const TransliteratorParser&);
				336	};
				337
				338	U_NAMESPACE_END
				339
				340	#endif /* #ifdef __cplusplus */
				341
				342	/**
				343	* Strip/convert the following from the transliterator rules:
				344	* comments
				345	* newlines
				346	* white space at the beginning and end of a line
				347	* unescape \u notation
				348	*
				349	* The target must be equal in size as the source.
				350	* @internal
				351	*/
				352	U_CAPI int32_t
				353	utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorCode *status);
				354
				355	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
				356
				357	#endif