Blame - libicu/cts_headers/usrchimp.h - platform/external/icu

blob: f11816785f6db6dae02fd69da4b7500985f5537a [file] [log] [blame]

Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	**********************************************************************
				5	* Copyright (C) 2001-2015 IBM and others. All rights reserved.
				6	**********************************************************************
				7	* Date Name Description
				8	* 08/13/2001 synwee Creation.
				9	**********************************************************************
				10	*/
				11	#ifndef USRCHIMP_H
				12	#define USRCHIMP_H
				13
				14	#include "unicode/utypes.h"
				15
				16	#if !UCONFIG_NO_COLLATION
				17
				18	#include "unicode/normalizer2.h"
				19	#include "unicode/ucol.h"
				20	#include "unicode/ucoleitr.h"
				21	#include "unicode/ubrk.h"
				22
				23	/* mask selecting the primary weight: the upper 16 bits of a 32-bit CE */
				24	#define UCOL_PRIMARYORDERMASK 0xffff0000
				25	/* mask selecting the secondary weight: bits 8-15 of a 32-bit CE */
				26	#define UCOL_SECONDARYORDERMASK 0x0000ff00
				27	/* mask selecting the tertiary weight: the low 8 bits of a 32-bit CE */
				28	#define UCOL_TERTIARYORDERMASK 0x000000ff
				29	/* right shift that moves the primary weight down to bit 0 */
				30	#define UCOL_PRIMARYORDERSHIFT 16
				31	/* right shift that moves the secondary weight down to bit 0 */
				32	#define UCOL_SECONDARYORDERSHIFT 8
				33
				34	#define UCOL_IGNORABLE 0
				35
				36	/* extract the individual weights from a CE; each macro evaluates its argument once */
				37	#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
				38	#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
				39	#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
				40
				41	#define UCOL_CONTINUATION_MARKER 0xC0
				42
				43	#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) /* true only when both 0xC0 bits are set in CE */
				44
				45	/**
				46	* This indicates an error has occurred during processing or there are no more CEs
				47	* to be returned.
				48	*/
				49	#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
				50
				51	U_NAMESPACE_BEGIN
				52
				53	class CollationElementIterator;
				54	class Collator;
				55
				56	struct PCEI // one entry of a PCEBuffer (element type of PCEBuffer::buffer)
				57	{
				58	uint64_t ce; // 64-bit collation element value
				59	int32_t low; // presumably the ixLow index given to PCEBuffer::put() - TODO confirm
				60	int32_t high; // presumably the ixHigh index given to PCEBuffer::put() - TODO confirm
				61	};
				62
				63	struct PCEBuffer // buffer of PCEI entries with 16 entries of inline storage
				64	{
				65	PCEI defaultBuffer[16]; // inline storage for 16 entries
				66	PCEI *buffer; // presumably points at defaultBuffer unless more room was needed - TODO confirm
				67	int32_t bufferIndex; // NOTE(review): looks like the current position within buffer - confirm
				68	int32_t bufferSize; // NOTE(review): looks like the capacity of buffer, in entries - confirm
				69
				70	PCEBuffer();
				71	~PCEBuffer();
				72
				73	void reset();
				74	UBool isEmpty() const;
				75	void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); // takes the three values of one PCEI plus an error code reference
				76	const PCEI *get(); // NOTE(review): behavior on an empty buffer is not visible in this header
				77	};
				78
				79	class UCollationPCE : public UMemory { // hands out "processed" 64-bit collation elements, forwards or backwards
				80	private:
				81	PCEBuffer pceBuffer;
				82	CollationElementIterator *cei;
				83	UCollationStrength strength;
				84	UBool toShift;
				85	UBool isShifted;
				86	uint32_t variableTop;
				87
				88	public:
				89	UCollationPCE(UCollationElements *elems);
				90	UCollationPCE(CollationElementIterator *iter);
				91	~UCollationPCE();
				92
				93	void init(UCollationElements *elems);
				94	void init(CollationElementIterator *iter);
				95
				96	/**
				97	* Get the processed ordering priority of the next collation element in the text.
				98	* A single character may contain more than one collation element.
				99	*
				100	* @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
				101	* @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
				102	* @param status A pointer to an UErrorCode to receive any errors.
				103	* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
				104	* if an error has occurred or if the end of string has been reached
				105	*/
				106	int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
				107	/**
				108	* Get the processed ordering priority of the previous collation element in the text.
				109	* A single character may contain more than one collation element.
				110	*
				111	* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
				112	* @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
				113	* @param status A pointer to an UErrorCode to receive any errors. Notably
				114	* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
				115	* buffer has been exhausted.
				116	* @return The previous collation elements ordering, otherwise returns
				117	* UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
				118	* string has been reached.
				119	*/
				120	int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
				121
				122	private:
				123	void init(const Collator &coll);
				124	uint64_t processCE(uint32_t ce);
				125	};
				126
				127	U_NAMESPACE_END
				128
				129	#define INITIAL_ARRAY_SIZE_ 256 /* element count of the fixed arrays in UPattern and UStringSearch */
				130	#define MAX_TABLE_SIZE_ 257 /* element count of UPattern::shift and UPattern::backShift */
				131
				132	struct USearch {
				133	// required since the collation element iterator does not have a getText API
				134	const UChar *text;
				135	int32_t textLength; // exact length of text
				136	UBool isOverlap;
				137	UBool isCanonicalMatch;
				138	int16_t elementComparisonType;
				139	UBreakIterator *internalBreakIter; // internal character break iterator
				140	UBreakIterator *breakIter;
				141	// USEARCH_DONE is the default value of matchedIndex.
				142	// If matchedIndex is USEARCH_DONE and we are not at the start or the end
				143	// of the text (depending on the iteration direction), it means that we
				144	// can't find any more matches in that particular direction.
				145	int32_t matchedIndex;
				146	int32_t matchedLength;
				147	UBool isForwardSearching;
				148	UBool reset;
				149	};
				150
				151	struct UPattern {
				152	const UChar *text;
				153	int32_t textLength; // exact length of text
				154	// length required for backwards ce comparison
				155	int32_t cesLength;
				156	int32_t *ces; // presumably points at cesBuffer unless more room was needed - TODO confirm
				157	int32_t cesBuffer[INITIAL_ARRAY_SIZE_]; // fixed storage for 32-bit CEs
				158	int32_t pcesLength;
				159	int64_t *pces; // presumably points at pcesBuffer unless more room was needed - TODO confirm
				160	int64_t pcesBuffer[INITIAL_ARRAY_SIZE_]; // fixed storage for 64-bit CEs
				161	UBool hasPrefixAccents;
				162	UBool hasSuffixAccents;
				163	int16_t defaultShiftSize;
				164	int16_t shift[MAX_TABLE_SIZE_];
				165	int16_t backShift[MAX_TABLE_SIZE_];
				166	};
				167
				168	struct UStringSearch {
				169	struct USearch *search;
				170	struct UPattern pattern;
				171	const UCollator *collator;
				172	const icu::Normalizer2 *nfd;
				173	// positions within the collation element iterator are used to determine
				174	// if we are at the start of the text.
				175	UCollationElements *textIter;
				176	icu::UCollationPCE *textProcessedIter;
				177	// utility collation element iterator, used throughout the program for
				178	// temporary iteration.
				179	UCollationElements *utilIter;
				180	UBool ownCollator;
				181	UCollationStrength strength;
				182	uint32_t ceMask;
				183	uint32_t variableTop;
				184	UBool toShift;
				185	UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
				186	UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
				187	};
				188
				189	/**
				190	* Exact matches without checking for the ends for extra accents.
				191	* The match after the position within the collation element iterator is to be
				192	* found.
				193	* After a match is found the offset in the collation element iterator will be
				194	* shifted to the start of the match.
				195	* Implementation note:
				196	* For tertiary we can't use the collator->tertiaryMask, that is a
				197	* preprocessed mask that takes into account case options. Since we are only
				198	* concerned with exact matches, we don't need that.
				199	* Alternate handling - since only the 16 most significant digits are used,
				200	* we can safely do a compare without masking. If the ce is a variable, we mask
				201	* and get only the primary values; no shifting to quaternary is required since
				202	* all primary values less than variabletop will need to be masked off anyway.
				203	* If the end character is composite and the pattern ce does not match the text
				204	* ce, we skip it until we find a match in the end composite character or when
				205	* it has passed the character. This is so that we can match pattern "a" with
				206	* the text "\u00e6"
				207	* @param strsrch string search data
				208	* @param status error status if any
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	209	* @return true if an exact match is found, false otherwise
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	210	*/
				211	U_CFUNC
				212	UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
				213
				214	/**
				215	* Canonical matches.
				216	* According to the definition, matches found here will include the whole span
				217	* of beginning and ending accents if it overlaps that region.
				218	* @param strsrch string search data
				219	* @param status error status if any
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	220	* @return true if a canonical match is found, false otherwise
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	221	*/
				222	U_CFUNC
				223	UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
				224
				225	/**
				226	* Gets the previous match.
				227	* Comments follow from handleNextExact
				228	* @param strsrch string search data
				229	* @param status error status if any
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	230	* @return true if an exact match is found, false otherwise.
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	231	*/
				232	U_CFUNC
				233	UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
				234
				235	/**
				236	* Gets the previous canonical match.
				237	* According to the definition, matches found here will include the whole span
				238	* of beginning and ending accents if it overlaps that region.
				239	* @param strsrch pointer to the string search data
				240	* @param status pointer that receives the error status, if any
Victor Chang	ce4bf3c	2021-01-19 16:34:24 +0000	[diff] [blame]	241	* @return true if a canonical match is found, false otherwise
Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	242	*/
				243	U_CFUNC
				244	UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
				245	UErrorCode *status);
				246
				247	#endif /* #if !UCONFIG_NO_COLLATION */
				248
				249	#endif