// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2009-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/

19#ifndef __NORMALIZER2_H__
20#define __NORMALIZER2_H__
21
22/**
23 * \file
24 * \brief C++ API: New API for Unicode Normalization.
25 */
26
27#include "unicode/utypes.h"
28
29#if U_SHOW_CPLUSPLUS_API
30
31#if !UCONFIG_NO_NORMALIZATION
32
33#include "unicode/stringpiece.h"
34#include "unicode/uniset.h"
35#include "unicode/unistr.h"
36#include "unicode/unorm2.h"
37
38U_NAMESPACE_BEGIN
39
40class ByteSink;
41
42/**
43 * Unicode normalization functionality for standard Unicode normalization or
44 * for using custom mapping tables.
45 * All instances of this class are unmodifiable/immutable.
46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47 * The Normalizer2 class is not intended for public subclassing.
48 *
49 * The primary functions are to produce a normalized string and to detect whether
50 * a string is already normalized.
51 * The most commonly used normalization forms are those defined in
52 * http://www.unicode.org/unicode/reports/tr15/
53 * However, this API supports additional normalization forms for specialized purposes.
54 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
55 * and can be used in implementations of UTS #46.
56 *
57 * Not only are the standard compose and decompose modes supplied,
58 * but additional modes are provided as documented in the Mode enum.
59 *
60 * Some of the functions in this class identify normalization boundaries.
61 * At a normalization boundary, the portions of the string
62 * before it and starting from it do not interact and can be handled independently.
63 *
64 * The spanQuickCheckYes() stops at a normalization boundary.
65 * When the goal is a normalized string, then the text before the boundary
66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67 *
68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69 * a character is guaranteed to be at a normalization boundary,
70 * regardless of context.
71 * This is used for moving from one normalization boundary to the next
72 * or preceding boundary, and for performing iterative normalization.
73 *
74 * Iterative normalization is useful when only a small portion of a
75 * longer string needs to be processed.
76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78 * (to process only the substring for which sort key bytes are computed).
79 *
80 * The set of normalization boundaries returned by these functions may not be
81 * complete: There may be more boundaries that could be returned.
82 * Different functions may return different boundaries.
83 * @stable ICU 4.4
84 */
85class U_COMMON_API Normalizer2 : public UObject {
86public:
87 /**
88 * Destructor.
89 * @stable ICU 4.4
90 */
91 ~Normalizer2();
92
93 /**
94 * Returns a Normalizer2 instance for Unicode NFC normalization.
95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
96 * Returns an unmodifiable singleton instance. Do not delete it.
97 * @param errorCode Standard ICU error code. Its input value must
98 * pass the U_SUCCESS() test, or else the function returns
99 * immediately. Check for U_FAILURE() on output or use with
100 * function chaining. (See User Guide for details.)
101 * @return the requested Normalizer2, if successful
102 * @stable ICU 49
103 */
104 static const Normalizer2 *
105 getNFCInstance(UErrorCode &errorCode);
106
107 /**
108 * Returns a Normalizer2 instance for Unicode NFD normalization.
109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
110 * Returns an unmodifiable singleton instance. Do not delete it.
111 * @param errorCode Standard ICU error code. Its input value must
112 * pass the U_SUCCESS() test, or else the function returns
113 * immediately. Check for U_FAILURE() on output or use with
114 * function chaining. (See User Guide for details.)
115 * @return the requested Normalizer2, if successful
116 * @stable ICU 49
117 */
118 static const Normalizer2 *
119 getNFDInstance(UErrorCode &errorCode);
120
121 /**
122 * Returns a Normalizer2 instance for Unicode NFKC normalization.
123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
124 * Returns an unmodifiable singleton instance. Do not delete it.
125 * @param errorCode Standard ICU error code. Its input value must
126 * pass the U_SUCCESS() test, or else the function returns
127 * immediately. Check for U_FAILURE() on output or use with
128 * function chaining. (See User Guide for details.)
129 * @return the requested Normalizer2, if successful
130 * @stable ICU 49
131 */
132 static const Normalizer2 *
133 getNFKCInstance(UErrorCode &errorCode);
134
135 /**
136 * Returns a Normalizer2 instance for Unicode NFKD normalization.
137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
138 * Returns an unmodifiable singleton instance. Do not delete it.
139 * @param errorCode Standard ICU error code. Its input value must
140 * pass the U_SUCCESS() test, or else the function returns
141 * immediately. Check for U_FAILURE() on output or use with
142 * function chaining. (See User Guide for details.)
143 * @return the requested Normalizer2, if successful
144 * @stable ICU 49
145 */
146 static const Normalizer2 *
147 getNFKDInstance(UErrorCode &errorCode);
148
149 /**
150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
152 * Returns an unmodifiable singleton instance. Do not delete it.
153 * @param errorCode Standard ICU error code. Its input value must
154 * pass the U_SUCCESS() test, or else the function returns
155 * immediately. Check for U_FAILURE() on output or use with
156 * function chaining. (See User Guide for details.)
157 * @return the requested Normalizer2, if successful
158 * @stable ICU 49
159 */
160 static const Normalizer2 *
161 getNFKCCasefoldInstance(UErrorCode &errorCode);
162
163 /**
164 * Returns a Normalizer2 instance which uses the specified data file
165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
166 * and which composes or decomposes text according to the specified mode.
167 * Returns an unmodifiable singleton instance. Do not delete it.
168 *
169 * Use packageName=NULL for data files that are part of ICU's own data.
170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
173 *
174 * @param packageName NULL for ICU built-in data, otherwise application data package name
175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
176 * @param mode normalization mode (compose or decompose etc.)
177 * @param errorCode Standard ICU error code. Its input value must
178 * pass the U_SUCCESS() test, or else the function returns
179 * immediately. Check for U_FAILURE() on output or use with
180 * function chaining. (See User Guide for details.)
181 * @return the requested Normalizer2, if successful
182 * @stable ICU 4.4
183 */
184 static const Normalizer2 *
185 getInstance(const char *packageName,
186 const char *name,
187 UNormalization2Mode mode,
188 UErrorCode &errorCode);
189
190 /**
191 * Returns the normalized form of the source string.
192 * @param src source string
193 * @param errorCode Standard ICU error code. Its input value must
194 * pass the U_SUCCESS() test, or else the function returns
195 * immediately. Check for U_FAILURE() on output or use with
196 * function chaining. (See User Guide for details.)
197 * @return normalized src
198 * @stable ICU 4.4
199 */
200 UnicodeString
201 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202 UnicodeString result;
203 normalize(src, result, errorCode);
204 return result;
205 }
206 /**
207 * Writes the normalized form of the source string to the destination string
208 * (replacing its contents) and returns the destination string.
209 * The source and destination strings must be different objects.
210 * @param src source string
211 * @param dest destination string; its contents is replaced with normalized src
212 * @param errorCode Standard ICU error code. Its input value must
213 * pass the U_SUCCESS() test, or else the function returns
214 * immediately. Check for U_FAILURE() on output or use with
215 * function chaining. (See User Guide for details.)
216 * @return dest
217 * @stable ICU 4.4
218 */
219 virtual UnicodeString &
220 normalize(const UnicodeString &src,
221 UnicodeString &dest,
222 UErrorCode &errorCode) const = 0;
223
224 /**
225 * Normalizes a UTF-8 string and optionally records how source substrings
226 * relate to changed and unchanged result substrings.
227 *
228 * Currently implemented completely only for "compose" modes,
229 * such as for NFC, NFKC, and NFKC_Casefold
230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
231 * Otherwise currently converts to & from UTF-16 and does not support edits.
232 *
233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
234 * @param src Source UTF-8 string.
235 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
236 * sink.Flush() is called at the end.
237 * @param edits Records edits for index mapping, working with styled text,
238 * and getting only changes (if any).
239 * The Edits contents is undefined if any error occurs.
240 * This function calls edits->reset() first unless
241 * options includes U_EDITS_NO_RESET. edits can be nullptr.
242 * @param errorCode Standard ICU error code. Its input value must
243 * pass the U_SUCCESS() test, or else the function returns
244 * immediately. Check for U_FAILURE() on output or use with
245 * function chaining. (See User Guide for details.)
246 * @stable ICU 60
247 */
248 virtual void
249 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
250 Edits *edits, UErrorCode &errorCode) const;
251
252 /**
253 * Appends the normalized form of the second string to the first string
254 * (merging them at the boundary) and returns the first string.
255 * The result is normalized if the first string was normalized.
256 * The first and second strings must be different objects.
257 * @param first string, should be normalized
258 * @param second string, will be normalized
259 * @param errorCode Standard ICU error code. Its input value must
260 * pass the U_SUCCESS() test, or else the function returns
261 * immediately. Check for U_FAILURE() on output or use with
262 * function chaining. (See User Guide for details.)
263 * @return first
264 * @stable ICU 4.4
265 */
266 virtual UnicodeString &
267 normalizeSecondAndAppend(UnicodeString &first,
268 const UnicodeString &second,
269 UErrorCode &errorCode) const = 0;
270 /**
271 * Appends the second string to the first string
272 * (merging them at the boundary) and returns the first string.
273 * The result is normalized if both the strings were normalized.
274 * The first and second strings must be different objects.
275 * @param first string, should be normalized
276 * @param second string, should be normalized
277 * @param errorCode Standard ICU error code. Its input value must
278 * pass the U_SUCCESS() test, or else the function returns
279 * immediately. Check for U_FAILURE() on output or use with
280 * function chaining. (See User Guide for details.)
281 * @return first
282 * @stable ICU 4.4
283 */
284 virtual UnicodeString &
285 append(UnicodeString &first,
286 const UnicodeString &second,
287 UErrorCode &errorCode) const = 0;
288
289 /**
290 * Gets the decomposition mapping of c.
291 * Roughly equivalent to normalizing the String form of c
292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
293 * returns FALSE and does not write a string
294 * if c does not have a decomposition mapping in this instance's data.
295 * This function is independent of the mode of the Normalizer2.
296 * @param c code point
297 * @param decomposition String object which will be set to c's
298 * decomposition mapping, if there is one.
299 * @return TRUE if c has a decomposition, otherwise FALSE
300 * @stable ICU 4.6
301 */
302 virtual UBool
303 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
304
305 /**
306 * Gets the raw decomposition mapping of c.
307 *
308 * This is similar to the getDecomposition() method but returns the
309 * raw decomposition mapping as specified in UnicodeData.txt or
310 * (for custom data) in the mapping files processed by the gennorm2 tool.
311 * By contrast, getDecomposition() returns the processed,
312 * recursively-decomposed version of this mapping.
313 *
314 * When used on a standard NFKC Normalizer2 instance,
315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
316 *
317 * When used on a standard NFC Normalizer2 instance,
318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
319 * in this case, the result contains either one or two code points (=1..4 char16_ts).
320 *
321 * This function is independent of the mode of the Normalizer2.
322 * The default implementation returns FALSE.
323 * @param c code point
324 * @param decomposition String object which will be set to c's
325 * raw decomposition mapping, if there is one.
326 * @return TRUE if c has a decomposition, otherwise FALSE
327 * @stable ICU 49
328 */
329 virtual UBool
330 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
331
332 /**
333 * Performs pairwise composition of a & b and returns the composite if there is one.
334 *
335 * Returns a composite code point c only if c has a two-way mapping to a+b.
336 * In standard Unicode normalization, this means that
337 * c has a canonical decomposition to a+b
338 * and c does not have the Full_Composition_Exclusion property.
339 *
340 * This function is independent of the mode of the Normalizer2.
341 * The default implementation returns a negative value.
342 * @param a A (normalization starter) code point.
343 * @param b Another code point.
344 * @return The non-negative composite code point if there is one; otherwise a negative value.
345 * @stable ICU 49
346 */
347 virtual UChar32
348 composePair(UChar32 a, UChar32 b) const;
349
350 /**
351 * Gets the combining class of c.
352 * The default implementation returns 0
353 * but all standard implementations return the Unicode Canonical_Combining_Class value.
354 * @param c code point
355 * @return c's combining class
356 * @stable ICU 49
357 */
358 virtual uint8_t
359 getCombiningClass(UChar32 c) const;
360
361 /**
362 * Tests if the string is normalized.
363 * Internally, in cases where the quickCheck() method would return "maybe"
364 * (which is only possible for the two COMPOSE modes) this method
365 * resolves to "yes" or "no" to provide a definitive result,
366 * at the cost of doing more work in those cases.
367 * @param s input string
368 * @param errorCode Standard ICU error code. Its input value must
369 * pass the U_SUCCESS() test, or else the function returns
370 * immediately. Check for U_FAILURE() on output or use with
371 * function chaining. (See User Guide for details.)
372 * @return TRUE if s is normalized
373 * @stable ICU 4.4
374 */
375 virtual UBool
376 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
377 /**
378 * Tests if the UTF-8 string is normalized.
379 * Internally, in cases where the quickCheck() method would return "maybe"
380 * (which is only possible for the two COMPOSE modes) this method
381 * resolves to "yes" or "no" to provide a definitive result,
382 * at the cost of doing more work in those cases.
383 *
384 * This works for all normalization modes,
385 * but it is currently optimized for UTF-8 only for "compose" modes,
386 * such as for NFC, NFKC, and NFKC_Casefold
387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
388 * For other modes it currently converts to UTF-16 and calls isNormalized().
389 *
390 * @param s UTF-8 input string
391 * @param errorCode Standard ICU error code. Its input value must
392 * pass the U_SUCCESS() test, or else the function returns
393 * immediately. Check for U_FAILURE() on output or use with
394 * function chaining. (See User Guide for details.)
395 * @return TRUE if s is normalized
396 * @stable ICU 60
397 */
398 virtual UBool
399 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
400
401
402 /**
403 * Tests if the string is normalized.
404 * For the two COMPOSE modes, the result could be "maybe" in cases that
405 * would take a little more work to resolve definitively.
406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
407 * combination of quick check + normalization, to avoid
408 * re-checking the "yes" prefix.
409 * @param s input string
410 * @param errorCode Standard ICU error code. Its input value must
411 * pass the U_SUCCESS() test, or else the function returns
412 * immediately. Check for U_FAILURE() on output or use with
413 * function chaining. (See User Guide for details.)
414 * @return UNormalizationCheckResult
415 * @stable ICU 4.4
416 */
417 virtual UNormalizationCheckResult
418 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
419
420 /**
421 * Returns the end of the normalized substring of the input string.
422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
423 * the substring <code>UnicodeString(s, 0, end)</code>
424 * will pass the quick check with a "yes" result.
425 *
426 * The returned end index is usually one or more characters before the
427 * "no" or "maybe" character: The end index is at a normalization boundary.
428 * (See the class documentation for more about normalization boundaries.)
429 *
430 * When the goal is a normalized string and most input strings are expected
431 * to be normalized already, then call this method,
432 * and if it returns a prefix shorter than the input string,
433 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
434 * @param s input string
435 * @param errorCode Standard ICU error code. Its input value must
436 * pass the U_SUCCESS() test, or else the function returns
437 * immediately. Check for U_FAILURE() on output or use with
438 * function chaining. (See User Guide for details.)
439 * @return "yes" span end index
440 * @stable ICU 4.4
441 */
442 virtual int32_t
443 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
444
445 /**
446 * Tests if the character always has a normalization boundary before it,
447 * regardless of context.
448 * If true, then the character does not normalization-interact with
449 * preceding characters.
450 * In other words, a string containing this character can be normalized
451 * by processing portions before this character and starting from this
452 * character independently.
453 * This is used for iterative normalization. See the class documentation for details.
454 * @param c character to test
455 * @return TRUE if c has a normalization boundary before it
456 * @stable ICU 4.4
457 */
458 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
459
460 /**
461 * Tests if the character always has a normalization boundary after it,
462 * regardless of context.
463 * If true, then the character does not normalization-interact with
464 * following characters.
465 * In other words, a string containing this character can be normalized
466 * by processing portions up to this character and after this
467 * character independently.
468 * This is used for iterative normalization. See the class documentation for details.
469 * Note that this operation may be significantly slower than hasBoundaryBefore().
470 * @param c character to test
471 * @return TRUE if c has a normalization boundary after it
472 * @stable ICU 4.4
473 */
474 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
475
476 /**
477 * Tests if the character is normalization-inert.
478 * If true, then the character does not change, nor normalization-interact with
479 * preceding or following characters.
480 * In other words, a string containing this character can be normalized
481 * by processing portions before this character and after this
482 * character independently.
483 * This is used for iterative normalization. See the class documentation for details.
484 * Note that this operation may be significantly slower than hasBoundaryBefore().
485 * @param c character to test
486 * @return TRUE if c is normalization-inert
487 * @stable ICU 4.4
488 */
489 virtual UBool isInert(UChar32 c) const = 0;
490};
491
492/**
493 * Normalization filtered by a UnicodeSet.
494 * Normalizes portions of the text contained in the filter set and leaves
495 * portions not contained in the filter set unchanged.
496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
498 * This class implements all of (and only) the Normalizer2 API.
499 * An instance of this class is unmodifiable/immutable but is constructed and
500 * must be destructed by the owner.
501 * @stable ICU 4.4
502 */
503class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
504public:
505 /**
506 * Constructs a filtered normalizer wrapping any Normalizer2 instance
507 * and a filter set.
508 * Both are aliased and must not be modified or deleted while this object
509 * is used.
510 * The filter set should be frozen; otherwise the performance will suffer greatly.
511 * @param n2 wrapped Normalizer2 instance
512 * @param filterSet UnicodeSet which determines the characters to be normalized
513 * @stable ICU 4.4
514 */
515 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
516 norm2(n2), set(filterSet) {}
517
518 /**
519 * Destructor.
520 * @stable ICU 4.4
521 */
522 ~FilteredNormalizer2();
523
524 /**
525 * Writes the normalized form of the source string to the destination string
526 * (replacing its contents) and returns the destination string.
527 * The source and destination strings must be different objects.
528 * @param src source string
529 * @param dest destination string; its contents is replaced with normalized src
530 * @param errorCode Standard ICU error code. Its input value must
531 * pass the U_SUCCESS() test, or else the function returns
532 * immediately. Check for U_FAILURE() on output or use with
533 * function chaining. (See User Guide for details.)
534 * @return dest
535 * @stable ICU 4.4
536 */
537 virtual UnicodeString &
538 normalize(const UnicodeString &src,
539 UnicodeString &dest,
540 UErrorCode &errorCode) const U_OVERRIDE;
541
542 /**
543 * Normalizes a UTF-8 string and optionally records how source substrings
544 * relate to changed and unchanged result substrings.
545 *
546 * Currently implemented completely only for "compose" modes,
547 * such as for NFC, NFKC, and NFKC_Casefold
548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
549 * Otherwise currently converts to & from UTF-16 and does not support edits.
550 *
551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
552 * @param src Source UTF-8 string.
553 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
554 * sink.Flush() is called at the end.
555 * @param edits Records edits for index mapping, working with styled text,
556 * and getting only changes (if any).
557 * The Edits contents is undefined if any error occurs.
558 * This function calls edits->reset() first unless
559 * options includes U_EDITS_NO_RESET. edits can be nullptr.
560 * @param errorCode Standard ICU error code. Its input value must
561 * pass the U_SUCCESS() test, or else the function returns
562 * immediately. Check for U_FAILURE() on output or use with
563 * function chaining. (See User Guide for details.)
564 * @stable ICU 60
565 */
566 virtual void
567 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
568 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
569
570 /**
571 * Appends the normalized form of the second string to the first string
572 * (merging them at the boundary) and returns the first string.
573 * The result is normalized if the first string was normalized.
574 * The first and second strings must be different objects.
575 * @param first string, should be normalized
576 * @param second string, will be normalized
577 * @param errorCode Standard ICU error code. Its input value must
578 * pass the U_SUCCESS() test, or else the function returns
579 * immediately. Check for U_FAILURE() on output or use with
580 * function chaining. (See User Guide for details.)
581 * @return first
582 * @stable ICU 4.4
583 */
584 virtual UnicodeString &
585 normalizeSecondAndAppend(UnicodeString &first,
586 const UnicodeString &second,
587 UErrorCode &errorCode) const U_OVERRIDE;
588 /**
589 * Appends the second string to the first string
590 * (merging them at the boundary) and returns the first string.
591 * The result is normalized if both the strings were normalized.
592 * The first and second strings must be different objects.
593 * @param first string, should be normalized
594 * @param second string, should be normalized
595 * @param errorCode Standard ICU error code. Its input value must
596 * pass the U_SUCCESS() test, or else the function returns
597 * immediately. Check for U_FAILURE() on output or use with
598 * function chaining. (See User Guide for details.)
599 * @return first
600 * @stable ICU 4.4
601 */
602 virtual UnicodeString &
603 append(UnicodeString &first,
604 const UnicodeString &second,
605 UErrorCode &errorCode) const U_OVERRIDE;
606
607 /**
608 * Gets the decomposition mapping of c.
609 * For details see the base class documentation.
610 *
611 * This function is independent of the mode of the Normalizer2.
612 * @param c code point
613 * @param decomposition String object which will be set to c's
614 * decomposition mapping, if there is one.
615 * @return TRUE if c has a decomposition, otherwise FALSE
616 * @stable ICU 4.6
617 */
618 virtual UBool
619 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
620
621 /**
622 * Gets the raw decomposition mapping of c.
623 * For details see the base class documentation.
624 *
625 * This function is independent of the mode of the Normalizer2.
626 * @param c code point
627 * @param decomposition String object which will be set to c's
628 * raw decomposition mapping, if there is one.
629 * @return TRUE if c has a decomposition, otherwise FALSE
630 * @stable ICU 49
631 */
632 virtual UBool
633 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
634
635 /**
636 * Performs pairwise composition of a & b and returns the composite if there is one.
637 * For details see the base class documentation.
638 *
639 * This function is independent of the mode of the Normalizer2.
640 * @param a A (normalization starter) code point.
641 * @param b Another code point.
642 * @return The non-negative composite code point if there is one; otherwise a negative value.
643 * @stable ICU 49
644 */
645 virtual UChar32
646 composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
647
648 /**
649 * Gets the combining class of c.
650 * The default implementation returns 0
651 * but all standard implementations return the Unicode Canonical_Combining_Class value.
652 * @param c code point
653 * @return c's combining class
654 * @stable ICU 49
655 */
656 virtual uint8_t
657 getCombiningClass(UChar32 c) const U_OVERRIDE;
658
659 /**
660 * Tests if the string is normalized.
661 * For details see the Normalizer2 base class documentation.
662 * @param s input string
663 * @param errorCode Standard ICU error code. Its input value must
664 * pass the U_SUCCESS() test, or else the function returns
665 * immediately. Check for U_FAILURE() on output or use with
666 * function chaining. (See User Guide for details.)
667 * @return TRUE if s is normalized
668 * @stable ICU 4.4
669 */
670 virtual UBool
671 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
672 /**
673 * Tests if the UTF-8 string is normalized.
674 * Internally, in cases where the quickCheck() method would return "maybe"
675 * (which is only possible for the two COMPOSE modes) this method
676 * resolves to "yes" or "no" to provide a definitive result,
677 * at the cost of doing more work in those cases.
678 *
679 * This works for all normalization modes,
680 * but it is currently optimized for UTF-8 only for "compose" modes,
681 * such as for NFC, NFKC, and NFKC_Casefold
682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
683 * For other modes it currently converts to UTF-16 and calls isNormalized().
684 *
685 * @param s UTF-8 input string
686 * @param errorCode Standard ICU error code. Its input value must
687 * pass the U_SUCCESS() test, or else the function returns
688 * immediately. Check for U_FAILURE() on output or use with
689 * function chaining. (See User Guide for details.)
690 * @return TRUE if s is normalized
691 * @stable ICU 60
692 */
693 virtual UBool
694 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
695 /**
696 * Tests if the string is normalized.
697 * For details see the Normalizer2 base class documentation.
698 * @param s input string
699 * @param errorCode Standard ICU error code. Its input value must
700 * pass the U_SUCCESS() test, or else the function returns
701 * immediately. Check for U_FAILURE() on output or use with
702 * function chaining. (See User Guide for details.)
703 * @return UNormalizationCheckResult
704 * @stable ICU 4.4
705 */
706 virtual UNormalizationCheckResult
707 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
708 /**
709 * Returns the end of the normalized substring of the input string.
710 * For details see the Normalizer2 base class documentation.
711 * @param s input string
712 * @param errorCode Standard ICU error code. Its input value must
713 * pass the U_SUCCESS() test, or else the function returns
714 * immediately. Check for U_FAILURE() on output or use with
715 * function chaining. (See User Guide for details.)
716 * @return "yes" span end index
717 * @stable ICU 4.4
718 */
719 virtual int32_t
720 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
721
722 /**
723 * Tests if the character always has a normalization boundary before it,
724 * regardless of context.
725 * For details see the Normalizer2 base class documentation.
726 * @param c character to test
727 * @return TRUE if c has a normalization boundary before it
728 * @stable ICU 4.4
729 */
730 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
731
732 /**
733 * Tests if the character always has a normalization boundary after it,
734 * regardless of context.
735 * For details see the Normalizer2 base class documentation.
736 * @param c character to test
737 * @return TRUE if c has a normalization boundary after it
738 * @stable ICU 4.4
739 */
740 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
741
742 /**
743 * Tests if the character is normalization-inert.
744 * For details see the Normalizer2 base class documentation.
745 * @param c character to test
746 * @return TRUE if c is normalization-inert
747 * @stable ICU 4.4
748 */
749 virtual UBool isInert(UChar32 c) const U_OVERRIDE;
750private:
751 UnicodeString &
752 normalize(const UnicodeString &src,
753 UnicodeString &dest,
754 USetSpanCondition spanCondition,
755 UErrorCode &errorCode) const;
756
757 void
758 normalizeUTF8(uint32_t options, const char *src, int32_t length,
759 ByteSink &sink, Edits *edits,
760 USetSpanCondition spanCondition,
761 UErrorCode &errorCode) const;
762
763 UnicodeString &
764 normalizeSecondAndAppend(UnicodeString &first,
765 const UnicodeString &second,
766 UBool doNormalize,
767 UErrorCode &errorCode) const;
768
769 const Normalizer2 &norm2;
770 const UnicodeSet &set;
771};
772
773U_NAMESPACE_END
774
775#endif // !UCONFIG_NO_NORMALIZATION
776
777#endif /* U_SHOW_CPLUSPLUS_API */
778
779#endif // __NORMALIZER2_H__