blob: b2c872107c5d2910a190d677d2e02672cb8e45b1 [file] [log] [blame]
Victor Changf1a8c982020-11-20 18:16:37 +00001// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2015, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: unorm2.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009dec15
16* created by: Markus W. Scherer
17*/
18
19#ifndef __UNORM2_H__
20#define __UNORM2_H__
21
22/**
23 * \file
24 * \brief C API: New API for Unicode Normalization.
25 *
26 * Unicode normalization functionality for standard Unicode normalization or
27 * for using custom mapping tables.
28 * All instances of UNormalizer2 are unmodifiable/immutable.
29 * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
30 * For more details see the Normalizer2 C++ class.
31 */
32
33#include "unicode/utypes.h"
34#include "unicode/stringoptions.h"
35
36#if U_SHOW_CPLUSPLUS_API
37#include "unicode/localpointer.h"
38#endif // U_SHOW_CPLUSPLUS_API
39
40/**
41 * Constants for normalization modes, as passed to unorm2_getInstance().
42 * For details about standard Unicode normalization forms
43 * and about the algorithms which are also used with custom mapping tables
44 * see http://www.unicode.org/reports/tr15/
45 * @stable ICU 4.4
46 */
47typedef enum {
48 /**
49 * Decomposition followed by composition.
50 * Same as standard NFC when using an "nfc" instance.
51 * Same as standard NFKC when using an "nfkc" instance.
52 * For details about standard Unicode normalization forms
53 * see http://www.unicode.org/reports/tr15/
54 * @stable ICU 4.4
55 */
56 UNORM2_COMPOSE,
57 /**
58 * Map (decompose), and reorder canonically.
59 * Same as standard NFD when using an "nfc" instance.
60 * Same as standard NFKD when using an "nfkc" instance.
61 * For details about standard Unicode normalization forms
62 * see http://www.unicode.org/reports/tr15/
63 * @stable ICU 4.4
64 */
65 UNORM2_DECOMPOSE,
66 /**
67 * "Fast C or D" form.
68 * If a string is in this form, then further decomposition <i>without reordering</i>
69 * would yield the same form as UNORM2_DECOMPOSE.
70 * Text in "Fast C or D" form can be processed efficiently with data tables
71 * that are "canonically closed", that is, that provide equivalent data for
72 * equivalent text, without having to be fully normalized.
73 * Not a standard Unicode normalization form.
74 * Not a unique form: Different FCD strings can be canonically equivalent.
75 * For details see http://www.unicode.org/notes/tn5/#FCD
76 * @stable ICU 4.4
77 */
78 UNORM2_FCD,
79 /**
80 * Compose only contiguously.
81 * Also known as "FCC" or "Fast C Contiguous".
82 * The result will often but not always be in NFC.
83 * The result will conform to FCD which is useful for processing.
84 * Not a standard Unicode normalization form.
85 * For details see http://www.unicode.org/notes/tn5/#FCC
86 * @stable ICU 4.4
87 */
88 UNORM2_COMPOSE_CONTIGUOUS
89} UNormalization2Mode;
90
91/**
92 * Result values for normalization quick check functions such as unorm2_quickCheck().
93 * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
94 * @stable ICU 2.0
95 */
96typedef enum UNormalizationCheckResult {
97 /**
98 * The input string is not in the normalization form.
99 * @stable ICU 2.0
100 */
101 UNORM_NO,
102 /**
103 * The input string is in the normalization form.
104 * @stable ICU 2.0
105 */
106 UNORM_YES,
107 /**
108 * The input string may or may not be in the normalization form.
109 * This value is only returned for composition forms like NFC and FCC
110 * (the UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS modes),
111 * when a backward-combining character is found for which the surrounding text
112 * would have to be analyzed further.
113 * @stable ICU 2.0
114 */
115 UNORM_MAYBE
116} UNormalizationCheckResult;
116
117/**
118 * Opaque C service object type for the new normalization API.
 * Only forward-declared here; the functions in this header take it by pointer.
119 * @stable ICU 4.4
120 */
121struct UNormalizer2;
122typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */
123
124#if !UCONFIG_NO_NORMALIZATION
125
126#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
127
128/**
129 * Returns a UNormalizer2 instance for Unicode NFC normalization.
130 * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode).
131 * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
132 * @param pErrorCode Standard ICU error code. Its input value must
133 * pass the U_SUCCESS() test, or else the function returns
134 * immediately. Check for U_FAILURE() on output or use with
135 * function chaining. (See User Guide for details.)
136 * @return the requested UNormalizer2, if successful
137 * @stable ICU 49
138 */
139U_STABLE const UNormalizer2 * U_EXPORT2
140unorm2_getNFCInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
141
142#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
143
144#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
145
146/**
147 * Returns a UNormalizer2 instance for Unicode NFD normalization.
148 * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode).
149 * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
150 * @param pErrorCode Standard ICU error code. Its input value must
151 * pass the U_SUCCESS() test, or else the function returns
152 * immediately. Check for U_FAILURE() on output or use with
153 * function chaining. (See User Guide for details.)
154 * @return the requested UNormalizer2, if successful
155 * @stable ICU 49
156 */
157U_STABLE const UNormalizer2 * U_EXPORT2
158unorm2_getNFDInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
159
160#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
161
162#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
163
164/**
165 * Returns a UNormalizer2 instance for Unicode NFKC normalization.
166 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode).
167 * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
168 * @param pErrorCode Standard ICU error code. Its input value must
169 * pass the U_SUCCESS() test, or else the function returns
170 * immediately. Check for U_FAILURE() on output or use with
171 * function chaining. (See User Guide for details.)
172 * @return the requested UNormalizer2, if successful
173 * @stable ICU 49
174 */
175U_STABLE const UNormalizer2 * U_EXPORT2
176unorm2_getNFKCInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
177
178#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
179
180#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
181
182/**
183 * Returns a UNormalizer2 instance for Unicode NFKD normalization.
184 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode).
185 * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
186 * @param pErrorCode Standard ICU error code. Its input value must
187 * pass the U_SUCCESS() test, or else the function returns
188 * immediately. Check for U_FAILURE() on output or use with
189 * function chaining. (See User Guide for details.)
190 * @return the requested UNormalizer2, if successful
191 * @stable ICU 49
192 */
193U_STABLE const UNormalizer2 * U_EXPORT2
194unorm2_getNFKDInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
195
196#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
197
198#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
199
200/**
201 * Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization.
202 * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
203 * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
204 * @param pErrorCode Standard ICU error code. Its input value must
205 * pass the U_SUCCESS() test, or else the function returns
206 * immediately. Check for U_FAILURE() on output or use with
207 * function chaining. (See User Guide for details.)
208 * @return the requested UNormalizer2, if successful
209 * @stable ICU 49
210 */
211U_STABLE const UNormalizer2 * U_EXPORT2
212unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
213
214#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
215
216
217
218
219
220#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
221
222/**
223 * Closes a UNormalizer2 instance from unorm2_openFiltered().
224 * Do not close instances from unorm2_getInstance() or from the singleton
 * getters such as unorm2_getNFCInstance()!
225 * @param norm2 UNormalizer2 instance to be closed
226 * @stable ICU 4.4
227 */
228U_STABLE void U_EXPORT2
229unorm2_close(UNormalizer2 *norm2) __INTRODUCED_IN(31);
230
231#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
232
233#if U_SHOW_CPLUSPLUS_API
234
235U_NAMESPACE_BEGIN
236
237/**
238 * \class LocalUNormalizer2Pointer
239 * "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
240 * For most methods see the LocalPointerBase base class.
241 * C++ only: declared when U_SHOW_CPLUSPLUS_API is enabled.
242 * @see LocalPointerBase
243 * @see LocalPointer
244 * @stable ICU 4.4
245 */
246U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);
247
248U_NAMESPACE_END
249
250#endif // U_SHOW_CPLUSPLUS_API
251
252#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
253
254/**
255 * Writes the normalized form of the source string to the destination string
256 * (replacing its contents) and returns the length of the destination string.
257 * The source and destination strings must be different buffers.
258 * @param norm2 UNormalizer2 instance
259 * @param src source string
260 * @param length length of the source string, or -1 if NUL-terminated
261 * @param dest destination string; its contents are replaced with normalized src
262 * @param capacity number of UChars that can be written to dest
263 * @param pErrorCode Standard ICU error code. Its input value must
264 * pass the U_SUCCESS() test, or else the function returns
265 * immediately. Check for U_FAILURE() on output or use with
266 * function chaining. (See User Guide for details.)
267 * @return the length of the destination string (dest)
268 * @stable ICU 4.4
269 */
270U_STABLE int32_t U_EXPORT2
271unorm2_normalize(const UNormalizer2 *norm2,
272 const UChar *src, int32_t length,
273 UChar *dest, int32_t capacity,
274 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
275
276#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
277#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
278
279/**
280 * Appends the normalized form of the second string to the first string
281 * (merging them at the boundary) and returns the length of the first string.
282 * The result is normalized if the first string was normalized.
283 * The first and second strings must be different buffers.
284 * @param norm2 UNormalizer2 instance
285 * @param first string, should be normalized
286 * @param firstLength length of the first string, or -1 if NUL-terminated
287 * @param firstCapacity number of UChars that can be written to first
288 * @param second string, will be normalized
289 * @param secondLength length of the second string, or -1 if NUL-terminated
290 * @param pErrorCode Standard ICU error code. Its input value must
291 * pass the U_SUCCESS() test, or else the function returns
292 * immediately. Check for U_FAILURE() on output or use with
293 * function chaining. (See User Guide for details.)
294 * @return the length of the first string
295 * @stable ICU 4.4
296 */
297U_STABLE int32_t U_EXPORT2
298unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
299 UChar *first, int32_t firstLength, int32_t firstCapacity,
300 const UChar *second, int32_t secondLength,
301 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
302
303#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
304#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
305
306/**
307 * Appends the second string to the first string
308 * (merging them at the boundary) and returns the length of the first string.
309 * The result is normalized if both the strings were normalized.
310 * The first and second strings must be different buffers.
311 * @param norm2 UNormalizer2 instance
312 * @param first string, should be normalized
313 * @param firstLength length of the first string, or -1 if NUL-terminated
314 * @param firstCapacity number of UChars that can be written to first
315 * @param second string, should be normalized
316 * @param secondLength length of the second string, or -1 if NUL-terminated
317 * @param pErrorCode Standard ICU error code. Its input value must
318 * pass the U_SUCCESS() test, or else the function returns
319 * immediately. Check for U_FAILURE() on output or use with
320 * function chaining. (See User Guide for details.)
321 * @return the length of the first string
322 * @stable ICU 4.4
323 */
324U_STABLE int32_t U_EXPORT2
325unorm2_append(const UNormalizer2 *norm2,
326 UChar *first, int32_t firstLength, int32_t firstCapacity,
327 const UChar *second, int32_t secondLength,
328 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
329
330#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
331
332#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
333
334/**
335 * Gets the decomposition mapping of c.
336 * Roughly equivalent to normalizing the string form of c
337 * with a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster. Unlike normalizing, this function
338 * returns a negative value and does not write a string
339 * if c does not have a decomposition mapping in this instance's data.
340 * This function is independent of the mode of the UNormalizer2.
341 * @param norm2 UNormalizer2 instance
342 * @param c code point
343 * @param decomposition String buffer which will be set to c's
344 * decomposition mapping, if there is one.
345 * @param capacity number of UChars that can be written to decomposition
346 * @param pErrorCode Standard ICU error code. Its input value must
347 * pass the U_SUCCESS() test, or else the function returns
348 * immediately. Check for U_FAILURE() on output or use with
349 * function chaining. (See User Guide for details.)
350 * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
351 * @stable ICU 4.6
352 */
353U_STABLE int32_t U_EXPORT2
354unorm2_getDecomposition(const UNormalizer2 *norm2,
355 UChar32 c, UChar *decomposition, int32_t capacity,
356 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
357
358#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
359
360#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
361
362/**
363 * Gets the raw decomposition mapping of c.
364 *
365 * This is similar to the unorm2_getDecomposition() function but returns the
366 * raw decomposition mapping as specified in UnicodeData.txt or
367 * (for custom data) in the mapping files processed by the gennorm2 tool.
368 * By contrast, unorm2_getDecomposition() returns the processed,
369 * recursively-decomposed version of this mapping.
370 *
371 * When used on a standard NFKC UNormalizer2 instance,
372 * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
373 *
374 * When used on a standard NFC UNormalizer2 instance,
375 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
376 * in this case, the result contains either one or two code points (=1..4 UChars).
377 *
378 * This function is independent of the mode of the UNormalizer2.
379 * @param norm2 UNormalizer2 instance
380 * @param c code point
381 * @param decomposition String buffer which will be set to c's
382 * raw decomposition mapping, if there is one.
383 * @param capacity number of UChars that can be written to decomposition
384 * @param pErrorCode Standard ICU error code. Its input value must
385 * pass the U_SUCCESS() test, or else the function returns
386 * immediately. Check for U_FAILURE() on output or use with
387 * function chaining. (See User Guide for details.)
388 * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value
389 * @stable ICU 49
390 */
391U_STABLE int32_t U_EXPORT2
392unorm2_getRawDecomposition(const UNormalizer2 *norm2,
393 UChar32 c, UChar *decomposition, int32_t capacity,
394 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
395
396#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
397
398#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
399
400/**
401 * Performs pairwise composition of a and b and returns the composite if there is one.
402 *
403 * Returns a composite code point c only if c has a two-way mapping to a+b.
404 * In standard Unicode normalization, this means that
405 * c has a canonical decomposition to a+b
406 * and c does not have the Full_Composition_Exclusion property.
407 *
408 * This function is independent of the mode of the UNormalizer2.
409 * @param norm2 UNormalizer2 instance
410 * @param a A (normalization starter) code point.
411 * @param b Another code point, to be composed with a.
412 * @return The non-negative composite code point if there is one; otherwise a negative value.
413 * @stable ICU 49
414 */
415U_STABLE UChar32 U_EXPORT2
416unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) __INTRODUCED_IN(31);
417
418#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
419
420#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
421
422/**
423 * Gets the combining class of c.
424 * The default implementation returns 0
425 * but all standard implementations return the Unicode Canonical_Combining_Class value.
426 * @param norm2 UNormalizer2 instance
427 * @param c code point
428 * @return c's combining class (the Canonical_Combining_Class value for standard instances)
429 * @stable ICU 49
430 */
431U_STABLE uint8_t U_EXPORT2
432unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
433
434#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
435
436#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
437
438/**
439 * Tests if the string is normalized.
440 * Internally, in cases where unorm2_quickCheck() would return UNORM_MAYBE
441 * (which is only possible for the two COMPOSE modes) this function
442 * resolves to "yes" or "no" to provide a definitive result,
443 * at the cost of doing more work in those cases.
444 * @param norm2 UNormalizer2 instance
445 * @param s input string
446 * @param length length of the string, or -1 if NUL-terminated
447 * @param pErrorCode Standard ICU error code. Its input value must
448 * pass the U_SUCCESS() test, or else the function returns
449 * immediately. Check for U_FAILURE() on output or use with
450 * function chaining. (See User Guide for details.)
451 * @return TRUE if s is normalized
452 * @stable ICU 4.4
453 */
454U_STABLE UBool U_EXPORT2
455unorm2_isNormalized(const UNormalizer2 *norm2,
456 const UChar *s, int32_t length,
457 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
458
459#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
460
461#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
462
463/**
464 * Quickly tests if the string is normalized.
465 * For the two COMPOSE modes, the result could be "maybe" (UNORM_MAYBE) in cases that
466 * would take a little more work to resolve definitively.
467 * Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster
468 * combination of quick check + normalization, to avoid
469 * re-checking the "yes" prefix.
470 * @param norm2 UNormalizer2 instance
471 * @param s input string
472 * @param length length of the string, or -1 if NUL-terminated
473 * @param pErrorCode Standard ICU error code. Its input value must
474 * pass the U_SUCCESS() test, or else the function returns
475 * immediately. Check for U_FAILURE() on output or use with
476 * function chaining. (See User Guide for details.)
477 * @return a UNormalizationCheckResult: UNORM_YES, UNORM_NO, or UNORM_MAYBE
478 * @stable ICU 4.4
479 */
480U_STABLE UNormalizationCheckResult U_EXPORT2
481unorm2_quickCheck(const UNormalizer2 *norm2,
482 const UChar *s, int32_t length,
483 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
484
485#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
486
487#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
488
489/**
490 * Returns the end of the normalized substring of the input string.
491 * In other words, with <code>end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);</code>
492 * the substring of <code>s</code> from index 0 up to (but not including) <code>end</code>
493 * will pass the quick check with a "yes" result.
494 *
495 * The returned end index is usually one or more characters before the
496 * "no" or "maybe" character: The end index is at a normalization boundary.
497 * (See the Normalizer2 C++ class documentation for more about normalization boundaries.)
498 *
499 * When the goal is a normalized string and most input strings are expected
500 * to be normalized already, then call this function,
501 * and if it returns a prefix shorter than the input string,
502 * copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.
503 * @param norm2 UNormalizer2 instance
504 * @param s input string
505 * @param length length of the string, or -1 if NUL-terminated
506 * @param pErrorCode Standard ICU error code. Its input value must
507 * pass the U_SUCCESS() test, or else the function returns
508 * immediately. Check for U_FAILURE() on output or use with
509 * function chaining. (See User Guide for details.)
510 * @return "yes" span end index
511 * @stable ICU 4.4
512 */
513U_STABLE int32_t U_EXPORT2
514unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
515 const UChar *s, int32_t length,
516 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
517
518#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
519
520#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
521
522/**
523 * Tests if the character always has a normalization boundary before it,
524 * regardless of context.
525 * For details see the Normalizer2 C++ base class documentation.
526 * @param norm2 UNormalizer2 instance
527 * @param c code point to test
528 * @return TRUE if c has a normalization boundary before it
529 * @stable ICU 4.4
530 */
531U_STABLE UBool U_EXPORT2
532unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
533
534#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
535
536#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
537
538/**
539 * Tests if the character always has a normalization boundary after it,
540 * regardless of context.
541 * For details see the Normalizer2 C++ base class documentation.
542 * @param norm2 UNormalizer2 instance
543 * @param c code point to test
544 * @return TRUE if c has a normalization boundary after it
545 * @stable ICU 4.4
546 */
547U_STABLE UBool U_EXPORT2
548unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
549
550#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
551
552#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
553
554/**
555 * Tests if the character is normalization-inert.
556 * For details see the Normalizer2 C++ base class documentation.
557 * @param norm2 UNormalizer2 instance
558 * @param c code point to test
559 * @return TRUE if c is normalization-inert
560 * @stable ICU 4.4
561 */
562U_STABLE UBool U_EXPORT2
563unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
564
565#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
566
567
568
569#endif /* !UCONFIG_NO_NORMALIZATION */
570#endif /* __UNORM2_H__ */