blob: 3f45829ed51d531f29e8ed4831d9912cf6b5f1c5 [file] [log] [blame]
Victor Changf1a8c982020-11-20 18:16:37 +00001// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (C) 1996-2015, International Business Machines Corporation and others.
6* All Rights Reserved.
7******************************************************************************
8*/
9
10#ifndef UBRK_H
11#define UBRK_H
12
13#include "unicode/utypes.h"
14#include "unicode/uloc.h"
15#include "unicode/utext.h"
16
17#if U_SHOW_CPLUSPLUS_API
18#include "unicode/localpointer.h"
19#endif // U_SHOW_CPLUSPLUS_API
20
21/**
22 * A text-break iterator.
23 * For usage in C programs.
 * The typedef below is declared ahead of the UCONFIG_NO_BREAK_ITERATION check,
 * so the type name stays visible even when break iteration is configured out.
24 */
25#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
26# define UBRK_TYPEDEF_UBREAK_ITERATOR
27 /**
28 * Opaque type representing an ICU Break iterator object.
 * The typedef is skipped when UBRK_TYPEDEF_UBREAK_ITERATOR is already defined.
29 * @stable ICU 2.0
30 */
31 typedef struct UBreakIterator UBreakIterator;
32#endif /* UBRK_TYPEDEF_UBREAK_ITERATOR */
33
34#if !UCONFIG_NO_BREAK_ITERATION
35
36
37/**
38 * \file
39 * \brief C API: BreakIterator
40 *
41 * <h2> BreakIterator C API </h2>
42 *
43 * The BreakIterator C API defines methods for finding the location
44 * of boundaries in text. A UBreakIterator maintains a
45 * current position and scans over text, returning the index of characters
46 * where boundaries occur.
47 * <p>
48 * Line boundary analysis determines where a text string can be broken
49 * when line-wrapping. The mechanism correctly handles punctuation and
50 * hyphenated words.
51 * <p>
52 * Note: The locale keyword "lb" can be used to modify line break
53 * behavior according to the CSS level 3 line-break options, see
54 * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
55 * "ja@lb=strict", "zh@lb=loose".
56 * <p>
57 * Sentence boundary analysis allows selection with correct
58 * interpretation of periods within numbers and abbreviations, and
59 * trailing punctuation marks such as quotation marks and parentheses.
60 * <p>
61 * Note: The locale keyword "ss" can be used to enable use of
62 * segmentation suppression data (preventing breaks in English after
63 * abbreviations such as "Mr." or "Est.", for example), as follows:
64 * "en@ss=standard".
65 * <p>
66 * Word boundary analysis is used by search and replace functions, as
67 * well as within text editing applications that allow the user to
68 * select words with a double click. Word selection provides correct
69 * interpretation of punctuation marks within and following
70 * words. Characters that are not part of a word, such as symbols or
71 * punctuation marks, have word-breaks on both sides.
72 * <p>
73 * Character boundary analysis identifies the boundaries of
74 * "Extended Grapheme Clusters", which are groupings of codepoints
75 * that should be treated as character-like units for many text operations.
76 * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
77 * http://www.unicode.org/reports/tr29/ for additional information
78 * on grapheme clusters and guidelines on their use.
79 * <p>
80 * Title boundary analysis locates all positions,
81 * typically starts of words, that should be set to Title Case
82 * when title casing the text.
83 * <p>
84 * The text boundary positions are found according to the rules
85 * described in Unicode Standard Annex #29, Text Boundaries, and
86 * Unicode Standard Annex #14, Line Breaking Properties. These
87 * are available at http://www.unicode.org/reports/tr29/ and
88 * http://www.unicode.org/reports/tr14/, respectively.
89 * <p>
90 * In addition to the plain C API defined in this header file, an
91 * object oriented C++ API with equivalent functionality is defined in the
92 * file brkiter.h.
93 * <p>
94 * Code snippets illustrating the use of the Break Iterator APIs
95 * are available in the ICU User Guide,
96 * http://icu-project.org/userguide/boundaryAnalysis.html
97 * and in the sample program icu/source/samples/break/break.cpp
98 */
99
100/** The possible types of text boundaries; passed as the type argument of ubrk_open(). @stable ICU 2.0 */
101typedef enum UBreakIteratorType {
102 /** Character breaks @stable ICU 2.0 */
103 UBRK_CHARACTER = 0,
104 /** Word breaks @stable ICU 2.0 */
105 UBRK_WORD = 1,
106 /** Line breaks @stable ICU 2.0 */
107 UBRK_LINE = 2,
108 /** Sentence breaks @stable ICU 2.0 */
109 UBRK_SENTENCE = 3,
110
111#ifndef U_HIDE_DEPRECATED_API
112 /**
113 * Title Case breaks.
114 * The iterator created using this type locates title boundaries as described for
115 * Unicode 3.2 only. For title boundary iteration under Unicode 4.0 and above,
116 * please use the word break iterator (UBRK_WORD).
117 *
118 * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
119 */
120 UBRK_TITLE = 4,
121 /**
122 * One more than the highest normal UBreakIteratorType value.
123 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
124 */
125 UBRK_COUNT = 5
126#endif // U_HIDE_DEPRECATED_API
127} UBreakIteratorType;
128
129/** Value indicating all text boundaries have been returned.
 * Documented as a possible return value of ubrk_next(), ubrk_previous(),
 * ubrk_preceding() and ubrk_following().
130 * @stable ICU 2.0
131 */
132#define UBRK_DONE ((int32_t) -1)
133
134
135/**
136 * Enum constants for the word break tags returned by
137 * ubrk_getRuleStatus(). A range of values is defined for each category of
138 * word, to allow for further subdivisions of a category in future releases.
139 * Applications should check for tag values falling within the range, rather
140 * than for single individual values.
141 *
 * Each ..._LIMIT constant has the same value as the lower bound of the next
 * category, so a range is tested as: lower bound <= tag && tag < limit.
 *
142 * The numeric values of all of these constants are stable (will not change).
143 *
144 * @stable ICU 2.2
145*/
146typedef enum UWordBreak {
147 /** Tag value for "words" that do not fit into any of the other categories.
148 * Includes spaces and most punctuation. */
149 UBRK_WORD_NONE = 0,
150 /** Upper bound for tags for uncategorized words. */
151 UBRK_WORD_NONE_LIMIT = 100,
152 /** Tag value for words that appear to be numbers, lower limit. */
153 UBRK_WORD_NUMBER = 100,
154 /** Tag value for words that appear to be numbers, upper limit. */
155 UBRK_WORD_NUMBER_LIMIT = 200,
156 /** Tag value for words that contain letters, excluding
157 * hiragana, katakana or ideographic characters, lower limit. */
158 UBRK_WORD_LETTER = 200,
159 /** Tag value for words containing letters, upper limit. */
160 UBRK_WORD_LETTER_LIMIT = 300,
161 /** Tag value for words containing kana characters, lower limit. */
162 UBRK_WORD_KANA = 300,
163 /** Tag value for words containing kana characters, upper limit. */
164 UBRK_WORD_KANA_LIMIT = 400,
165 /** Tag value for words containing ideographic characters, lower limit. */
166 UBRK_WORD_IDEO = 400,
167 /** Tag value for words containing ideographic characters, upper limit. */
168 UBRK_WORD_IDEO_LIMIT = 500
169} UWordBreak;
170
171/**
172 * Enum constants for the line break tags returned by ubrk_getRuleStatus().
173 * A range of values is defined for each category of
174 * line break, to allow for further subdivisions of a category in future releases.
175 * Applications should check for tag values falling within the range, rather
176 * than for single individual values.
177 *
178 * The numeric values of all of these constants are stable (will not change).
179 *
180 * @stable ICU 2.8
181*/
182typedef enum ULineBreakTag {
183 /** Tag value for soft line breaks, positions at which a line break
184 * is acceptable but not required. */
185 UBRK_LINE_SOFT = 0,
186 /** Upper bound for soft line breaks. */
187 UBRK_LINE_SOFT_LIMIT = 100,
188 /** Tag value for a hard, or mandatory, line break. */
189 UBRK_LINE_HARD = 100,
190 /** Upper bound for hard line breaks. */
191 UBRK_LINE_HARD_LIMIT = 200
192} ULineBreakTag;
193
194
195
196/**
197 * Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
198 * A range of values is defined for each category of
199 * sentence, to allow for further subdivisions of a category in future releases.
200 * Applications should check for tag values falling within the range, rather
201 * than for single individual values.
202 *
203 * The numeric values of all of these constants are stable (will not change).
204 *
205 * @stable ICU 2.8
206*/
207typedef enum USentenceBreakTag {
208 /** Tag value for sentences ending with a sentence terminator
209 * ('.', '?', '!', etc.) character, possibly followed by a
210 * hard separator (CR, LF, PS, etc.)
211 */
212 UBRK_SENTENCE_TERM = 0,
213 /** Upper bound for tags for sentences ended by sentence terminators. */
214 UBRK_SENTENCE_TERM_LIMIT = 100,
215 /** Tag value for sentences that do not contain an ending
216 * sentence terminator ('.', '?', '!', etc.) character, but
217 * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
218 */
219 UBRK_SENTENCE_SEP = 100,
220 /** Upper bound for tags for sentences ended by a separator. */
221 UBRK_SENTENCE_SEP_LIMIT = 200
222
223} USentenceBreakTag;
224
225
226#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
227
228/**
229 * Open a new UBreakIterator for locating text boundaries for a specified locale.
230 * A UBreakIterator may be used for detecting character, line, word,
231 * and sentence breaks in text.
232 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
233 * UBRK_LINE, or UBRK_SENTENCE.
234 * @param locale The locale specifying the text-breaking conventions. Note that
235 * locale keys such as "lb" and "ss" may be used to modify text break behavior,
236 * see the general discussion of the BreakIterator C API at the top of this file.
237 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is
238 * used to specify the text to be iterated.
239 * @param textLength The number of characters in text, or -1 if null-terminated.
240 * @param status A UErrorCode to receive any errors.
241 * @return A UBreakIterator for the specified locale. Close it with ubrk_close()
 * when it is no longer needed.
242 * @see ubrk_openRules
 * @see ubrk_close
243 * @stable ICU 2.0
244 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000245U_CAPI UBreakIterator* U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000246ubrk_open(UBreakIteratorType type,
247 const char *locale,
248 const UChar *text,
249 int32_t textLength,
250 UErrorCode *status) __INTRODUCED_IN(31);
251
252#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
253
254
255
256
257
Victor Chang16f37a62021-02-01 22:20:48 +0000258#ifndef U_HIDE_DEPRECATED_API
Victor Changf1a8c982020-11-20 18:16:37 +0000259
260
Victor Chang16f37a62021-02-01 22:20:48 +0000261
262#endif /* U_HIDE_DEPRECATED_API */
263
Victor Chang16f37a62021-02-01 22:20:48 +0000264
265
266
Victor Chang16f37a62021-02-01 22:20:48 +0000267
Victor Changf1a8c982020-11-20 18:16:37 +0000268#ifndef U_HIDE_DEPRECATED_API
269
270/**
271 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_saveClone().
272 * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
273 */
274#define U_BRK_SAFECLONE_BUFFERSIZE 1
275
276#endif /* U_HIDE_DEPRECATED_API */
277
278#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
279
280/**
281 * Close a UBreakIterator.
282 * Once closed, a UBreakIterator may no longer be used.
283 * @param bi The break iterator to close.
284 * @stable ICU 2.0
285 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000286U_CAPI void U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000287ubrk_close(UBreakIterator *bi) __INTRODUCED_IN(31);
288
289#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
290
291#if U_SHOW_CPLUSPLUS_API
292
293U_NAMESPACE_BEGIN
294
295/**
296 * \class LocalUBreakIteratorPointer
297 * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
298 * For most methods see the LocalPointerBase base class.
299 *
 * NOTE(review): ubrk_close() is declared only when
 * !defined(__ANDROID__) || __ANDROID_API__ >= 31, while this block is not under
 * that guard. Confirm U_SHOW_CPLUSPLUS_API is never enabled for older Android
 * API levels.
 *
300 * @see LocalPointerBase
301 * @see LocalPointer
302 * @stable ICU 4.4
303 */
304U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
305
306U_NAMESPACE_END
307
308#endif // U_SHOW_CPLUSPLUS_API
309
310#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
311
312/**
313 * Sets an existing iterator to point to a new piece of text.
314 * The break iterator retains a pointer to the supplied text.
315 * The caller must not modify or delete the text while the UBreakIterator
316 * retains the reference.
317 *
318 * @param bi The iterator to use.
319 * @param text The text to be set. It is referenced, not copied (see above).
320 * @param textLength The length of the text.
321 * @param status The error code.
 * @see ubrk_setUText
322 * @stable ICU 2.0
323 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000324U_CAPI void U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000325ubrk_setText(UBreakIterator* bi,
326 const UChar* text,
327 int32_t textLength,
328 UErrorCode* status) __INTRODUCED_IN(31);
329
330#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
331
332
333#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
334
335/**
336 * Sets an existing iterator to point to a new piece of text.
337 *
338 * All index positions returned by break iterator functions are
339 * native indices from the UText. For example, when breaking UTF-8
340 * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
341 * will be UTF-8 string indices, not UTF-16 positions.
342 *
343 * @param bi The iterator to use.
344 * @param text The text to be set.
345 * This function makes a shallow clone of the supplied UText. This means
346 * that the caller is free to immediately close or otherwise reuse the
347 * UText that was passed as a parameter, but that the underlying text itself
348 * must not be altered while being referenced by the break iterator.
349 * @param status The error code.
 * @see ubrk_setText
350 * @stable ICU 3.4
351 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000352U_CAPI void U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000353ubrk_setUText(UBreakIterator* bi,
354 UText* text,
355 UErrorCode* status) __INTRODUCED_IN(31);
356
357#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
358
359
360
361#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
362
363/**
364 * Determine the most recently-returned text boundary.
365 *
366 * @param bi The break iterator to use (taken as a pointer to const).
367 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
368 * \ref ubrk_first, or \ref ubrk_last.
369 * @stable ICU 2.0
370 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000371U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000372ubrk_current(const UBreakIterator *bi) __INTRODUCED_IN(31);
373
374#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
375
376#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
377
378/**
379 * Advance the iterator to the boundary following the current boundary.
380 *
381 * @param bi The break iterator to use.
382 * @return The character index of the next text boundary, or UBRK_DONE
383 * if all text boundaries have been returned.
 * When the text was set with ubrk_setUText(), the index is a native index of that UText.
384 * @see ubrk_previous
385 * @stable ICU 2.0
386 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000387U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000388ubrk_next(UBreakIterator *bi) __INTRODUCED_IN(31);
389
390#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
391
392#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
393
394/**
395 * Set the iterator position to the boundary preceding the current boundary.
396 *
397 * @param bi The break iterator to use.
398 * @return The character index of the preceding text boundary, or UBRK_DONE
399 * if all text boundaries have been returned.
 * When the text was set with ubrk_setUText(), the index is a native index of that UText.
400 * @see ubrk_next
401 * @stable ICU 2.0
402 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000403U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000404ubrk_previous(UBreakIterator *bi) __INTRODUCED_IN(31);
405
406#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
407
408#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
409
410/**
411 * Set the iterator position to zero, the start of the text being scanned.
 *
412 * @param bi The break iterator to use.
413 * @return The new iterator position (zero).
414 * @see ubrk_last
415 * @stable ICU 2.0
416 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000417U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000418ubrk_first(UBreakIterator *bi) __INTRODUCED_IN(31);
419
420#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
421
422#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
423
424/**
425 * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
426 * This is not the same as the index of the last character.
 *
427 * @param bi The break iterator to use.
428 * @return The character offset immediately <EM>beyond</EM> the last character in the
429 * text being scanned.
430 * @see ubrk_first
431 * @stable ICU 2.0
432 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000433U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000434ubrk_last(UBreakIterator *bi) __INTRODUCED_IN(31);
435
436#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
437
438#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
439
440/**
441 * Set the iterator position to the first boundary preceding the specified offset.
442 * The new position is always smaller than offset, or UBRK_DONE.
 *
443 * @param bi The break iterator to use.
444 * @param offset The offset at which to begin scanning.
445 * @return The text boundary preceding offset, or UBRK_DONE.
446 * @see ubrk_following
447 * @stable ICU 2.0
448 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000449U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000450ubrk_preceding(UBreakIterator *bi,
451 int32_t offset) __INTRODUCED_IN(31);
452
453#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
454
455#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
456
457/**
458 * Advance the iterator to the first boundary following the specified offset.
459 * The value returned is always greater than offset, or UBRK_DONE.
 *
460 * @param bi The break iterator to use.
461 * @param offset The offset at which to begin scanning.
462 * @return The text boundary following offset, or UBRK_DONE.
463 * @see ubrk_preceding
464 * @stable ICU 2.0
465 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000466U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000467ubrk_following(UBreakIterator *bi,
468 int32_t offset) __INTRODUCED_IN(31);
469
470#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
471
472#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
473
474/**
475 * Get a locale for which text breaking information is available.
476 * A UBreakIterator in a locale returned by this function will perform the correct
477 * text breaking for the locale.
478 * @param index The index of the desired locale; ubrk_countAvailable() gives the number of available locales.
479 * @return A locale for which text breaking information is available, or 0 (a null pointer) if none.
480 * @see ubrk_countAvailable
481 * @stable ICU 2.0
482 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000483U_CAPI const char* U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000484ubrk_getAvailable(int32_t index) __INTRODUCED_IN(31);
485
486#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
487
488#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
489
490/**
491 * Determine how many locales have text breaking information available.
492 * This function is most useful for determining the loop ending condition for
493 * calls to \ref ubrk_getAvailable.
494 * @return The number of locales for which text breaking information is available.
495 * @see ubrk_getAvailable
496 * @stable ICU 2.0
497 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000498U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000499ubrk_countAvailable(void) __INTRODUCED_IN(31);
500
501#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
502
503
504#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
505
506/**
507 * Returns true if the specified position is a boundary position. As a side
508 * effect, leaves the iterator pointing to the first boundary position at
509 * or after "offset".
510 * @param bi The break iterator to use.
511 * @param offset The offset to check.
512 * @return True if "offset" is a boundary position.
513 * @stable ICU 2.0
514 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000515U_CAPI UBool U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000516ubrk_isBoundary(UBreakIterator *bi, int32_t offset) __INTRODUCED_IN(31);
517
518#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
519
520#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
521
522/**
523 * Return the status from the break rule that determined the most recently
524 * returned break position. The values appear in the rule source
525 * within brackets, {123}, for example. For rules that do not specify a
526 * status, a default value of 0 is returned.
527 * <p>
528 * For word break iterators, the possible values are defined in enum UWordBreak.
 * The enums ULineBreakTag and USentenceBreakTag define the tag ranges for
 * line and sentence break iterators.
 * @param bi The break iterator to use.
 * @return The rule status value for the most recently returned break position.
529 * @stable ICU 2.2
530 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000531U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000532ubrk_getRuleStatus(UBreakIterator *bi) __INTRODUCED_IN(31);
533
534#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
535
536#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
537
538/**
539 * Get the statuses from the break rules that determined the most recently
540 * returned break position. The values appear in the rule source
541 * within brackets, {123}, for example. The default status value for rules
542 * that do not explicitly provide one is zero.
543 * <p>
544 * For word break iterators, the possible values are defined in enum UWordBreak.
 * The enums ULineBreakTag and USentenceBreakTag define the tag ranges for
 * line and sentence break iterators.
545 * @param bi The break iterator to use.
546 * @param fillInVec An array to be filled in with the status values.
547 * @param capacity The length of the supplied vector. A length of zero causes
548 * the function to return the number of status values, in the
549 * normal way, without attempting to store any values.
550 * @param status Receives error codes.
551 * @return The number of rule status values from rules that determined
552 * the most recent boundary returned by the break iterator.
 * @see ubrk_getRuleStatus
553 * @stable ICU 3.0
554 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000555U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000556ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status) __INTRODUCED_IN(31);
557
558#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
559
560
561
562
563
564
565
566
567#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
568
569#endif