blob: 0b8bbf3f0817de2d3ec4c688122483fed09075b9 [file] [log] [blame]
Victor Changf1a8c982020-11-20 18:16:37 +00001// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (C) 1996-2015, International Business Machines Corporation and others.
6* All Rights Reserved.
7******************************************************************************
8*/
9
10#ifndef UBRK_H
11#define UBRK_H
12
13#include "unicode/utypes.h"
14#include "unicode/uloc.h"
15#include "unicode/utext.h"
16
17#if U_SHOW_CPLUSPLUS_API
18#include "unicode/localpointer.h"
19#endif // U_SHOW_CPLUSPLUS_API
20
21/**
22 * A text-break iterator.
23 * For usage in C programs.
 *
 * The typedef below is wrapped in the UBRK_TYPEDEF_UBREAK_ITERATOR guard so
 * that it is declared at most once, and it sits outside the
 * UCONFIG_NO_BREAK_ITERATION section of this header.
24 */
25#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
26# define UBRK_TYPEDEF_UBREAK_ITERATOR
27 /**
28 * Opaque type representing an ICU Break iterator object.
 * Only pointers to it are used; the struct is not defined in this header.
29 * @stable ICU 2.0
30 */
31 typedef struct UBreakIterator UBreakIterator;
32#endif
33
34#if !UCONFIG_NO_BREAK_ITERATION
35
36
37/**
38 * \file
39 * \brief C API: BreakIterator
40 *
41 * <h2> BreakIterator C API </h2>
42 *
43 * The BreakIterator C API defines methods for finding the location
44 * of boundaries in text. A UBreakIterator maintains a
45 * current position and scans over the text, returning the indices of the
46 * characters where boundaries occur.
47 * <p>
48 * Line boundary analysis determines where a text string can be broken
49 * when line-wrapping. The mechanism correctly handles punctuation and
50 * hyphenated words.
51 * <p>
52 * Note: The locale keyword "lb" can be used to modify line break
53 * behavior according to the CSS level 3 line-break options, see
54 * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
55 * "ja@lb=strict", "zh@lb=loose".
56 * <p>
57 * Sentence boundary analysis allows selection with correct
58 * interpretation of periods within numbers and abbreviations, and
59 * trailing punctuation marks such as quotation marks and parentheses.
60 * <p>
61 * Note: The locale keyword "ss" can be used to enable use of
62 * segmentation suppression data (preventing breaks in English after
63 * abbreviations such as "Mr." or "Est.", for example), as follows:
64 * "en@ss=standard".
65 * <p>
66 * Word boundary analysis is used by search and replace functions, as
67 * well as within text editing applications that allow the user to
68 * select words with a double click. Word selection provides correct
69 * interpretation of punctuation marks within and following
70 * words. Characters that are not part of a word, such as symbols or
71 * punctuation marks, have word-breaks on both sides.
72 * <p>
73 * Character boundary analysis identifies the boundaries of
74 * "Extended Grapheme Clusters", which are groupings of codepoints
75 * that should be treated as character-like units for many text operations.
76 * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
77 * http://www.unicode.org/reports/tr29/ for additional information
78 * on grapheme clusters and guidelines on their use.
79 * <p>
80 * Title boundary analysis locates all positions,
81 * typically starts of words, that should be set to Title Case
82 * when title casing the text.
83 * <p>
84 * The text boundary positions are found according to the rules
85 * described in Unicode Standard Annex #29, Unicode Text Segmentation
86 * (http://www.unicode.org/reports/tr29/), and Unicode Standard
87 * Annex #14, Line Breaking Properties
88 * (http://www.unicode.org/reports/tr14/).
89 * <p>
90 * In addition to the plain C API defined in this header file, an
91 * object oriented C++ API with equivalent functionality is defined in the
92 * file brkiter.h.
93 * <p>
94 * Code snippets illustrating the use of the Break Iterator APIs
95 * are available in the ICU User Guide,
96 * http://icu-project.org/userguide/boundaryAnalysis.html
97 * and in the sample program icu/source/samples/break/break.cpp
98 */
99
100/** The possible types of text boundaries, passed to ubrk_open() to select the kind of iterator. @stable ICU 2.0 */
101typedef enum UBreakIteratorType {
102 /** Character breaks, i.e. boundaries of extended grapheme clusters. @stable ICU 2.0 */
103 UBRK_CHARACTER = 0,
104 /** Word breaks @stable ICU 2.0 */
105 UBRK_WORD = 1,
106 /** Line breaks: positions where text can be broken when line-wrapping. @stable ICU 2.0 */
107 UBRK_LINE = 2,
108 /** Sentence breaks @stable ICU 2.0 */
109 UBRK_SENTENCE = 3,
110
111#ifndef U_HIDE_DEPRECATED_API
112 /**
113 * Title Case breaks.
114 * An iterator created using this type locates title boundaries as described for
115 * Unicode 3.2 only. For title boundary iteration under Unicode 4.0 and above,
116 * please use a word break iterator (UBRK_WORD).
117 *
118 * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
119 */
120 UBRK_TITLE = 4,
121 /**
122 * One more than the highest normal UBreakIteratorType value.
123 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
124 */
125 UBRK_COUNT = 5
126#endif // U_HIDE_DEPRECATED_API
127} UBreakIteratorType;
128
129/** Value returned by the iteration functions (for example ubrk_next(),
 * ubrk_previous(), ubrk_preceding() and ubrk_following()) to indicate that
 * all text boundaries have been returned.
130 * @stable ICU 2.0
131 */
132#define UBRK_DONE ((int32_t) -1)
133
134
135/**
136 * Enum constants for the word break tags returned by
137 * ubrk_getRuleStatus(). A range of values is defined for each category of
138 * word, to allow for further subdivisions of a category in future releases.
139 * Applications should check for tag values falling within the range, rather
140 * than for single individual values.
 *
 * Each *_LIMIT constant has the same value as the lower bound of the next
 * category, so it is an exclusive upper bound: a tag belongs to a category
 * when lower <= tag < limit.
141 *
142 * The numeric values of all of these constants are stable (will not change).
143 *
144 * @stable ICU 2.2
145*/
146typedef enum UWordBreak {
147 /** Tag value for "words" that do not fit into any of the other categories.
148 * Includes spaces and most punctuation. */
149 UBRK_WORD_NONE = 0,
150 /** Upper bound (exclusive) for tags for uncategorized words. */
151 UBRK_WORD_NONE_LIMIT = 100,
152 /** Tag value for words that appear to be numbers, lower limit. */
153 UBRK_WORD_NUMBER = 100,
154 /** Tag value for words that appear to be numbers, upper limit (exclusive). */
155 UBRK_WORD_NUMBER_LIMIT = 200,
156 /** Tag value for words that contain letters, excluding
157 * hiragana, katakana or ideographic characters, lower limit. */
158 UBRK_WORD_LETTER = 200,
159 /** Tag value for words containing letters, upper limit (exclusive). */
160 UBRK_WORD_LETTER_LIMIT = 300,
161 /** Tag value for words containing kana characters, lower limit. */
162 UBRK_WORD_KANA = 300,
163 /** Tag value for words containing kana characters, upper limit (exclusive). */
164 UBRK_WORD_KANA_LIMIT = 400,
165 /** Tag value for words containing ideographic characters, lower limit. */
166 UBRK_WORD_IDEO = 400,
167 /** Tag value for words containing ideographic characters, upper limit (exclusive). */
168 UBRK_WORD_IDEO_LIMIT = 500
169} UWordBreak;
170
171/**
172 * Enum constants for the line break tags returned by ubrk_getRuleStatus().
173 * A range of values is defined for each category of
174 * line break, to allow for further subdivisions of a category in future releases.
175 * Applications should check for tag values falling within the range, rather
176 * than for single individual values.
177 *
178 * The numeric values of all of these constants are stable (will not change).
179 *
180 * @stable ICU 2.8
181*/
182typedef enum ULineBreakTag {
183 /** Tag value for soft line breaks, positions at which a line break
184 * is acceptable but not required. */
185 UBRK_LINE_SOFT = 0,
186 /** Upper bound (exclusive) for soft line breaks. */
187 UBRK_LINE_SOFT_LIMIT = 100,
188 /** Tag value for a hard, or mandatory, line break. */
189 UBRK_LINE_HARD = 100,
190 /** Upper bound (exclusive) for hard line breaks. */
191 UBRK_LINE_HARD_LIMIT = 200
192} ULineBreakTag;
193
194
195
196/**
197 * Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
198 * A range of values is defined for each category of
199 * sentence, to allow for further subdivisions of a category in future releases.
200 * Applications should check for tag values falling within the range, rather
201 * than for single individual values.
202 *
203 * The numeric values of all of these constants are stable (will not change).
204 *
205 * @stable ICU 2.8
206*/
207typedef enum USentenceBreakTag {
208 /** Tag value for sentences ending with a sentence terminator
209 * ('.', '?', '!', etc.) character, possibly followed by a
210 * hard separator (CR, LF, PS, etc.)
211 */
212 UBRK_SENTENCE_TERM = 0,
213 /** Upper bound (exclusive) for tags for sentences ended by sentence terminators. */
214 UBRK_SENTENCE_TERM_LIMIT = 100,
215 /** Tag value for sentences that do not contain an ending
216 * sentence terminator ('.', '?', '!', etc.) character, but
217 * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
218 */
219 UBRK_SENTENCE_SEP = 100,
220 /** Upper bound (exclusive) for tags for sentences ended by a separator. */
221 UBRK_SENTENCE_SEP_LIMIT = 200
222 /* No further sentence break tags are defined. */
223} USentenceBreakTag;
224
225
226#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
227
228/**
229 * Open a new UBreakIterator for locating text boundaries for a specified locale.
230 * A UBreakIterator may be used for detecting character, line, word,
231 * and sentence breaks in text.
232 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
233 * UBRK_LINE, UBRK_SENTENCE
234 * @param locale The locale specifying the text-breaking conventions. Note that
235 * locale keys such as "lb" and "ss" may be used to modify text break behavior,
236 * see the general discussion of the BreakIterator C API at the top of this file.
237 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is
238 * used to specify the text to be iterated.
239 * @param textLength The number of UChars in text, or -1 if null-terminated.
240 * @param status A UErrorCode to receive any errors.
241 * @return A UBreakIterator for the specified locale. It is closed with ubrk_close().
242 * @see ubrk_openRules
 * @see ubrk_close
243 * @stable ICU 2.0
244 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000245U_CAPI UBreakIterator* U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000246ubrk_open(UBreakIteratorType type,
247 const char *locale,
248 const UChar *text,
249 int32_t textLength,
250 UErrorCode *status) __INTRODUCED_IN(31);
251
252#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
253
254
255
256
257
Victor Chang16f37a62021-02-01 22:20:48 +0000258#ifndef U_HIDE_DEPRECATED_API
Victor Changf1a8c982020-11-20 18:16:37 +0000259
260
Victor Chang16f37a62021-02-01 22:20:48 +0000261
262#endif /* U_HIDE_DEPRECATED_API */
263
264#ifndef U_HIDE_DRAFT_API
265
266
267
268#endif // U_HIDE_DRAFT_API
269
Victor Changf1a8c982020-11-20 18:16:37 +0000270#ifndef U_HIDE_DEPRECATED_API
271
272/**
273 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
274 * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
275 */
276#define U_BRK_SAFECLONE_BUFFERSIZE 1
277
278#endif /* U_HIDE_DEPRECATED_API */
279
280#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
281
282/**
283 * Close a UBreakIterator.
284 * Once closed, a UBreakIterator may no longer be used.
285 * @param bi The break iterator to close.
286 * @stable ICU 2.0
287 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000288U_CAPI void U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000289ubrk_close(UBreakIterator *bi) __INTRODUCED_IN(31);
290
291#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
292
293#if U_SHOW_CPLUSPLUS_API
294
295U_NAMESPACE_BEGIN
296
297/**
298 * \class LocalUBreakIteratorPointer
299 * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
300 * For most methods see the LocalPointerBase base class.
 * Declared only when U_SHOW_CPLUSPLUS_API is enabled.
301 *
302 * @see LocalPointerBase
303 * @see LocalPointer
304 * @stable ICU 4.4
305 */
306U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
307
308U_NAMESPACE_END
309
310#endif
311
312#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
313
314/**
315 * Sets an existing iterator to point to a new piece of text.
316 * The break iterator retains a pointer to the supplied text.
317 * The caller must not modify or delete the text while the break iterator
318 * retains the reference.
319 *
320 * @param bi The iterator to use
321 * @param text The text to be set
322 * @param textLength The length of the text, in UChars.
 * NOTE(review): ubrk_open() documents -1 for null-terminated text;
 * confirm whether that also applies here.
323 * @param status The error code
 * @see ubrk_setUText
324 * @stable ICU 2.0
325 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000326U_CAPI void U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000327ubrk_setText(UBreakIterator* bi,
328 const UChar* text,
329 int32_t textLength,
330 UErrorCode* status) __INTRODUCED_IN(31);
331
332#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
333
334
335#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
336
337/**
338 * Sets an existing iterator to point to a new piece of text, supplied as a UText.
339 *
340 * All index positions returned by break iterator functions are
341 * native indices from the UText. For example, when breaking UTF-8
342 * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
343 * will be UTF-8 string indices, not UTF-16 positions.
344 *
345 * @param bi The iterator to use
346 * @param text The text to be set.
347 * This function makes a shallow clone of the supplied UText. This means
348 * that the caller is free to immediately close or otherwise reuse the
349 * UText that was passed as a parameter, but that the underlying text itself
350 * must not be altered while being referenced by the break iterator.
351 * @param status The error code
 * @see ubrk_setText
352 * @stable ICU 3.4
353 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000354U_CAPI void U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000355ubrk_setUText(UBreakIterator* bi,
356 UText* text,
357 UErrorCode* status) __INTRODUCED_IN(31);
358
359#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
360
361
362
363#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
364
365/**
366 * Determine the most recently-returned text boundary.
 * The iterator is taken as const.
367 *
368 * @param bi The break iterator to use.
369 * @return The index most recently returned by \ref ubrk_next, \ref ubrk_previous,
370 * \ref ubrk_first, or \ref ubrk_last. When the text was set with
 * \ref ubrk_setUText this is a native index of that UText.
371 * @stable ICU 2.0
372 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000373U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000374ubrk_current(const UBreakIterator *bi) __INTRODUCED_IN(31);
375
376#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
377
378#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
379
380/**
381 * Advance the iterator to the boundary following the current boundary.
382 *
383 * @param bi The break iterator to use.
384 * @return The index of the next text boundary, or UBRK_DONE
385 * if all text boundaries have been returned.
386 * @see ubrk_previous
 * @see ubrk_following
387 * @stable ICU 2.0
388 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000389U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000390ubrk_next(UBreakIterator *bi) __INTRODUCED_IN(31);
391
392#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
393
394#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
395
396/**
397 * Set the iterator position to the boundary preceding the current boundary.
398 *
399 * @param bi The break iterator to use.
400 * @return The index of the preceding text boundary, or UBRK_DONE
401 * if all text boundaries have been returned.
402 * @see ubrk_next
 * @see ubrk_preceding
403 * @stable ICU 2.0
404 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000405U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000406ubrk_previous(UBreakIterator *bi) __INTRODUCED_IN(31);
407
408#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
409
410#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
411
412/**
413 * Set the iterator position to zero, the start of the text being scanned.
 *
414 * @param bi The break iterator to use.
415 * @return The new iterator position (zero).
416 * @see ubrk_last
417 * @stable ICU 2.0
418 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000419U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000420ubrk_first(UBreakIterator *bi) __INTRODUCED_IN(31);
421
422#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
423
424#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
425
426/**
427 * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
428 * This is not the same as the index of the last character.
 *
429 * @param bi The break iterator to use.
430 * @return The offset immediately <EM>beyond</EM> the last character in the
431 * text being scanned.
432 * @see ubrk_first
433 * @stable ICU 2.0
434 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000435U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000436ubrk_last(UBreakIterator *bi) __INTRODUCED_IN(31);
437
438#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
439
440#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
441
442/**
443 * Set the iterator position to the first boundary preceding the specified offset.
444 * The new position is always smaller than offset, or UBRK_DONE.
 *
445 * @param bi The break iterator to use.
446 * @param offset The offset to begin scanning.
447 * @return The text boundary preceding offset, or UBRK_DONE.
448 * @see ubrk_following
 * @see ubrk_previous
449 * @stable ICU 2.0
450 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000451U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000452ubrk_preceding(UBreakIterator *bi,
453 int32_t offset) __INTRODUCED_IN(31);
454
455#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
456
457#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
458
459/**
460 * Advance the iterator to the first boundary following the specified offset.
461 * The value returned is always greater than offset, or UBRK_DONE.
 *
462 * @param bi The break iterator to use.
463 * @param offset The offset to begin scanning.
464 * @return The text boundary following offset, or UBRK_DONE.
465 * @see ubrk_preceding
 * @see ubrk_next
466 * @stable ICU 2.0
467 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000468U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000469ubrk_following(UBreakIterator *bi,
470 int32_t offset) __INTRODUCED_IN(31);
471
472#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
473
474#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
475
476/**
477 * Get a locale for which text breaking information is available.
478 * A UBreakIterator in a locale returned by this function will perform the correct
479 * text breaking for the locale.
480 * @param index The index of the desired locale; ubrk_countAvailable() gives the
 * number of available locales.
481 * @return A locale for which text breaking information is available, or NULL if none.
482 * @see ubrk_countAvailable
483 * @stable ICU 2.0
484 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000485U_CAPI const char* U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000486ubrk_getAvailable(int32_t index) __INTRODUCED_IN(31);
487
488#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
489
490#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
491
492/**
493 * Determine how many locales have text breaking information available.
494 * This function is most useful for determining the loop ending condition for
495 * calls to \ref ubrk_getAvailable.
496 * @return The number of locales for which text breaking information is available.
497 * @see ubrk_getAvailable
498 * @stable ICU 2.0
499 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000500U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000501ubrk_countAvailable(void) __INTRODUCED_IN(31);
502
503#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
504
505
506#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
507
508/**
509 * Returns true if the specified position is a boundary position. As a side
510 * effect, leaves the iterator pointing to the first boundary position at
511 * or after "offset", so the iterator's position may change whatever the
 * return value is.
512 * @param bi The break iterator to use.
513 * @param offset the offset to check.
514 * @return True if "offset" is a boundary position.
515 * @stable ICU 2.0
516 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000517U_CAPI UBool U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000518ubrk_isBoundary(UBreakIterator *bi, int32_t offset) __INTRODUCED_IN(31);
519
520#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
521
522#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
523
524/**
525 * Return the status from the break rule that determined the most recently
526 * returned break position. The values appear in the rule source
527 * within brackets, {123}, for example. For rules that do not specify a
528 * status, a default value of 0 is returned.
529 * <p>
530 * For word break iterators, the possible values are defined in enum UWordBreak;
 * for line and sentence break iterators see enum ULineBreakTag and
 * enum USentenceBreakTag.
 * @param bi The break iterator to use.
 * @return The status value of the rule that determined the most recent
 * boundary, or 0 if that rule does not specify one.
 * @see ubrk_getRuleStatusVec
531 * @stable ICU 2.2
532 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000533U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000534ubrk_getRuleStatus(UBreakIterator *bi) __INTRODUCED_IN(31);
535
536#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
537
538#if !defined(__ANDROID__) || __ANDROID_API__ >= 31
539
540/**
541 * Get the statuses from the break rules that determined the most recently
542 * returned break position. The values appear in the rule source
543 * within brackets, {123}, for example. The default status value for rules
544 * that do not explicitly provide one is zero.
545 * <p>
546 * For word break iterators, the possible values are defined in enum UWordBreak;
 * for line and sentence break iterators see enum ULineBreakTag and
 * enum USentenceBreakTag.
547 * @param bi The break iterator to use
548 * @param fillInVec an array to be filled in with the status values.
549 * @param capacity the length of the supplied vector. A length of zero causes
550 * the function to return the number of status values, in the
551 * normal way, without attempting to store any values.
552 * @param status receives error codes.
553 * @return The number of rule status values from rules that determined
554 * the most recent boundary returned by the break iterator.
 * @see ubrk_getRuleStatus
555 * @stable ICU 3.0
556 */
Victor Changce4bf3c2021-01-19 16:34:24 +0000557U_CAPI int32_t U_EXPORT2
Victor Changf1a8c982020-11-20 18:16:37 +0000558ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status) __INTRODUCED_IN(31);
559
560#endif // !defined(__ANDROID__) || __ANDROID_API__ >= 31
561
562
563
564
565
566
567
568
569#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
570
571#endif