// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 ************************************************************************************
 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                             *
 ************************************************************************************
 */

#ifndef BRKENG_H
#define BRKENG_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
#include "unicode/uscript.h"

U_NAMESPACE_BEGIN

class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;

25/*******************************************************************
26 * LanguageBreakEngine
27 */
28
29/**
30 * <p>LanguageBreakEngines implement language-specific knowledge for
31 * finding text boundaries within a run of characters belonging to a
32 * specific set. The boundaries will be of a specific kind, e.g. word,
33 * line, etc.</p>
34 *
35 * <p>LanguageBreakEngines should normally be implemented so as to
36 * be shared between threads without locking.</p>
37 */
38class LanguageBreakEngine : public UMemory {
39 public:
40
41 /**
42 * <p>Default constructor.</p>
43 *
44 */
45 LanguageBreakEngine();
46
47 /**
48 * <p>Virtual destructor.</p>
49 */
50 virtual ~LanguageBreakEngine();
51
52 /**
53 * <p>Indicate whether this engine handles a particular character for
54 * a particular kind of break.</p>
55 *
56 * @param c A character which begins a run that the engine might handle
Victor Changce4bf3c2021-01-19 16:34:24 +000057 * @return true if this engine handles the particular character and break
Victor Chang73229502020-09-17 13:39:19 +010058 * type.
59 */
60 virtual UBool handles(UChar32 c) const = 0;
61
62 /**
63 * <p>Find any breaks within a run in the supplied text.</p>
64 *
65 * @param text A UText representing the text. The
66 * iterator is left at the end of the run of characters which the engine
67 * is capable of handling.
68 * @param startPos The start of the run within the supplied text.
69 * @param endPos The end of the run within the supplied text.
70 * @param foundBreaks A Vector of int32_t to receive the breaks.
71 * @return The number of breaks found.
72 */
73 virtual int32_t findBreaks( UText *text,
74 int32_t startPos,
75 int32_t endPos,
76 UVector32 &foundBreaks ) const = 0;
77
78};
79
80/*******************************************************************
81 * LanguageBreakFactory
82 */
83
84/**
85 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
86 * that can determine breaks for characters in a specific set, if
87 * such an object can be found.</p>
88 *
89 * <p>If a LanguageBreakFactory is to be shared between threads,
90 * appropriate synchronization must be used; there is none internal
91 * to the factory.</p>
92 *
93 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
94 * normally be shared between threads without synchronization, unless
95 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
96 *
97 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
98 * it returns when it itself is deleted, unless the specific subclass of
99 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
100 * not be deleted until the LanguageBreakEngines it has returned are no
101 * longer needed.</p>
102 */
103class LanguageBreakFactory : public UMemory {
104 public:
105
106 /**
107 * <p>Default constructor.</p>
108 *
109 */
110 LanguageBreakFactory();
111
112 /**
113 * <p>Virtual destructor.</p>
114 */
115 virtual ~LanguageBreakFactory();
116
117 /**
118 * <p>Find and return a LanguageBreakEngine that can find the desired
119 * kind of break for the set of characters to which the supplied
120 * character belongs. It is up to the set of available engines to
121 * determine what the sets of characters are.</p>
122 *
123 * @param c A character that begins a run for which a LanguageBreakEngine is
124 * sought.
125 * @return A LanguageBreakEngine with the desired characteristics, or 0.
126 */
127 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
128
129};
130
131/*******************************************************************
132 * UnhandledEngine
133 */
134
135/**
136 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
137 * handles characters that no other LanguageBreakEngine is available to
138 * handle. It is told the character and the type of break; at its
139 * discretion it may handle more than the specified character (e.g.,
140 * the entire script to which that character belongs.</p>
141 *
142 * <p>UnhandledEngines may not be shared between threads without
143 * external synchronization.</p>
144 */
145
146class UnhandledEngine : public LanguageBreakEngine {
147 private:
148
149 /**
150 * The sets of characters handled.
151 * @internal
152 */
153
154 UnicodeSet *fHandled;
155
156 public:
157
158 /**
159 * <p>Default constructor.</p>
160 *
161 */
162 UnhandledEngine(UErrorCode &status);
163
164 /**
165 * <p>Virtual destructor.</p>
166 */
167 virtual ~UnhandledEngine();
168
169 /**
170 * <p>Indicate whether this engine handles a particular character for
171 * a particular kind of break.</p>
172 *
173 * @param c A character which begins a run that the engine might handle
Victor Changce4bf3c2021-01-19 16:34:24 +0000174 * @return true if this engine handles the particular character and break
Victor Chang73229502020-09-17 13:39:19 +0100175 * type.
176 */
177 virtual UBool handles(UChar32 c) const;
178
179 /**
180 * <p>Find any breaks within a run in the supplied text.</p>
181 *
182 * @param text A UText representing the text (TODO: UText). The
183 * iterator is left at the end of the run of characters which the engine
184 * is capable of handling.
185 * @param startPos The start of the run within the supplied text.
186 * @param endPos The end of the run within the supplied text.
187 * @param foundBreaks An allocated C array of the breaks found, if any
188 * @return The number of breaks found.
189 */
190 virtual int32_t findBreaks( UText *text,
191 int32_t startPos,
192 int32_t endPos,
193 UVector32 &foundBreaks ) const;
194
195 /**
196 * <p>Tell the engine to handle a particular character and break type.</p>
197 *
198 * @param c A character which the engine should handle
199 */
200 virtual void handleCharacter(UChar32 c);
201
202};
203
204/*******************************************************************
205 * ICULanguageBreakFactory
206 */
207
208/**
209 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
210 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
211 * data in the ICU data file.</p>
212 */
213class ICULanguageBreakFactory : public LanguageBreakFactory {
214 private:
215
216 /**
217 * The stack of break engines created by this factory
218 * @internal
219 */
220
221 UStack *fEngines;
222
223 public:
224
225 /**
226 * <p>Standard constructor.</p>
227 *
228 */
229 ICULanguageBreakFactory(UErrorCode &status);
230
231 /**
232 * <p>Virtual destructor.</p>
233 */
234 virtual ~ICULanguageBreakFactory();
235
236 /**
237 * <p>Find and return a LanguageBreakEngine that can find the desired
238 * kind of break for the set of characters to which the supplied
239 * character belongs. It is up to the set of available engines to
240 * determine what the sets of characters are.</p>
241 *
242 * @param c A character that begins a run for which a LanguageBreakEngine is
243 * sought.
244 * @return A LanguageBreakEngine with the desired characteristics, or 0.
245 */
246 virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
247
248protected:
249 /**
250 * <p>Create a LanguageBreakEngine for the set of characters to which
251 * the supplied character belongs, for the specified break type.</p>
252 *
253 * @param c A character that begins a run for which a LanguageBreakEngine is
254 * sought.
255 * @return A LanguageBreakEngine with the desired characteristics, or 0.
256 */
257 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
258
259 /**
260 * <p>Create a DictionaryMatcher for the specified script and break type.</p>
261 * @param script An ISO 15924 script code that identifies the dictionary to be
262 * created.
263 * @return A DictionaryMatcher with the desired characteristics, or NULL.
264 */
265 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
266};
267
U_NAMESPACE_END

    /* BRKENG_H */
#endif