// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation *
 * and others. All Rights Reserved. *
 *******************************************************************************
 */
9
10#ifndef DICTBE_H
11#define DICTBE_H
12
13#include "unicode/utypes.h"
14#include "unicode/uniset.h"
15#include "unicode/utext.h"
16
17#include "brkeng.h"
18#include "uvectr32.h"
19
20U_NAMESPACE_BEGIN
21
// Forward declarations: these types are only used through pointers in this header.
class DictionaryMatcher;
class Normalizer2;
24
25/*******************************************************************
26 * DictionaryBreakEngine
27 */
28
29/**
30 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
31 * dictionary to determine language-specific breaks.</p>
32 *
33 * <p>After it is constructed a DictionaryBreakEngine may be shared between
34 * threads without synchronization.</p>
35 */
36class DictionaryBreakEngine : public LanguageBreakEngine {
37 private:
38 /**
39 * The set of characters handled by this engine
40 * @internal
41 */
42
43 UnicodeSet fSet;
44
45 public:
46
47 /**
48 * <p>Constructor </p>
49 */
50 DictionaryBreakEngine();
51
52 /**
53 * <p>Virtual destructor.</p>
54 */
55 virtual ~DictionaryBreakEngine();
56
57 /**
58 * <p>Indicate whether this engine handles a particular character for
59 * a particular kind of break.</p>
60 *
61 * @param c A character which begins a run that the engine might handle
Victor Changce4bf3c2021-01-19 16:34:24 +000062 * @return true if this engine handles the particular character and break
Victor Chang73229502020-09-17 13:39:19 +010063 * type.
64 */
65 virtual UBool handles(UChar32 c) const;
66
67 /**
68 * <p>Find any breaks within a run in the supplied text.</p>
69 *
70 * @param text A UText representing the text. The iterator is left at
71 * the end of the run of characters which the engine is capable of handling
72 * that starts from the first character in the range.
73 * @param startPos The start of the run within the supplied text.
74 * @param endPos The end of the run within the supplied text.
75 * @param foundBreaks vector of int32_t to receive the break positions
76 * @return The number of breaks found.
77 */
78 virtual int32_t findBreaks( UText *text,
79 int32_t startPos,
80 int32_t endPos,
81 UVector32 &foundBreaks ) const;
82
83 protected:
84
85 /**
86 * <p>Set the character set handled by this engine.</p>
87 *
88 * @param set A UnicodeSet of the set of characters handled by the engine
89 */
90 virtual void setCharacters( const UnicodeSet &set );
91
92 /**
93 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
94 *
95 * @param text A UText representing the text
96 * @param rangeStart The start of the range of dictionary characters
97 * @param rangeEnd The end of the range of dictionary characters
98 * @param foundBreaks Output of C array of int32_t break positions, or 0
99 * @return The number of breaks found
100 */
101 virtual int32_t divideUpDictionaryRange( UText *text,
102 int32_t rangeStart,
103 int32_t rangeEnd,
104 UVector32 &foundBreaks ) const = 0;
105
106};
107
108/*******************************************************************
109 * ThaiBreakEngine
110 */
111
112/**
113 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
114 * dictionary and heuristics to determine Thai-specific breaks.</p>
115 *
116 * <p>After it is constructed a ThaiBreakEngine may be shared between
117 * threads without synchronization.</p>
118 */
119class ThaiBreakEngine : public DictionaryBreakEngine {
120 private:
121 /**
122 * The set of characters handled by this engine
123 * @internal
124 */
125
126 UnicodeSet fThaiWordSet;
127 UnicodeSet fEndWordSet;
128 UnicodeSet fBeginWordSet;
129 UnicodeSet fSuffixSet;
130 UnicodeSet fMarkSet;
131 DictionaryMatcher *fDictionary;
132
133 public:
134
135 /**
136 * <p>Default constructor.</p>
137 *
138 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
139 * engine is deleted.
140 */
141 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
142
143 /**
144 * <p>Virtual destructor.</p>
145 */
146 virtual ~ThaiBreakEngine();
147
148 protected:
149 /**
150 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
151 *
152 * @param text A UText representing the text
153 * @param rangeStart The start of the range of dictionary characters
154 * @param rangeEnd The end of the range of dictionary characters
155 * @param foundBreaks Output of C array of int32_t break positions, or 0
156 * @return The number of breaks found
157 */
158 virtual int32_t divideUpDictionaryRange( UText *text,
159 int32_t rangeStart,
160 int32_t rangeEnd,
161 UVector32 &foundBreaks ) const;
162
163};
164
165/*******************************************************************
166 * LaoBreakEngine
167 */
168
169/**
170 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
171 * dictionary and heuristics to determine Lao-specific breaks.</p>
172 *
173 * <p>After it is constructed a LaoBreakEngine may be shared between
174 * threads without synchronization.</p>
175 */
176class LaoBreakEngine : public DictionaryBreakEngine {
177 private:
178 /**
179 * The set of characters handled by this engine
180 * @internal
181 */
182
183 UnicodeSet fLaoWordSet;
184 UnicodeSet fEndWordSet;
185 UnicodeSet fBeginWordSet;
186 UnicodeSet fMarkSet;
187 DictionaryMatcher *fDictionary;
188
189 public:
190
191 /**
192 * <p>Default constructor.</p>
193 *
194 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
195 * engine is deleted.
196 */
197 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
198
199 /**
200 * <p>Virtual destructor.</p>
201 */
202 virtual ~LaoBreakEngine();
203
204 protected:
205 /**
206 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
207 *
208 * @param text A UText representing the text
209 * @param rangeStart The start of the range of dictionary characters
210 * @param rangeEnd The end of the range of dictionary characters
211 * @param foundBreaks Output of C array of int32_t break positions, or 0
212 * @return The number of breaks found
213 */
214 virtual int32_t divideUpDictionaryRange( UText *text,
215 int32_t rangeStart,
216 int32_t rangeEnd,
217 UVector32 &foundBreaks ) const;
218
219};
220
221/*******************************************************************
222 * BurmeseBreakEngine
223 */
224
225/**
226 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
227 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
228 *
229 * <p>After it is constructed a BurmeseBreakEngine may be shared between
230 * threads without synchronization.</p>
231 */
232class BurmeseBreakEngine : public DictionaryBreakEngine {
233 private:
234 /**
235 * The set of characters handled by this engine
236 * @internal
237 */
238
239 UnicodeSet fBurmeseWordSet;
240 UnicodeSet fEndWordSet;
241 UnicodeSet fBeginWordSet;
242 UnicodeSet fMarkSet;
243 DictionaryMatcher *fDictionary;
244
245 public:
246
247 /**
248 * <p>Default constructor.</p>
249 *
250 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
251 * engine is deleted.
252 */
253 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
254
255 /**
256 * <p>Virtual destructor.</p>
257 */
258 virtual ~BurmeseBreakEngine();
259
260 protected:
261 /**
262 * <p>Divide up a range of known dictionary characters.</p>
263 *
264 * @param text A UText representing the text
265 * @param rangeStart The start of the range of dictionary characters
266 * @param rangeEnd The end of the range of dictionary characters
267 * @param foundBreaks Output of C array of int32_t break positions, or 0
268 * @return The number of breaks found
269 */
270 virtual int32_t divideUpDictionaryRange( UText *text,
271 int32_t rangeStart,
272 int32_t rangeEnd,
273 UVector32 &foundBreaks ) const;
274
275};
276
277/*******************************************************************
278 * KhmerBreakEngine
279 */
280
281/**
282 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
283 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
284 *
285 * <p>After it is constructed a KhmerBreakEngine may be shared between
286 * threads without synchronization.</p>
287 */
288class KhmerBreakEngine : public DictionaryBreakEngine {
289 private:
290 /**
291 * The set of characters handled by this engine
292 * @internal
293 */
294
295 UnicodeSet fKhmerWordSet;
296 UnicodeSet fEndWordSet;
297 UnicodeSet fBeginWordSet;
298 UnicodeSet fMarkSet;
299 DictionaryMatcher *fDictionary;
300
301 public:
302
303 /**
304 * <p>Default constructor.</p>
305 *
306 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
307 * engine is deleted.
308 */
309 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
310
311 /**
312 * <p>Virtual destructor.</p>
313 */
314 virtual ~KhmerBreakEngine();
315
316 protected:
317 /**
318 * <p>Divide up a range of known dictionary characters.</p>
319 *
320 * @param text A UText representing the text
321 * @param rangeStart The start of the range of dictionary characters
322 * @param rangeEnd The end of the range of dictionary characters
323 * @param foundBreaks Output of C array of int32_t break positions, or 0
324 * @return The number of breaks found
325 */
326 virtual int32_t divideUpDictionaryRange( UText *text,
327 int32_t rangeStart,
328 int32_t rangeEnd,
329 UVector32 &foundBreaks ) const;
330
331};
332
333#if !UCONFIG_NO_NORMALIZATION
334
/*******************************************************************
 * CjkBreakEngine
 */

/**
 * Indicates the language/script that the CjkBreakEngine will handle.
 */
enum LanguageType {
    kKorean,
    kChineseJapanese
};
344
345/**
346 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
347 * dictionary with costs associated with each word and
348 * Viterbi decoding to determine CJK-specific breaks.</p>
349 */
350class CjkBreakEngine : public DictionaryBreakEngine {
351 protected:
352 /**
353 * The set of characters handled by this engine
354 * @internal
355 */
356 UnicodeSet fHangulWordSet;
357 UnicodeSet fHanWordSet;
358 UnicodeSet fKatakanaWordSet;
359 UnicodeSet fHiraganaWordSet;
360
361 DictionaryMatcher *fDictionary;
362 const Normalizer2 *nfkcNorm2;
363
364 public:
365
366 /**
367 * <p>Default constructor.</p>
368 *
369 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
370 * engine is deleted. The DictionaryMatcher must contain costs for each word
371 * in order for the dictionary to work properly.
372 */
373 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
374
375 /**
376 * <p>Virtual destructor.</p>
377 */
378 virtual ~CjkBreakEngine();
379
380 protected:
381 /**
382 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
383 *
384 * @param text A UText representing the text
385 * @param rangeStart The start of the range of dictionary characters
386 * @param rangeEnd The end of the range of dictionary characters
387 * @param foundBreaks Output of C array of int32_t break positions, or 0
388 * @return The number of breaks found
389 */
390 virtual int32_t divideUpDictionaryRange( UText *text,
391 int32_t rangeStart,
392 int32_t rangeEnd,
393 UVector32 &foundBreaks ) const;
394
395};
396
397#endif
398
399U_NAMESPACE_END
400
401 /* DICTBE_H */
402#endif