| // Copyright (C) 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /** |
| ******************************************************************************* |
| * Copyright (C) 2006-2014, International Business Machines Corporation * |
| * and others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| |
| #ifndef DICTBE_H |
| #define DICTBE_H |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uniset.h" |
| #include "unicode/utext.h" |
| |
| #include "brkeng.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| class DictionaryMatcher; |
| class Normalizer2; |
| |
| /******************************************************************* |
| * DictionaryBreakEngine |
| */ |
| |
| /** |
| * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a |
| * dictionary to determine language-specific breaks.</p> |
| * |
| * <p>After it is constructed a DictionaryBreakEngine may be shared between |
| * threads without synchronization.</p> |
| */ |
| class DictionaryBreakEngine : public LanguageBreakEngine { |
| private: |
| /** |
| * The set of characters handled by this engine |
| * @internal |
| */ |
| |
| UnicodeSet fSet; |
| |
| /** |
| * The set of break types handled by this engine |
| * @internal |
| */ |
| |
| uint32_t fTypes; |
| |
| /** |
| * <p>Default constructor.</p> |
| * |
| */ |
| DictionaryBreakEngine(); |
| |
| public: |
| |
| /** |
| * <p>Constructor setting the break types handled.</p> |
| * |
| * @param breakTypes A bitmap of types handled by the engine. |
| */ |
| DictionaryBreakEngine( uint32_t breakTypes ); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~DictionaryBreakEngine(); |
| |
| /** |
| * <p>Indicate whether this engine handles a particular character for |
| * a particular kind of break.</p> |
| * |
| * @param c A character which begins a run that the engine might handle |
| * @param breakType The type of text break which the caller wants to determine |
| * @return TRUE if this engine handles the particular character and break |
| * type. |
| */ |
| virtual UBool handles( UChar32 c, int32_t breakType ) const; |
| |
| /** |
| * <p>Find any breaks within a run in the supplied text.</p> |
| * |
| * @param text A UText representing the text. The iterator is left at |
| * the end of the run of characters which the engine is capable of handling |
| * that starts from the first (or last) character in the range. |
| * @param startPos The start of the run within the supplied text. |
| * @param endPos The end of the run within the supplied text. |
| * @param reverse Whether the caller is looking for breaks in a reverse |
| * direction. |
| * @param breakType The type of break desired, or -1. |
| * @param foundBreaks An allocated C array of the breaks found, if any |
| * @return The number of breaks found. |
| */ |
| virtual int32_t findBreaks( UText *text, |
| int32_t startPos, |
| int32_t endPos, |
| UBool reverse, |
| int32_t breakType, |
| UStack &foundBreaks ) const; |
| |
| protected: |
| |
| /** |
| * <p>Set the character set handled by this engine.</p> |
| * |
| * @param set A UnicodeSet of the set of characters handled by the engine |
| */ |
| virtual void setCharacters( const UnicodeSet &set ); |
| |
| /** |
| * <p>Set the break types handled by this engine.</p> |
| * |
| * @param breakTypes A bitmap of types handled by the engine. |
| */ |
| // virtual void setBreakTypes( uint32_t breakTypes ); |
| |
| /** |
| * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| * @param rangeEnd The end of the range of dictionary characters |
| * @param foundBreaks Output of C array of int32_t break positions, or 0 |
| * @return The number of breaks found |
| */ |
| virtual int32_t divideUpDictionaryRange( UText *text, |
| int32_t rangeStart, |
| int32_t rangeEnd, |
| UStack &foundBreaks ) const = 0; |
| |
| }; |
| |
| /******************************************************************* |
| * ThaiBreakEngine |
| */ |
| |
| /** |
| * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a |
| * dictionary and heuristics to determine Thai-specific breaks.</p> |
| * |
| * <p>After it is constructed a ThaiBreakEngine may be shared between |
| * threads without synchronization.</p> |
| */ |
| class ThaiBreakEngine : public DictionaryBreakEngine { |
| private: |
| /** |
| * The set of characters handled by this engine |
| * @internal |
| */ |
| |
| UnicodeSet fThaiWordSet; |
| UnicodeSet fEndWordSet; |
| UnicodeSet fBeginWordSet; |
| UnicodeSet fSuffixSet; |
| UnicodeSet fMarkSet; |
| DictionaryMatcher *fDictionary; |
| |
| public: |
| |
| /** |
| * <p>Default constructor.</p> |
| * |
| * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
| * engine is deleted. |
| */ |
| ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~ThaiBreakEngine(); |
| |
| protected: |
| /** |
| * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| * @param rangeEnd The end of the range of dictionary characters |
| * @param foundBreaks Output of C array of int32_t break positions, or 0 |
| * @return The number of breaks found |
| */ |
| virtual int32_t divideUpDictionaryRange( UText *text, |
| int32_t rangeStart, |
| int32_t rangeEnd, |
| UStack &foundBreaks ) const; |
| |
| }; |
| |
| /******************************************************************* |
| * LaoBreakEngine |
| */ |
| |
| /** |
| * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a |
| * dictionary and heuristics to determine Lao-specific breaks.</p> |
| * |
| * <p>After it is constructed a LaoBreakEngine may be shared between |
| * threads without synchronization.</p> |
| */ |
| class LaoBreakEngine : public DictionaryBreakEngine { |
| private: |
| /** |
| * The set of characters handled by this engine |
| * @internal |
| */ |
| |
| UnicodeSet fLaoWordSet; |
| UnicodeSet fEndWordSet; |
| UnicodeSet fBeginWordSet; |
| UnicodeSet fMarkSet; |
| DictionaryMatcher *fDictionary; |
| |
| public: |
| |
| /** |
| * <p>Default constructor.</p> |
| * |
| * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
| * engine is deleted. |
| */ |
| LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~LaoBreakEngine(); |
| |
| protected: |
| /** |
| * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| * @param rangeEnd The end of the range of dictionary characters |
| * @param foundBreaks Output of C array of int32_t break positions, or 0 |
| * @return The number of breaks found |
| */ |
| virtual int32_t divideUpDictionaryRange( UText *text, |
| int32_t rangeStart, |
| int32_t rangeEnd, |
| UStack &foundBreaks ) const; |
| |
| }; |
| |
| /******************************************************************* |
| * BurmeseBreakEngine |
| */ |
| |
| /** |
| * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a |
| * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> |
| * |
| * <p>After it is constructed a BurmeseBreakEngine may be shared between |
| * threads without synchronization.</p> |
| */ |
| class BurmeseBreakEngine : public DictionaryBreakEngine { |
| private: |
| /** |
| * The set of characters handled by this engine |
| * @internal |
| */ |
| |
| UnicodeSet fBurmeseWordSet; |
| UnicodeSet fEndWordSet; |
| UnicodeSet fBeginWordSet; |
| UnicodeSet fMarkSet; |
| DictionaryMatcher *fDictionary; |
| |
| public: |
| |
| /** |
| * <p>Default constructor.</p> |
| * |
| * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
| * engine is deleted. |
| */ |
| BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~BurmeseBreakEngine(); |
| |
| protected: |
| /** |
| * <p>Divide up a range of known dictionary characters.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| * @param rangeEnd The end of the range of dictionary characters |
| * @param foundBreaks Output of C array of int32_t break positions, or 0 |
| * @return The number of breaks found |
| */ |
| virtual int32_t divideUpDictionaryRange( UText *text, |
| int32_t rangeStart, |
| int32_t rangeEnd, |
| UStack &foundBreaks ) const; |
| |
| }; |
| |
| /******************************************************************* |
| * KhmerBreakEngine |
| */ |
| |
| /** |
| * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a |
| * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> |
| * |
| * <p>After it is constructed a KhmerBreakEngine may be shared between |
| * threads without synchronization.</p> |
| */ |
| class KhmerBreakEngine : public DictionaryBreakEngine { |
| private: |
| /** |
| * The set of characters handled by this engine |
| * @internal |
| */ |
| |
| UnicodeSet fKhmerWordSet; |
| UnicodeSet fEndWordSet; |
| UnicodeSet fBeginWordSet; |
| UnicodeSet fMarkSet; |
| DictionaryMatcher *fDictionary; |
| |
| public: |
| |
| /** |
| * <p>Default constructor.</p> |
| * |
| * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
| * engine is deleted. |
| */ |
| KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~KhmerBreakEngine(); |
| |
| protected: |
| /** |
| * <p>Divide up a range of known dictionary characters.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| * @param rangeEnd The end of the range of dictionary characters |
| * @param foundBreaks Output of C array of int32_t break positions, or 0 |
| * @return The number of breaks found |
| */ |
| virtual int32_t divideUpDictionaryRange( UText *text, |
| int32_t rangeStart, |
| int32_t rangeEnd, |
| UStack &foundBreaks ) const; |
| |
| }; |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| /******************************************************************* |
| * CjkBreakEngine |
| */ |
| |
| //indicates language/script that the CjkBreakEngine will handle |
| enum LanguageType { |
| kKorean, |
| kChineseJapanese |
| }; |
| |
| /** |
| * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a |
| * dictionary with costs associated with each word and |
| * Viterbi decoding to determine CJK-specific breaks.</p> |
| */ |
| class CjkBreakEngine : public DictionaryBreakEngine { |
| protected: |
| /** |
| * The set of characters handled by this engine |
| * @internal |
| */ |
| UnicodeSet fHangulWordSet; |
| UnicodeSet fHanWordSet; |
| UnicodeSet fKatakanaWordSet; |
| UnicodeSet fHiraganaWordSet; |
| |
| DictionaryMatcher *fDictionary; |
| const Normalizer2 *nfkcNorm2; |
| |
| public: |
| |
| /** |
| * <p>Default constructor.</p> |
| * |
| * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
| * engine is deleted. The DictionaryMatcher must contain costs for each word |
| * in order for the dictionary to work properly. |
| */ |
| CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~CjkBreakEngine(); |
| |
| protected: |
| /** |
| * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| * @param rangeEnd The end of the range of dictionary characters |
| * @param foundBreaks Output of C array of int32_t break positions, or 0 |
| * @return The number of breaks found |
| */ |
| virtual int32_t divideUpDictionaryRange( UText *text, |
| int32_t rangeStart, |
| int32_t rangeEnd, |
| UStack &foundBreaks ) const; |
| |
| }; |
| |
| #endif |
| |
| U_NAMESPACE_END |
| |
| /* DICTBE_H */ |
| #endif |