| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /** |
| ************************************************************************************ |
| * Copyright (C) 2006-2012, International Business Machines Corporation and others. * |
| * All Rights Reserved. * |
| ************************************************************************************ |
| */ |
| |
| #ifndef BRKENG_H |
| #define BRKENG_H |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uobject.h" |
| #include "unicode/utext.h" |
| #include "unicode/uscript.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| class UnicodeSet; |
| class UStack; |
| class DictionaryMatcher; |
| |
| /******************************************************************* |
| * LanguageBreakEngine |
| */ |
| |
| /** |
| * <p>LanguageBreakEngines implement language-specific knowledge for |
| * finding text boundaries within a run of characters belonging to a |
| * specific set. The boundaries will be of a specific kind, e.g. word, |
| * line, etc.</p> |
| * |
| * <p>LanguageBreakEngines should normally be implemented so as to |
| * be shared between threads without locking.</p> |
| */ |
| class LanguageBreakEngine : public UMemory { |
| public: |
| |
| /** |
| * <p>Constructor; takes no arguments.</p> |
| */ |
| LanguageBreakEngine(); |
| |
| /** |
| * <p>Destructor. Declared virtual because this class is the base of |
| * the concrete break engines.</p> |
| */ |
| virtual ~LanguageBreakEngine(); |
| |
| /** |
| * <p>Report whether this engine is able to handle the given character |
| * for the given kind of break. Pure virtual; every concrete engine |
| * supplies its own answer.</p> |
| * |
| * @param c The character that begins a run the engine might handle. |
| * @param breakType The kind of text break the caller wants to determine. |
| * @return TRUE if this engine handles that character and break type. |
| */ |
| virtual UBool handles(UChar32 c, int32_t breakType) const = 0; |
| |
| /** |
| * <p>Locate any breaks inside a run of the supplied text. Pure virtual; |
| * every concrete engine supplies its own implementation.</p> |
| * |
| * @param text A UText representing the text. It is left positioned |
| * at the end of the run of characters which the engine |
| * is capable of handling. |
| * @param startPos Where the run starts within the supplied text. |
| * @param endPos Where the run ends within the supplied text. |
| * @param reverse Whether the caller is looking for breaks in a |
| * reverse direction. |
| * @param breakType The type of break desired, or -1. |
| * @param foundBreaks A UStack, passed by reference, for the breaks |
| * found, if any. |
| * @return The number of breaks found. |
| */ |
| virtual int32_t findBreaks(UText *text, int32_t startPos, int32_t endPos, |
| UBool reverse, int32_t breakType, |
| UStack &foundBreaks) const = 0; |
| |
| }; |
| |
| /******************************************************************* |
| * LanguageBreakFactory |
| */ |
| |
| /** |
| * <p>LanguageBreakFactorys find and return a LanguageBreakEngine |
| * that can determine breaks for characters in a specific set, if |
| * such an object can be found.</p> |
| * |
| * <p>If a LanguageBreakFactory is to be shared between threads, |
| * appropriate synchronization must be used; there is none internal |
| * to the factory.</p> |
| * |
| * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can |
| * normally be shared between threads without synchronization, unless |
| * the specific subclass of LanguageBreakFactory indicates otherwise.</p> |
| * |
| * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine |
| * it returns when it itself is deleted, unless the specific subclass of |
| * LanguageBreakFactory indicates otherwise. Naturally, the factory should |
| * not be deleted until the LanguageBreakEngines it has returned are no |
| * longer needed.</p> |
| */ |
| class LanguageBreakFactory : public UMemory { |
| public: |
| |
| /** |
| * <p>Constructor; takes no arguments.</p> |
| */ |
| LanguageBreakFactory(); |
| |
| /** |
| * <p>Destructor. Declared virtual because this class is the base of |
| * the concrete factories.</p> |
| */ |
| virtual ~LanguageBreakFactory(); |
| |
| /** |
| * <p>Look up a LanguageBreakEngine able to find the requested kind of |
| * break for the set of characters that contains the supplied character. |
| * Which characters form a set is decided by the available engines, |
| * not by the caller. Pure virtual.</p> |
| * |
| * @param c The character that begins the run for which an |
| * engine is sought. |
| * @param breakType The kind of text break for which an engine is sought. |
| * @return A LanguageBreakEngine with the desired characteristics, |
| * or 0 if there is none. |
| */ |
| virtual const LanguageBreakEngine *getEngineFor(UChar32 c, |
| int32_t breakType) = 0; |
| |
| }; |
| |
| /******************************************************************* |
| * UnhandledEngine |
| */ |
| |
| /** |
| * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that |
| * handles characters that no other LanguageBreakEngine is available to |
| * handle. It is told the character and the type of break; at its |
| * discretion it may handle more than the specified character (e.g., |
| * the entire script to which that character belongs).</p> |
| * |
| * <p>UnhandledEngines may not be shared between threads without |
| * external synchronization.</p> |
| */ |
| |
| class UnhandledEngine : public LanguageBreakEngine { |
| public: |
| |
| /** |
| * <p>Constructor.</p> |
| * |
| * @param status Error code reference supplied by the caller. How it is |
| * read or set is defined by the implementation, which |
| * is not part of this header. |
| */ |
| UnhandledEngine(UErrorCode &status); |
| |
| /** |
| * <p>Destructor (virtual, as in the base class).</p> |
| */ |
| virtual ~UnhandledEngine(); |
| |
| /** |
| * <p>Report whether this engine handles the given character for the |
| * given kind of break. Implements LanguageBreakEngine::handles.</p> |
| * |
| * @param c The character that begins a run the engine might handle. |
| * @param breakType The kind of text break the caller wants to determine. |
| * @return TRUE if this engine handles that character and break type. |
| */ |
| virtual UBool handles(UChar32 c, int32_t breakType) const; |
| |
| /** |
| * <p>Locate any breaks inside a run of the supplied text. Implements |
| * LanguageBreakEngine::findBreaks.</p> |
| * |
| * @param text A UText representing the text. It is left positioned |
| * at the end of the run of characters which the engine |
| * is capable of handling. |
| * @param startPos Where the run starts within the supplied text. |
| * @param endPos Where the run ends within the supplied text. |
| * @param reverse Whether the caller is looking for breaks in a |
| * reverse direction. |
| * @param breakType The type of break desired, or -1. |
| * @param foundBreaks A UStack, passed by reference, for the breaks |
| * found, if any. |
| * @return The number of breaks found. |
| */ |
| virtual int32_t findBreaks(UText *text, int32_t startPos, int32_t endPos, |
| UBool reverse, int32_t breakType, |
| UStack &foundBreaks) const; |
| |
| /** |
| * <p>Instruct the engine to handle a particular character for a |
| * particular break type.</p> |
| * |
| * @param c The character which the engine should handle. |
| * @param breakType The type of text break for which the engine should |
| * handle c. |
| */ |
| virtual void handleCharacter(UChar32 c, int32_t breakType); |
| |
| private: |
| |
| /** |
| * The sets of characters handled, one array slot for each break type. |
| * @internal |
| */ |
| UnicodeSet *fHandled[4]; |
| |
| }; |
| |
| /******************************************************************* |
| * ICULanguageBreakFactory |
| */ |
| |
| /** |
| * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for |
| * ICU. It creates dictionary-based LanguageBreakEngines from dictionary |
| * data in the ICU data file.</p> |
| */ |
| class ICULanguageBreakFactory : public LanguageBreakFactory { |
| public: |
| |
| /** |
| * <p>Standard constructor.</p> |
| * |
| * @param status Error code reference supplied by the caller. How it is |
| * read or set is defined by the implementation, which |
| * is not part of this header. |
| */ |
| ICULanguageBreakFactory(UErrorCode &status); |
| |
| /** |
| * <p>Destructor (virtual, as in the base class).</p> |
| */ |
| virtual ~ICULanguageBreakFactory(); |
| |
| /** |
| * <p>Look up a LanguageBreakEngine able to find the requested kind of |
| * break for the set of characters that contains the supplied character. |
| * Which characters form a set is decided by the available engines. |
| * Implements LanguageBreakFactory::getEngineFor.</p> |
| * |
| * @param c The character that begins the run for which an |
| * engine is sought. |
| * @param breakType The kind of text break for which an engine is sought. |
| * @return A LanguageBreakEngine with the desired characteristics, |
| * or 0. |
| */ |
| virtual const LanguageBreakEngine *getEngineFor(UChar32 c, |
| int32_t breakType); |
| |
| protected: |
| |
| /** |
| * <p>Create a LanguageBreakEngine for the set of characters that |
| * contains the supplied character, for the specified break type. |
| * Virtual, so a subclass may substitute its own creation logic.</p> |
| * |
| * @param c The character that begins the run for which an |
| * engine is sought. |
| * @param breakType The kind of text break for which an engine is sought. |
| * @return A LanguageBreakEngine with the desired characteristics, |
| * or 0. |
| */ |
| virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, |
| int32_t breakType); |
| |
| /** |
| * <p>Create a DictionaryMatcher for the specified script and break |
| * type.</p> |
| * |
| * @param script An ISO 15924 script code that identifies the |
| * dictionary to be created. |
| * @param breakType The kind of text break for which a dictionary is |
| * sought. |
| * @return A DictionaryMatcher with the desired characteristics, |
| * or NULL. |
| */ |
| virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, |
| int32_t breakType); |
| |
| private: |
| |
| /** |
| * The stack of break engines created by this factory. |
| * @internal |
| */ |
| UStack *fEngines; |
| |
| }; |
| |
| U_NAMESPACE_END |
| |
| /* BRKENG_H */ |
| #endif |