| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| // |
| // rbbirb.h |
| // |
| // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
| // All Rights Reserved. |
| // |
| // This file contains declarations for several classes from the |
| // Rule Based Break Iterator rule builder. |
| // |
| |
| |
| #ifndef RBBIRB_H |
| #define RBBIRB_H |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_BREAK_ITERATION |
| |
| #include <utility> |
| |
| #include "unicode/uobject.h" |
| #include "unicode/rbbi.h" |
| #include "unicode/uniset.h" |
| #include "unicode/parseerr.h" |
| #include "uhash.h" |
| #include "uvector.h" |
| #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
| // looks up references to $variables within a set. |
| |
| |
| U_NAMESPACE_BEGIN |
| |
// Forward declarations of rule-builder types that are defined elsewhere.
// Where this header uses them at all, it does so only through pointers,
// so the complete definitions are not required here.
class  RBBINode;
class  RBBIRuleScanner;
struct RBBIRuleTableEl;
class  RBBISetBuilder;
class  RBBITableBuilder;
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // RBBISymbolTable. Implements SymbolTable interface that is used by the |
| // UnicodeSet parser to resolve references to $variables. |
| // |
| //-------------------------------------------------------------------------------- |
| class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one |
| public: // of these structs for each entry. |
| RBBISymbolTableEntry(); |
| UnicodeString key; |
| RBBINode *val; |
| ~RBBISymbolTableEntry(); |
| |
| private: |
| RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class |
| RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class |
| }; |
| |
| |
| class RBBISymbolTable : public UMemory, public SymbolTable { |
| private: |
| const UnicodeString &fRules; |
| UHashtable *fHashTable; |
| RBBIRuleScanner *fRuleScanner; |
| |
| // These next two fields are part of the mechanism for passing references to |
| // already-constructed UnicodeSets back to the UnicodeSet constructor |
| // when the pattern includes $variable references. |
| const UnicodeString ffffString; // = "/uffff" |
| UnicodeSet *fCachedSetLookup; |
| |
| public: |
| // API inherited from class SymbolTable |
| virtual const UnicodeString* lookup(const UnicodeString& s) const; |
| virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; |
| virtual UnicodeString parseReference(const UnicodeString& text, |
| ParsePosition& pos, int32_t limit) const; |
| |
| // Additional Functions |
| RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); |
| virtual ~RBBISymbolTable(); |
| |
| virtual RBBINode *lookupNode(const UnicodeString &key) const; |
| virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); |
| |
| #ifdef RBBI_DEBUG |
| virtual void rbbiSymtablePrint() const; |
| #else |
| // A do-nothing inline function for non-debug builds. Member funcs can't be empty |
| // or the call sites won't compile. |
| int32_t fFakeField; |
| #define rbbiSymtablePrint() fFakeField=0; |
| #endif |
| |
| private: |
| RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class |
| RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class |
| }; |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. |
| // |
| //-------------------------------------------------------------------------------- |
| class RBBIRuleBuilder : public UMemory { |
| public: |
| |
| // Create a rule based break iterator from a set of rules. |
| // This function is the main entry point into the rule builder. The |
| // public ICU API for creating RBBIs uses this function to do the actual work. |
| // |
| static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, |
| UParseError *parseError, |
| UErrorCode &status); |
| |
| public: |
| // The "public" functions and data members that appear below are accessed |
| // (and shared) by the various parts that make up the rule builder. They |
| // are NOT intended to be accessed by anything outside of the |
| // rule builder implementation. |
| RBBIRuleBuilder(const UnicodeString &rules, |
| UParseError *parseErr, |
| UErrorCode &status |
| ); |
| |
| virtual ~RBBIRuleBuilder(); |
| |
| /** |
| * Build the state tables and char class Trie from the source rules. |
| */ |
| RBBIDataHeader *build(UErrorCode &status); |
| |
| |
| /** |
| * Fold together redundant character classes (table columns) and |
| * redundant states (table rows). Done after initial table generation, |
| * before serializing the result. |
| */ |
| void optimizeTables(); |
| |
| char *fDebugEnv; // controls debug trace output |
| UErrorCode *fStatus; // Error reporting. Keeping status |
| UParseError *fParseError; // here avoids passing it everywhere. |
| const UnicodeString &fRules; // The rule string that we are compiling |
| UnicodeString fStrippedRules; // The rule string, with comments stripped. |
| |
| RBBIRuleScanner *fScanner; // The scanner. |
| RBBINode *fForwardTree; // The parse trees, generated by the scanner, |
| RBBINode *fReverseTree; // then manipulated by subsequent steps. |
| RBBINode *fSafeFwdTree; |
| RBBINode *fSafeRevTree; |
| |
| RBBINode **fDefaultTree; // For rules not qualified with a ! |
| // the tree to which they belong to. |
| |
| UBool fChainRules; // True for chained Unicode TR style rules. |
| // False for traditional regexp rules. |
| |
| UBool fLBCMNoChain; // True: suppress chaining of rules on |
| // chars with LineBreak property == CM. |
| |
| UBool fLookAheadHardBreak; // True: Look ahead matches cause an |
| // immediate break, no continuing for the |
| // longest match. |
| |
| RBBISetBuilder *fSetBuilder; // Set and Character Category builder. |
| UVector *fUSetNodes; // Vector of all uset nodes. |
| |
| RBBITableBuilder *fForwardTable; // State transition table, build time form. |
| |
| UVector *fRuleStatusVals; // The values that can be returned |
| // from getRuleStatus(). |
| |
| RBBIDataHeader *flattenData(); // Create the flattened (runtime format) |
| // data tables.. |
| private: |
| RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class |
| RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class |
| }; |
| |
| |
| |
| |
| //---------------------------------------------------------------------------- |
| // |
| // RBBISetTableEl is an entry in the hash table of UnicodeSets that have |
| // been encountered. The val Node will be of nodetype uset |
| // and contain pointers to the actual UnicodeSets. |
| // The Key is the source string for initializing the set. |
| // |
| // The hash table is used to avoid creating duplicate |
| // unnamed (not $var references) UnicodeSets. |
| // |
| // Memory Management: |
| // The Hash Table owns these RBBISetTableEl structs and |
| // the key strings. It does NOT own the val nodes. |
| // |
| //---------------------------------------------------------------------------- |
| struct RBBISetTableEl { |
| UnicodeString *key; |
| RBBINode *val; |
| }; |
| |
/**
 * Two int32_t values bundled together. Used for pairs of states
 * or pairs of character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
| |
| |
| //---------------------------------------------------------------------------- |
| // |
| // RBBIDebugPrintf Printf equivalent, for debugging output. |
| // Conditional compilation of the implementation lets us |
| // get rid of the stdio dependency in environments where it |
| // is unavailable. |
| // |
| //---------------------------------------------------------------------------- |
| #ifdef RBBI_DEBUG |
| #include <stdio.h> |
| #define RBBIDebugPrintf printf |
| #define RBBIDebugPuts puts |
| #else |
| #undef RBBIDebugPrintf |
| #define RBBIDebugPuts(arg) |
| #endif |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| |
| #endif |
| |
| |
| |