Victor Chang | 7322950 | 2020-09-17 13:39:19 +0100 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * |
| 6 | * Copyright (C) 1999-2014 International Business Machines |
| 7 | * Corporation and others. All Rights Reserved. |
| 8 | * |
| 9 | ******************************************************************************* |
| 10 | * file name: rbbidata.h |
| 11 | * encoding: UTF-8 |
| 12 | * tab size: 8 (not used) |
| 13 | * indentation:4 |
| 14 | * |
| 15 | * RBBI data formats Includes |
| 16 | * |
| 17 | * Structs that describe the format of the Binary RBBI data, |
| 18 | * as it is stored in ICU's data file. |
| 19 | * |
| 20 | * RBBIDataWrapper - Instances of this class sit between the |
| 21 | * raw data structs and the RulesBasedBreakIterator objects |
| 22 | * that are created by applications. The wrapper class |
| 23 | * provides reference counting for the underlying data, |
| 24 | * and direct pointers to data that would not otherwise |
| 25 | * be accessible without ugly pointer arithmetic. The |
| 26 | * wrapper does not attempt to provide any higher level |
| 27 | * abstractions for the data itself. |
| 28 | * |
| 29 | * There will be only one instance of RBBIDataWrapper for any |
| 30 | * set of RBBI run time data being shared by instances |
| 31 | * (clones) of RulesBasedBreakIterator. |
| 32 | */ |
| 33 | |
| 34 | #ifndef __RBBIDATA_H__ |
| 35 | #define __RBBIDATA_H__ |
| 36 | |
| 37 | #include "unicode/utypes.h" |
| 38 | #include "unicode/udata.h" |
| 39 | #include "udataswp.h" |
| 40 | |
| 41 | /** |
| 42 | * Swap RBBI data. See udataswp.h. |
| 43 | * @internal |
| 44 | */ |
| 45 | U_CAPI int32_t U_EXPORT2 |
| 46 | ubrk_swap(const UDataSwapper *ds, |
| 47 | const void *inData, int32_t length, void *outData, |
| 48 | UErrorCode *pErrorCode); |
| 49 | |
| 50 | #ifdef __cplusplus |
| 51 | |
| 52 | #include "unicode/uobject.h" |
| 53 | #include "unicode/unistr.h" |
| 54 | #include "unicode/uversion.h" |
| 55 | #include "umutex.h" |
| 56 | #include "utrie2.h" |
| 57 | |
| 58 | U_NAMESPACE_BEGIN |
| 59 | |
// The current RBBI data format version, one byte per version field.
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {
    5, 0, 0, 0
};
| 62 | |
| 63 | /* |
| 64 | * The following structs map exactly onto the raw data from ICU common data file. |
| 65 | */ |
| 66 | struct RBBIDataHeader { |
| 67 | uint32_t fMagic; /* == 0xbla0 */ |
| 68 | UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ |
| 69 | /* if there is one associated with this data. */ |
| 70 | /* (version originates in rbbi, is copied to UDataInfo) */ |
| 71 | uint32_t fLength; /* Total length in bytes of this RBBI Data, */ |
| 72 | /* including all sections, not just the header. */ |
| 73 | uint32_t fCatCount; /* Number of character categories. */ |
| 74 | |
| 75 | /* */ |
| 76 | /* Offsets and sizes of each of the subsections within the RBBI data. */ |
| 77 | /* All offsets are bytes from the start of the RBBIDataHeader. */ |
| 78 | /* All sizes are in bytes. */ |
| 79 | /* */ |
| 80 | uint32_t fFTable; /* forward state transition table. */ |
| 81 | uint32_t fFTableLen; |
| 82 | uint32_t fRTable; /* Offset to the reverse state transition table. */ |
| 83 | uint32_t fRTableLen; |
| 84 | uint32_t fTrie; /* Offset to Trie data for character categories */ |
| 85 | uint32_t fTrieLen; |
| 86 | uint32_t fRuleSource; /* Offset to the source for for the break */ |
| 87 | uint32_t fRuleSourceLen; /* rules. Stored UChar *. */ |
| 88 | uint32_t fStatusTable; /* Offset to the table of rule status values */ |
| 89 | uint32_t fStatusTableLen; |
| 90 | |
| 91 | uint32_t fReserved[6]; /* Reserved for expansion */ |
| 92 | |
| 93 | }; |
| 94 | |
| 95 | |
| 96 | |
/*
 *  One row of a state transition table.
 */
struct RBBIStateTableRow {
    /* Non-zero if this row is for an accepting state.
     *   Value 0:   not an accepting state.
     *        -1:   unconditional accepting state.
     *  positive:   look-ahead match has completed; the actual boundary
     *              position happened earlier. The value here == fLookAhead
     *              in the earlier state, at the actual boundary position. */
    int16_t          fAccepting;

    /* Non-zero if this row is for a state that corresponds to a '/' in the
     * rule source. The value is the same as the fAccepting value for the
     * rule (which will appear in a different state). */
    int16_t          fLookAhead;

    /* Non-zero if this row covers a {tagged} position from a rule. The value
     * is the index in the StatusTable of the set of matching tags
     * (rule status values). */
    int16_t          fTagIdx;

    int16_t          fReserved;

    /* Next state, indexed by char category.
     * Variable-length array declared with length 1 to disable bounds
     * checkers; the array size is actually fData->fHeader->fCatCount.
     * CAUTION: see RBBITableBuilder::getTableSize() before changing
     * anything here. */
    uint16_t         fNextState[1];
};
| 122 | |
| 123 | |
/*
 *  Header of a state transition table, immediately followed by its rows.
 */
struct RBBIStateTable {
    /* Number of states. */
    uint32_t         fNumStates;

    /* Length of a state table row, in bytes. */
    uint32_t         fRowLen;

    /* Option flags for this state table. */
    uint32_t         fFlags;

    /* Reserved. */
    uint32_t         fReserved;

    /* First RBBIStateTableRow begins here.
     * Variable-length array declared with length 1 to disable bounds
     * checkers. (Making it char[] simplifies the ugly address arithmetic
     * for indexing variable length rows.) */
    char             fTableData[1];
};
| 135 | |
/*
 *  State table flag values.
 *  NOTE(review): presumably bit values combined in RBBIStateTable::fFlags --
 *  confirm against the code that reads and writes that field.
 */
typedef enum {
    RBBI_LOOKAHEAD_HARD_BREAK = 0x1,
    RBBI_BOF_REQUIRED         = 0x2
} RBBIStateTableFlags;
| 140 | |
| 141 | |
| 142 | /* */ |
| 143 | /* The reference counting wrapper class */ |
| 144 | /* */ |
| 145 | class RBBIDataWrapper : public UMemory { |
| 146 | public: |
| 147 | enum EDontAdopt { |
| 148 | kDontAdopt |
| 149 | }; |
| 150 | RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); |
| 151 | RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); |
| 152 | RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); |
| 153 | ~RBBIDataWrapper(); |
| 154 | |
| 155 | static UBool isDataVersionAcceptable(const UVersionInfo version); |
| 156 | |
| 157 | void init0(); |
| 158 | void init(const RBBIDataHeader *data, UErrorCode &status); |
| 159 | RBBIDataWrapper *addReference(); |
| 160 | void removeReference(); |
| 161 | UBool operator ==(const RBBIDataWrapper &other) const; |
| 162 | int32_t hashCode(); |
| 163 | const UnicodeString &getRuleSourceString() const; |
| 164 | void printData(); |
| 165 | void printTable(const char *heading, const RBBIStateTable *table); |
| 166 | |
| 167 | /* */ |
| 168 | /* Pointers to items within the data */ |
| 169 | /* */ |
| 170 | const RBBIDataHeader *fHeader; |
| 171 | const RBBIStateTable *fForwardTable; |
| 172 | const RBBIStateTable *fReverseTable; |
| 173 | const UChar *fRuleSource; |
| 174 | const int32_t *fRuleStatusTable; |
| 175 | |
| 176 | /* number of int32_t values in the rule status table. Used to sanity check indexing */ |
| 177 | int32_t fStatusMaxIdx; |
| 178 | |
| 179 | UTrie2 *fTrie; |
| 180 | |
| 181 | private: |
| 182 | u_atomic_int32_t fRefCount; |
| 183 | UDataMemory *fUDataMem; |
| 184 | UnicodeString fRuleString; |
| 185 | UBool fDontFreeData; |
| 186 | |
| 187 | RBBIDataWrapper(const RBBIDataWrapper &other); /* forbid copying of this class */ |
| 188 | RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /* forbid copying of this class */ |
| 189 | }; |
| 190 | |
| 191 | |
| 192 | |
| 193 | U_NAMESPACE_END |
| 194 | |
Victor Chang | d8aa9d5 | 2021-01-05 23:49:57 +0000 | [diff] [blame^] | 195 | U_CFUNC UBool rbbi_cleanup(void); |
| 196 | |
Victor Chang | 7322950 | 2020-09-17 13:39:19 +0100 | [diff] [blame] | 197 | #endif /* C++ */ |
| 198 | |
| 199 | #endif |