blob: 3749f16799e6c5b784f61fdce196d4b71477b643 [file] [log] [blame]
Victor Chang73229502020-09-17 13:39:19 +01001// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1999-2014 International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: rbbidata.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* RBBI data formats Includes
16*
17* Structs that describe the format of the binary RBBI data,
18* as it is stored in ICU's data file.
19*
20* RBBIDataWrapper - Instances of this class sit between the
21* raw data structs and the RulesBasedBreakIterator objects
22* that are created by applications. The wrapper class
23* provides reference counting for the underlying data,
24* and direct pointers to data that would not otherwise
25* be accessible without ugly pointer arithmetic. The
26* wrapper does not attempt to provide any higher level
27* abstractions for the data itself.
28*
29* There will be only one instance of RBBIDataWrapper for any
30* set of RBBI run time data being shared by instances
31* (clones) of RulesBasedBreakIterator.
32*/
33
34#ifndef __RBBIDATA_H__
35#define __RBBIDATA_H__
36
37#include "unicode/utypes.h"
38#include "unicode/udata.h"
39#include "udataswp.h"
40
41/**
42 * Swap RBBI data. See udataswp.h.
 *
 * Declared outside the C++-only section of this header, so the prototype
 * is visible to C translation units as well.
 *
 * NOTE(review): the parameter list (swapper, input buffer, length, output
 * buffer, error code) appears to follow the data-swapping function
 * convention declared in udataswp.h; consult that header for the exact
 * meaning of each argument and of the int32_t return value.
 *
43 * @internal
44 */
45U_CAPI int32_t U_EXPORT2
46ubrk_swap(const UDataSwapper *ds,
47 const void *inData, int32_t length, void *outData,
48 UErrorCode *pErrorCode);
49
50#ifdef __cplusplus
51
Victor Changce4bf3c2021-01-19 16:34:24 +000052#include "unicode/ucptrie.h"
Victor Chang73229502020-09-17 13:39:19 +010053#include "unicode/uobject.h"
54#include "unicode/unistr.h"
55#include "unicode/uversion.h"
56#include "umutex.h"
Victor Changce4bf3c2021-01-19 16:34:24 +000057
Victor Chang73229502020-09-17 13:39:19 +010058
59U_NAMESPACE_BEGIN
60
61// The current RBBI data format version.
// NOTE(review): four bytes, presumably in UVersionInfo order and checked
// against RBBIDataHeader::fFormatVersion by
// RBBIDataWrapper::isDataVersionAcceptable() -- confirm in the implementation.
Victor Changce4bf3c2021-01-19 16:34:24 +000062static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
Victor Chang73229502020-09-17 13:39:19 +010063
64/*
65 * The following structs map exactly onto the raw data from ICU common data file.
66 */
/* Header found at the start of a block of binary RBBI data. */
/* Every section offset below is measured from the start of this header. */
67struct RBBIDataHeader {
68 uint32_t fMagic; /* == 0xb1a0 (magic number; verify against the check in rbbidata.cpp) */
69 UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */
70 /* if there is one associated with this data. */
71 /* (version originates in rbbi, is copied to UDataInfo) */
72 uint32_t fLength; /* Total length in bytes of this RBBI Data, */
73 /* including all sections, not just the header. */
74 uint32_t fCatCount; /* Number of character categories. */
75
76 /* */
77 /* Offsets and sizes of each of the subsections within the RBBI data. */
78 /* All offsets are bytes from the start of the RBBIDataHeader. */
79 /* All sizes are in bytes. */
80 /* */
81 uint32_t fFTable; /* Offset to the forward state transition table. */
82 uint32_t fFTableLen; /* Size of the forward state transition table. */
83 uint32_t fRTable; /* Offset to the reverse state transition table. */
84 uint32_t fRTableLen; /* Size of the reverse state transition table. */
85 uint32_t fTrie; /* Offset to Trie data for character categories */
86 uint32_t fTrieLen; /* Size of the Trie data. */
87 uint32_t fRuleSource; /* Offset to the source for the break */
88 uint32_t fRuleSourceLen; /* rules. Stored UChar *. */
 /* NOTE(review): RBBIDataWrapper::fRuleSource is declared */
 /* const char *, so "UChar *" may be out of date -- confirm. */
89 uint32_t fStatusTable; /* Offset to the table of rule status values */
90 uint32_t fStatusTableLen; /* Size of the rule status table. */
91
92 uint32_t fReserved[6]; /* Reserved for expansion */
93
94};
95
96
97
// Layout of one row of a state transition table. T is the integer type used
// for every cell of the row; the typedefs that follow this template
// instantiate it with uint8_t and uint16_t.
Victor Changce4bf3c2021-01-19 16:34:24 +000098template <typename T>
99struct RBBIStateTableRowT {
100 T fAccepting; // Non-zero if this row is for an accepting state.
101 // Value 0: not an accepting state.
102 // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
103 // >1: Look-ahead match has completed.
104 // Actual boundary position happened earlier.
105 // Value here == fLookAhead in earlier
106 // state, at actual boundary pos.
107 T fLookAhead; // Non-zero if this row is for a state that
108 // corresponds to a '/' in the rule source.
109 // Value is the same as the fAccepting
110 // value for the rule (which will appear
111 // in a different state).
112 T fTagsIdx; // Non-zero if this row covers a {tagged} position
113 // from a rule. Value is the index in the
114 // StatusTable of the set of matching
115 // tags (rule status values)
116 T fNextState[1]; // Next State, indexed by char category.
117 // Variable-length array declared with length 1
118 // to disable bounds checkers.
119 // Array Size is actually fData->fHeader->fCatCount
120 // CAUTION: see RBBITableBuilder::getTableSize()
121 // before changing anything here.
Victor Chang73229502020-09-17 13:39:19 +0100122};
123
// Concrete row layouts: cells of 8 bits and of 16 bits respectively.
Victor Changce4bf3c2021-01-19 16:34:24 +0000124typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
125typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
126
127constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting
128
// A single row seen through either cell width.
// NOTE(review): presumably the RBBI_8BITS_ROWS bit of RBBIStateTable::fFlags
// decides whether r8 or r16 is the valid view for a given table -- confirm.
129union RBBIStateTableRow {
130 RBBIStateTableRow16 r16;
131 RBBIStateTableRow8 r8;
132};
Victor Chang73229502020-09-17 13:39:19 +0100133
// Fixed-size header of a state transition table. The rows themselves start
// at fTableData and are fRowLen bytes each.
134struct RBBIStateTable {
Victor Changce4bf3c2021-01-19 16:34:24 +0000135 uint32_t fNumStates; // Number of states.
136 uint32_t fRowLen; // Length of a state table row, in bytes.
137 uint32_t fDictCategoriesStart; // Char category number of the first dictionary
138 // char class, or the largest category number + 1
139 // if there are no dictionary categories.
140 uint32_t fLookAheadResultsSize; // Size of run-time array required for holding
141 // look-ahead results. Indexed by row.fLookAhead.
142 uint32_t fFlags; // Option Flags for this state table.
143 char fTableData[1]; // First RBBIStateTableRow begins here.
144 // Variable-length array declared with length 1
145 // to disable bounds checkers.
146 // (making it char[] simplifies ugly address
147 // arithmetic for indexing variable length rows.)
Victor Chang73229502020-09-17 13:39:19 +0100148};
149
// NOTE(review): these look like the individual bit values stored in
// RBBIStateTable::fFlags (1, 2 and 4 do not overlap) -- confirm at the use sites.
// RBBI_8BITS_ROWS presumably marks a table whose rows use RBBIStateTableRow8.
Victor Changce4bf3c2021-01-19 16:34:24 +0000150constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
151constexpr uint32_t RBBI_BOF_REQUIRED = 2;
152constexpr uint32_t RBBI_8BITS_ROWS = 4;
Victor Chang73229502020-09-17 13:39:19 +0100153
154
155/* */
156/* The reference counting wrapper class */
157/* */
/* Keeps direct pointers to the sections of one set of RBBI data together */
/* with a reference count (see addReference() / removeReference()). */
/* Copy construction and copy assignment are deleted. */
158class RBBIDataWrapper : public UMemory {
159public:
 /* Tag type selecting the second constructor overload. */
 /* NOTE(review): presumably tells the wrapper not to take ownership of */
 /* the data (cf. fDontFreeData) -- confirm in the implementation. */
160 enum EDontAdopt {
161 kDontAdopt
162 };
 /* Construct from a raw data header, from a raw data header plus the */
 /* kDontAdopt tag, or from a UDataMemory. Failures are reported via status. */
163 RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
164 RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
165 RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
166 ~RBBIDataWrapper();
167
 /* NOTE(review): presumably compares version with RBBI_DATA_FORMAT_VERSION -- confirm. */
168 static UBool isDataVersionAcceptable(const UVersionInfo version);
169
 /* Initialization helpers; their bodies are not part of this header. */
170 void init0();
171 void init(const RBBIDataHeader *data, UErrorCode &status);
 /* Reference counting interface. */
172 RBBIDataWrapper *addReference();
173 void removeReference();
174 UBool operator ==(const RBBIDataWrapper &other) const;
175 int32_t hashCode();
176 const UnicodeString &getRuleSourceString() const;
 /* Diagnostic output helpers. */
177 void printData();
178 void printTable(const char *heading, const RBBIStateTable *table);
179
180 /* */
181 /* Pointers to items within the data */
182 /* */
183 const RBBIDataHeader *fHeader;
184 const RBBIStateTable *fForwardTable;
185 const RBBIStateTable *fReverseTable;
Victor Changce4bf3c2021-01-19 16:34:24 +0000186 const char *fRuleSource;
Victor Chang73229502020-09-17 13:39:19 +0100187 const int32_t *fRuleStatusTable;
188
189 /* number of int32_t values in the rule status table. Used to sanity check indexing */
190 int32_t fStatusMaxIdx;
191
 /* NOTE(review): presumably the character-category trie located via */
 /* RBBIDataHeader::fTrie; ownership is not visible here -- confirm. */
Victor Changce4bf3c2021-01-19 16:34:24 +0000192 UCPTrie *fTrie;
Victor Chang73229502020-09-17 13:39:19 +0100193
194private:
 /* Atomic reference count backing addReference() / removeReference(). */
195 u_atomic_int32_t fRefCount;
196 UDataMemory *fUDataMem;
 /* NOTE(review): presumably the string returned by getRuleSourceString() -- confirm. */
197 UnicodeString fRuleString;
198 UBool fDontFreeData;
199
Victor Changce4bf3c2021-01-19 16:34:24 +0000200 RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */
201 RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */
Victor Chang73229502020-09-17 13:39:19 +0100202};
203
204
205
206U_NAMESPACE_END
207
// Cleanup entry point for the RBBI code; only declared here, defined elsewhere.
// NOTE(review): presumably registered with ICU's library cleanup mechanism and
// returns a UBool success indicator -- confirm at the definition.
Victor Changd8aa9d52021-01-05 23:49:57 +0000208U_CFUNC UBool rbbi_cleanup(void);
209
Victor Chang73229502020-09-17 13:39:19 +0100210#endif /* C++ */
211
212#endif