blob: 44e69399e397534895558796faa0c66809f6b5e9 [file] [log] [blame]
Victor Chang73229502020-09-17 13:39:19 +01001// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* collationdatareader.h
9*
10* created on: 2013feb07
11* created by: Markus W. Scherer
12*/
13
14#ifndef __COLLATIONDATAREADER_H__
15#define __COLLATIONDATAREADER_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "unicode/udata.h"
22
23struct UDataMemory;
24
25U_NAMESPACE_BEGIN
26
27struct CollationTailoring;
28
29/**
30 * Collation binary data reader.
31 */
32struct U_I18N_API CollationDataReader /* all static */ {
33 // The following constants are also copied into source/common/ucol_swp.cpp.
34 // Keep them in sync!
 // Positions in the int32_t indexes[] array at the start of the collation data.
 // The enumerators are implicitly numbered from 0; the trailing "// N" comments
 // spell out each value so that it can be matched against the format description.
35 enum {
36 /**
37 * Number of int32_t indexes.
38 *
39 * Can be 2 if there are only options.
40 * Can be 7 or 8 if there are only options and a script reordering.
41 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
42 */
43 IX_INDEXES_LENGTH, // 0
44 /**
45 * Bits 31..24: numericPrimary, for numeric collation
46 * 23..16: fast Latin format version (0 = no fast Latin table)
47 * 15.. 0: options bit set
48 */
49 IX_OPTIONS, // 1
50 IX_RESERVED2, // 2: unused/reserved, 0
51 IX_RESERVED3, // 3: unused/reserved, 0
52
53 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
54 IX_JAMO_CE32S_START, // 4
55
56 // Byte offsets from the start of the data, after the generic header.
57 // The indexes[] are at byte offset 0, other data follows.
58 // Each data item is aligned properly.
59 // The data items should be in descending order of unit size,
60 // to minimize the need for padding.
61 // Each item's byte length is given by the difference between its offset and
62 // the next index/offset value.
63 /** Byte offset to int32_t reorderCodes[]. */
64 IX_REORDER_CODES_OFFSET, // 5
65 /**
66 * Byte offset to uint8_t reorderTable[].
67 * Empty table if <256 bytes (padding only).
68 * Otherwise 256 bytes or more (with padding).
69 */
70 IX_REORDER_TABLE_OFFSET, // 6
71 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
72 IX_TRIE_OFFSET, // 7
73
74 IX_RESERVED8_OFFSET, // 8
75 /** Byte offset to int64_t ces[]. */
76 IX_CES_OFFSET, // 9
77 IX_RESERVED10_OFFSET, // 10
78 /** Byte offset to uint32_t ce32s[]. */
79 IX_CE32S_OFFSET, // 11
80
81 /** Byte offset to uint32_t rootElements[]. */
82 IX_ROOT_ELEMENTS_OFFSET, // 12
83 /** Byte offset to UChar *contexts[]. */
84 IX_CONTEXTS_OFFSET, // 13
85 /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
86 IX_UNSAFE_BWD_OFFSET, // 14
87 /** Byte offset to uint16_t fastLatinTable[]. */
88 IX_FAST_LATIN_TABLE_OFFSET, // 15
89
90 /** Byte offset to uint16_t scripts[]. */
91 IX_SCRIPTS_OFFSET, // 16
92 /**
93 * Byte offset to UBool compressibleBytes[].
94 * Empty table if <256 bytes (padding only).
95 * Otherwise 256 bytes or more (with padding).
96 */
97 IX_COMPRESSIBLE_BYTES_OFFSET, // 17
98 IX_RESERVED18_OFFSET, // 18
99 IX_TOTAL_SIZE // 19
100 };
101
 /**
 * Reads collation binary data into a CollationTailoring.
 * Only the declaration is visible in this header; the notes below are
 * inferred from the signature and should be confirmed against the definition.
 *
 * @param base      presumably the tailoring that the data builds on
 *                  (e.g. the root collator's); TODO confirm whether it may be NULL
 * @param inBytes   presumably the start of the binary collation data
 * @param inLength  presumably the length of inBytes in bytes; TODO confirm
 *                  whether a negative value is accepted
 * @param tailoring output object that receives the data that was read
 * @param errorCode in/out error code parameter
 */
102 static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
103 CollationTailoring &tailoring, UErrorCode &errorCode);
104
 /**
 * Acceptance check for a data item, based on its UDataInfo.
 * NOTE(review): the parameter list has the shape of the acceptance callback
 * declared in unicode/udata.h (UDataMemoryIsAcceptable); confirm there.
 * The acceptance criteria themselves are not visible in this header.
 */
105 static UBool U_CALLCONV
106 isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
107
108private:
109 CollationDataReader(); // no constructor: declared private, this all-static struct is not meant to be instantiated
110};
111
112/*
113 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
114 * Format version 5.
115 *
116 * The root collation data is stored in the ucadata.icu file.
117 * Tailorings are stored inside .res resource bundle files, with a complete file header.
118 *
119 * Collation data begins with a standard ICU data file header
120 * (DataHeader, see ucmndata.h and unicode/udata.h).
121 * The UDataInfo.dataVersion field contains the UCA and other version numbers,
122 * see the comments for CollationTailoring.version.
123 *
124 * After the header, the file contains the following parts.
125 * Constants are defined as enum values of the CollationDataReader class.
126 * See also the Collation class.
127 *
128 * int32_t indexes[indexesLength];
129 * The indexes array has variable length.
130 * Some tailorings only need the length and the options,
131 * others only add reorderCodes and the reorderTable,
132 * some need to store mappings.
133 * Only as many indexes are stored as needed to read all of the data.
134 *
135 * Index 0: indexesLength
136 * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
137 * Index 2..3: Unused/reserved/0.
138 * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
139 * are stored in a short, contiguous part of the ce32s array.
140 *
141 * Indexes 5..19 are byte offsets in ascending order.
142 * Each byte offset marks the start of the next part in the data file,
143 * and the end of the previous one.
144 * When two consecutive byte offsets are the same (or too short),
145 * then the corresponding part is empty.
146 * Byte offsets are offsets from after the header,
147 * that is, from the beginning of the indexes[].
148 * Each part starts at an offset with proper alignment for its data.
149 * If necessary, the previous part may include padding bytes to achieve this alignment.
150 * The last byte offset that is stored in the indexes indicates the total size of the data
151 * (starting with the indexes).
152 *
153 * int32_t reorderCodes[]; -- empty in root
154 * The list of script and reordering codes.
155 *
156 * Beginning with format version 5, this array may optionally
157 * have trailing entries with a full list of reorder ranges
158 * as described for CollationSettings::reorderRanges.
159 *
160 * Script or reorder codes are first and do not exceed 16-bit values.
161 * Range limits are stored in the upper 16 bits, and are never 0.
162 * Split this array into reorder codes and ranges at the first entry
163 * with non-zero upper 16 bits.
164 *
165 * If the ranges are missing but needed for split-reordered primary lead bytes,
166 * then they are regenerated at load time.
167 *
168 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
169 * Primary-weight lead byte permutation table.
170 * Normally present when the reorderCodes are, but can be built at load time.
171 *
172 * Beginning with format version 5, a 0 entry at a non-zero index
173 * (which is otherwise an illegal value)
174 * means that the primary lead byte is "split"
175 * (there are different offsets for primaries that share that lead byte)
176 * and the reordering offset must be determined via the reorder ranges
177 * that are either stored as part of the reorderCodes array
178 * or regenerated at load time.
179 *
180 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
181 * The trie holds the main collation data. Each code point is mapped to a 32-bit value.
182 * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
183 * in which case it is a special CE32 and contains a 4-bit tag and further data.
184 * See the Collation class for details.
185 *
186 * The trie has a value for each lead surrogate code unit with some bits encoding
187 * collective properties of the 1024 supplementary characters whose UTF-16 form starts with
188 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
189 *
190 * int64_t ces[];
191 * 64-bit CEs and expansions that cannot be stored in a more compact form.
192 *
193 * uint32_t ce32s[];
194 * CE32s for expansions in compact form, and for characters whose trie values
195 * contain special data.
196 *
197 * uint32_t rootElements[]; -- empty in all tailorings
198 * Compact storage for all of the CEs that occur in the root collation.
199 * See the CollationRootElements class.
200 *
201 * UChar *contexts[];
202 * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
203 *
204 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
205 * Serialized form of characters that are unsafe when iterating backwards,
206 * and at the end of an identical string prefix.
207 * Back up to a safe character.
208 * Lead surrogates are "unsafe" when any of their corresponding supplementary
209 * code points are unsafe.
210 * Does not include [:^lccc=0:][:^tccc=0:].
211 * For each tailoring, the root unsafeBackwardSet is subtracted.
212 * (As a result, in many tailorings no set needs to be stored.)
213 *
214 * uint16_t fastLatinTable[];
215 * Optional optimization for Latin text.
216 * See the CollationFastLatin class.
217 *
218 * uint16_t scripts[]; -- empty in all tailorings
219 * Format version 5:
220 * uint16_t numScripts;
221 * uint16_t scriptsIndex[numScripts+16];
222 * uint16_t scriptStarts[];
223 * See CollationData::numScripts etc.
224 *
225 * Format version 4:
226 * Table of the reordering groups with their first and last lead bytes,
227 * and their script and reordering codes.
228 * See CollationData::scripts.
229 *
230 * UBool compressibleBytes[]; -- empty in all tailorings
231 * Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
232 *
233 * -----------------
234 * Changes for formatVersion 5 (ICU 55)
235 *
236 * Reordering moves single scripts, not groups of scripts.
237 * Reorder ranges are optionally appended to the reorderCodes,
238 * and a 0 entry in the reorderTable indicates a split lead byte.
239 * The scripts data has a new format.
240 *
241 * The rootElements may contain secondary and tertiary weights below common=05.
242 * (Used for small Hiragana letters.)
243 * Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
244 * There are no other data structure changes, but builder code needs to be able to handle such data.
245 *
246 * The collation element for the merge separator code point U+FFFE
247 * does not necessarily have special, unique secondary/tertiary weights any more.
248 */
249
250U_NAMESPACE_END
251
252#endif // !UCONFIG_NO_COLLATION
253#endif // __COLLATIONDATAREADER_H__