blob: 6e2dbee5b618be3d292a2a0a5bc8677aeadf75ba [file] [log] [blame]
Fredrik Roubert0596fae2017-04-18 21:34:02 +02001// © 2016 and later: Unicode, Inc. and others.
Fredrik Roubert64339d32016-10-21 19:43:16 +02002// License & terms of use: http://www.unicode.org/copyright.html
Craig Cornelius54dcd9b2013-02-15 14:03:14 -08003/*
4*******************************************************************************
Fredrik Roubert8de051c2016-03-10 13:13:27 +01005* Copyright (C) 2014-2016, International Business Machines
Craig Cornelius54dcd9b2013-02-15 14:03:14 -08006* Corporation and others. All Rights Reserved.
7*******************************************************************************
* dictionarydata.cpp
9*
10* created on: 2012may31
11* created by: Markus W. Scherer & Maxime Serrano
12*/
13
14#include "dictionarydata.h"
15#include "unicode/ucharstrie.h"
16#include "unicode/bytestrie.h"
17#include "unicode/udata.h"
18#include "cmemory.h"
19
20#if !UCONFIG_NO_BREAK_ITERATION
21
22U_NAMESPACE_BEGIN
23
ccornelius59d709d2014-02-20 10:29:46 -080024const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
25const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
26const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
27const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080028
ccornelius59d709d2014-02-20 10:29:46 -080029const int32_t DictionaryData::TRANSFORM_NONE = 0;
30const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
31const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
32const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
33
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080034DictionaryMatcher::~DictionaryMatcher() {
35}
36
37UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38 udata_close(file);
39}
40
41int32_t UCharsDictionaryMatcher::getType() const {
42 return DictionaryData::TRIE_TYPE_UCHARS;
43}
44
ccorneliusf9878a22014-11-20 18:09:39 -080045int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46 int32_t *lengths, int32_t *cpLengths, int32_t *values,
47 int32_t *prefix) const {
48
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080049 UCharsTrie uct(characters);
Fredrik Roubert8de051c2016-03-10 13:13:27 +010050 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
ccorneliusf9878a22014-11-20 18:09:39 -080051 int32_t wordCount = 0;
52 int32_t codePointsMatched = 0;
53
54 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
55 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
Fredrik Roubert8de051c2016-03-10 13:13:27 +010056 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
ccorneliusf9878a22014-11-20 18:09:39 -080057 codePointsMatched += 1;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080058 if (USTRINGTRIE_HAS_VALUE(result)) {
ccorneliusf9878a22014-11-20 18:09:39 -080059 if (wordCount < limit) {
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080060 if (values != NULL) {
ccorneliusf9878a22014-11-20 18:09:39 -080061 values[wordCount] = uct.getValue();
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080062 }
ccorneliusf9878a22014-11-20 18:09:39 -080063 if (lengths != NULL) {
64 lengths[wordCount] = lengthMatched;
65 }
66 if (cpLengths != NULL) {
67 cpLengths[wordCount] = codePointsMatched;
68 }
69 ++wordCount;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080070 }
71 if (result == USTRINGTRIE_FINAL_VALUE) {
72 break;
73 }
74 }
75 else if (result == USTRINGTRIE_NO_MATCH) {
76 break;
77 }
ccorneliusf9878a22014-11-20 18:09:39 -080078 if (lengthMatched >= maxLength) {
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080079 break;
80 }
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080081 }
ccorneliusf9878a22014-11-20 18:09:39 -080082
83 if (prefix != NULL) {
84 *prefix = codePointsMatched;
85 }
86 return wordCount;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -080087}
88
89BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90 udata_close(file);
91}
92
93UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95 if (c == 0x200D) {
96 return 0xFF;
97 } else if (c == 0x200C) {
98 return 0xFE;
99 }
100 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101 if (delta < 0 || 0xFD < delta) {
102 return U_SENTINEL;
103 }
104 return (UChar32)delta;
105 }
106 return c;
107}
108
109int32_t BytesDictionaryMatcher::getType() const {
110 return DictionaryData::TRIE_TYPE_BYTES;
111}
112
ccorneliusf9878a22014-11-20 18:09:39 -0800113int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114 int32_t *lengths, int32_t *cpLengths, int32_t *values,
115 int32_t *prefix) const {
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800116 BytesTrie bt(characters);
Fredrik Roubert8de051c2016-03-10 13:13:27 +0100117 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
ccorneliusf9878a22014-11-20 18:09:39 -0800118 int32_t wordCount = 0;
119 int32_t codePointsMatched = 0;
120
121 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
122 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
Fredrik Roubert8de051c2016-03-10 13:13:27 +0100123 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
ccorneliusf9878a22014-11-20 18:09:39 -0800124 codePointsMatched += 1;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800125 if (USTRINGTRIE_HAS_VALUE(result)) {
ccorneliusf9878a22014-11-20 18:09:39 -0800126 if (wordCount < limit) {
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800127 if (values != NULL) {
ccorneliusf9878a22014-11-20 18:09:39 -0800128 values[wordCount] = bt.getValue();
ccorneliusfceb3982014-04-16 12:27:14 -0700129 }
ccorneliusf9878a22014-11-20 18:09:39 -0800130 if (lengths != NULL) {
131 lengths[wordCount] = lengthMatched;
132 }
133 if (cpLengths != NULL) {
134 cpLengths[wordCount] = codePointsMatched;
135 }
136 ++wordCount;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800137 }
138 if (result == USTRINGTRIE_FINAL_VALUE) {
139 break;
140 }
141 }
142 else if (result == USTRINGTRIE_NO_MATCH) {
143 break;
144 }
ccorneliusf9878a22014-11-20 18:09:39 -0800145 if (lengthMatched >= maxLength) {
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800146 break;
147 }
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800148 }
ccorneliusf9878a22014-11-20 18:09:39 -0800149
150 if (prefix != NULL) {
151 *prefix = codePointsMatched;
152 }
153 return wordCount;
Craig Cornelius54dcd9b2013-02-15 14:03:14 -0800154}
155
156
157U_NAMESPACE_END
158
159U_NAMESPACE_USE
160
161U_CAPI int32_t U_EXPORT2
162udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
163 void *outData, UErrorCode *pErrorCode) {
164 const UDataInfo *pInfo;
165 int32_t headerSize;
166 const uint8_t *inBytes;
167 uint8_t *outBytes;
168 const int32_t *inIndexes;
169 int32_t indexes[DictionaryData::IX_COUNT];
170 int32_t i, offset, size;
171
172 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
174 pInfo = (const UDataInfo *)((const char *)inData + 4);
175 if (!(pInfo->dataFormat[0] == 0x44 &&
176 pInfo->dataFormat[1] == 0x69 &&
177 pInfo->dataFormat[2] == 0x63 &&
178 pInfo->dataFormat[3] == 0x74 &&
179 pInfo->formatVersion[0] == 1)) {
180 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
182 *pErrorCode = U_UNSUPPORTED_ERROR;
183 return 0;
184 }
185
186 inBytes = (const uint8_t *)inData + headerSize;
187 outBytes = (uint8_t *)outData + headerSize;
188
189 inIndexes = (const int32_t *)inBytes;
190 if (length >= 0) {
191 length -= headerSize;
192 if (length < (int32_t)(sizeof(indexes))) {
193 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195 return 0;
196 }
197 }
198
199 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
200 indexes[i] = udata_readInt32(ds, inIndexes[i]);
201 }
202
203 size = indexes[DictionaryData::IX_TOTAL_SIZE];
204
205 if (length >= 0) {
206 if (length < size) {
207 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209 return 0;
210 }
211
212 if (inBytes != outBytes) {
213 uprv_memcpy(outBytes, inBytes, size);
214 }
215
216 offset = 0;
217 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218 offset = (int32_t)sizeof(indexes);
219 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225 // nothing to do
226 } else {
227 udata_printError(ds, "udict_swap(): unknown trie type!\n");
228 *pErrorCode = U_UNSUPPORTED_ERROR;
229 return 0;
230 }
231
232 // these next two sections are empty in the current format,
233 // but may be used later.
234 offset = nextOffset;
235 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236 offset = nextOffset;
237 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238 offset = nextOffset;
239 }
240 return headerSize + size;
241}
242#endif