blob: 1a859fed557379eb425953ccbe2de8bfc45bdfa2 [file] [log] [blame]
/*
 * Copyright 2020 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
#include "include/private/SkTFitsIn.h"
#include "include/private/SkTemplates.h"
#include "modules/skshaper/src/SkUnicode.h"
#include "src/utils/SkUTF.h"
#include <unicode/ubidi.h>
#include <unicode/ubrk.h>
#include <unicode/utext.h>
#include <unicode/utypes.h>
#include <functional>
#include <memory>
#include <vector>
17
Julia Lavrova1798f4f2020-08-26 14:22:48 +000018using SkUnicodeBidi = std::unique_ptr<UBiDi, SkFunctionWrapper<decltype(ubidi_close), ubidi_close>>;
Julia Lavrova90787fe2020-07-20 17:32:03 +000019using ICUUText = std::unique_ptr<UText, SkFunctionWrapper<decltype(utext_close), utext_close>>;
20using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionWrapper<decltype(ubrk_close), ubrk_close>>;
21
22/** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */
23static inline SkUnichar utf8_next(const char** ptr, const char* end) {
24 SkUnichar val = SkUTF::NextUTF8(ptr, end);
25 return val < 0 ? 0xFFFD : val;
26}
27
Julia Lavrova1798f4f2020-08-26 14:22:48 +000028class SkBidiIterator_icu : public SkBidiIterator {
29 SkUnicodeBidi fBidi;
30public:
31 explicit SkBidiIterator_icu(SkUnicodeBidi bidi) : fBidi(std::move(bidi)) {}
32 Position getLength() override { return ubidi_getLength(fBidi.get()); }
33 Level getLevelAt(Position pos) override { return ubidi_getLevelAt(fBidi.get(), pos); }
34
35 static std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t utf16[], int utf16Units, Direction dir) {
36 UErrorCode status = U_ZERO_ERROR;
37 SkUnicodeBidi bidi(ubidi_openSized(utf16Units, 0, &status));
38 if (U_FAILURE(status)) {
39 SkDEBUGF("Bidi error: %s", u_errorName(status));
40 return nullptr;
41 }
42 SkASSERT(bidi);
43 uint8_t bidiLevel = (dir == SkBidiIterator::kLTR) ? UBIDI_LTR : UBIDI_RTL;
44 // The required lifetime of utf16 isn't well documented.
45 // It appears it isn't used after ubidi_setPara except through ubidi_getText.
46 ubidi_setPara(bidi.get(), (const UChar*)utf16, utf16Units, bidiLevel, nullptr, &status);
47 if (U_FAILURE(status)) {
48 SkDEBUGF("Bidi error: %s", u_errorName(status));
49 return nullptr;
50 }
51 return std::unique_ptr<SkBidiIterator>(new SkBidiIterator_icu(std::move(bidi)));
52 }
53
54 // ICU bidi iterator works with utf16 but clients (Flutter for instance) may work with utf8
55 // This method allows the clients not to think about all these details
56 static std::unique_ptr<SkBidiIterator> makeBidiIterator(const char utf8[], int utf8Units, Direction dir) {
57 // Convert utf8 into utf16 since ubidi only accepts utf16
58 if (!SkTFitsIn<int32_t>(utf8Units)) {
59 SkDEBUGF("Bidi error: text too long");
60 return nullptr;
61 }
62
63 // Getting the length like this seems to always set U_BUFFER_OVERFLOW_ERROR
64 int utf16Units = SkUTF::UTF8ToUTF16(nullptr, 0, utf8, utf8Units);
65 if (utf16Units < 0) {
66 SkDEBUGF("Bidi error: Invalid utf8 input");
67 return nullptr;
68 }
69 std::unique_ptr<uint16_t[]> utf16(new uint16_t[utf16Units]);
70 SkDEBUGCODE(int dstLen =) SkUTF::UTF8ToUTF16(utf16.get(), utf16Units, utf8, utf8Units);
71 SkASSERT(dstLen == utf16Units);
72
73 return makeBidiIterator(utf16.get(), utf16Units, dir);
74 }
75
76 // This method returns the final results only: a list of bidi regions
77 // (this is all SkParagraph really needs; SkShaper however uses the iterator itself)
78 static std::vector<Region> getBidiRegions(const char utf8[], int utf8Units, Direction dir) {
79
80 auto bidiIterator = makeBidiIterator(utf8, utf8Units, dir);
81 std::vector<Region> bidiRegions;
82 const char* start8 = utf8;
83 const char* end8 = utf8 + utf8Units;
84 SkBidiIterator::Level currentLevel = 0;
85
86 Position pos8 = 0;
87 Position pos16 = 0;
88 Position end16 = bidiIterator->getLength();
89 while (pos16 < end16) {
90 auto level = bidiIterator->getLevelAt(pos16);
91 if (pos16 == 0) {
92 currentLevel = level;
93 } else if (level != currentLevel) {
94 auto end = start8 - utf8;
95 bidiRegions.emplace_back(pos8, end, currentLevel);
96 currentLevel = level;
97 pos8 = end;
98 }
99 SkUnichar u = utf8_next(&start8, end8);
100 pos16 += SkUTF::ToUTF16(u);
101 }
102 auto end = start8 - utf8;
103 if (end != pos8) {
104 bidiRegions.emplace_back(pos8, end, currentLevel);
105 }
106 return bidiRegions;
107 }
108};
109
110void SkBidiIterator::ReorderVisual(const Level runLevels[], int levelsCount,
111 int32_t logicalFromVisual[]) {
112 ubidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
113}
Julia Lavrova90787fe2020-07-20 17:32:03 +0000114
115class SkUnicode_icu : public SkUnicode {
116
117 static UBreakIteratorType convertType(UBreakType type) {
118 switch (type) {
119 case UBreakType::kLines: return UBRK_LINE;
120 case UBreakType::kGraphemes: return UBRK_CHARACTER;
121 case UBreakType::kWords: return UBRK_WORD;
122 default:
123 SkDEBUGF("Convert error: wrong break type");
124 return UBRK_CHARACTER;
125 }
126 }
127
Julia Lavrova05ce2812020-09-01 20:51:05 +0000128 static int convertUtf8ToUtf16(const char* utf8, size_t utf8Units, std::unique_ptr<uint16_t[]>* utf16) {
129 int utf16Units = SkUTF::UTF8ToUTF16(nullptr, 0, utf8, utf8Units);
130 if (utf16Units < 0) {
131 SkDEBUGF("Convert error: Invalid utf8 input");
132 return utf16Units;
133 }
134 *utf16 = std::unique_ptr<uint16_t[]>(new uint16_t[utf16Units]);
135 SkDEBUGCODE(int dstLen =) SkUTF::UTF8ToUTF16(utf16->get(), utf16Units, utf8, utf8Units);
136 SkASSERT(dstLen == utf16Units);
137 return utf16Units;
138 }
139
Julia Lavrova1798f4f2020-08-26 14:22:48 +0000140 static bool extractBidi(const char utf8[], int utf8Units, TextDirection dir, std::vector<BidiRegion>* bidiRegions) {
Julia Lavrova90787fe2020-07-20 17:32:03 +0000141
142 // Convert to UTF16 since for now bidi iterator only operates on utf16
143 std::unique_ptr<uint16_t[]> utf16;
Julia Lavrova05ce2812020-09-01 20:51:05 +0000144 auto utf16Units = convertUtf8ToUtf16(utf8, utf8Units, &utf16);
Julia Lavrova90787fe2020-07-20 17:32:03 +0000145 if (utf16Units < 0) {
146 return false;
147 }
148
149 // Create bidi iterator
150 UErrorCode status = U_ZERO_ERROR;
Julia Lavrova1798f4f2020-08-26 14:22:48 +0000151 SkUnicodeBidi bidi(ubidi_openSized(utf16Units, 0, &status));
Julia Lavrova90787fe2020-07-20 17:32:03 +0000152 if (U_FAILURE(status)) {
153 SkDEBUGF("Bidi error: %s", u_errorName(status));
154 return false;
155 }
156 SkASSERT(bidi);
Julia Lavrova1798f4f2020-08-26 14:22:48 +0000157 uint8_t bidiLevel = (dir == TextDirection::kLTR) ? UBIDI_LTR : UBIDI_RTL;
Julia Lavrova90787fe2020-07-20 17:32:03 +0000158 // The required lifetime of utf16 isn't well documented.
159 // It appears it isn't used after ubidi_setPara except through ubidi_getText.
160 ubidi_setPara(bidi.get(), (const UChar*)utf16.get(), utf16Units, bidiLevel, nullptr, &status);
161 if (U_FAILURE(status)) {
162 SkDEBUGF("Bidi error: %s", u_errorName(status));
163 return false;
164 }
165
166 // Iterate through bidi regions and the result positions into utf8
167 const char* start8 = utf8;
168 const char* end8 = utf8 + utf8Units;
169 BidiLevel currentLevel = 0;
170
171 Position pos8 = 0;
172 Position pos16 = 0;
173 Position end16 = ubidi_getLength(bidi.get());
174 while (pos16 < end16) {
175 auto level = ubidi_getLevelAt(bidi.get(), pos16);
176 if (pos16 == 0) {
177 currentLevel = level;
178 } else if (level != currentLevel) {
179 Position end = start8 - utf8;
180 bidiRegions->emplace_back(pos8, end, currentLevel);
181 currentLevel = level;
182 pos8 = end;
183 }
184 SkUnichar u = utf8_next(&start8, end8);
185 pos16 += SkUTF::ToUTF16(u);
186 }
187 Position end = start8 - utf8;
188 if (end != pos8) {
189 bidiRegions->emplace_back(pos8, end, currentLevel);
190 }
191 return true;
192 }
193
194 static bool extractWords(uint16_t utf16[], int utf16Units, std::vector<Position>* words) {
195
196 UErrorCode status = U_ZERO_ERROR;
197
198 UBreakIteratorType breakType = convertType(UBreakType::kWords);
199 ICUBreakIterator iterator(ubrk_open(breakType, uloc_getDefault(), nullptr, 0, &status));
200 if (U_FAILURE(status)) {
201 SkDEBUGF("Break error: %s", u_errorName(status));
202 return false;
203 }
204 SkASSERT(iterator);
205
206 UText sUtf16UText = UTEXT_INITIALIZER;
207 ICUUText utf16UText(utext_openUChars(&sUtf16UText, (UChar*)utf16, utf16Units, &status));
208 if (U_FAILURE(status)) {
209 SkDEBUGF("Break error: %s", u_errorName(status));
210 return false;
211 }
212
213 ubrk_setUText(iterator.get(), utf16UText.get(), &status);
214 if (U_FAILURE(status)) {
215 SkDEBUGF("Break error: %s", u_errorName(status));
216 return false;
217 }
218
219 // Get the words
220 int32_t pos = ubrk_first(iterator.get());
221 while (pos != UBRK_DONE) {
222 words->emplace_back(pos);
223 pos = ubrk_next(iterator.get());
224 }
225
226 return true;
227 }
228
229 static bool extractPositions(const char utf8[], int utf8Units, UBreakType type, std::function<void(int, int)> add) {
230
231 UErrorCode status = U_ZERO_ERROR;
232 UText sUtf8UText = UTEXT_INITIALIZER;
233 ICUUText text(utext_openUTF8(&sUtf8UText, &utf8[0], utf8Units, &status));
234
235 if (U_FAILURE(status)) {
236 SkDEBUGF("Break error: %s", u_errorName(status));
237 return false;
238 }
239 SkASSERT(text);
240
241 ICUBreakIterator iterator(ubrk_open(convertType(type), uloc_getDefault(), nullptr, 0, &status));
242 if (U_FAILURE(status)) {
243 SkDEBUGF("Break error: %s", u_errorName(status));
244 }
245
246 ubrk_setUText(iterator.get(), text.get(), &status);
247 if (U_FAILURE(status)) {
248 SkDEBUGF("Break error: %s", u_errorName(status));
249 return false;
250 }
251
252 auto iter = iterator.get();
253 int32_t pos = ubrk_first(iter);
254 while (pos != UBRK_DONE) {
255 add(pos, ubrk_getRuleStatus(iter));
256 pos = ubrk_next(iter);
257 }
258 return true;
259 }
260
261 static bool extractWhitespaces(const char utf8[], int utf8Units, std::vector<Position>* whitespaces) {
262
263 const char* start = utf8;
264 const char* end = utf8 + utf8Units;
265 const char* ch = start;
266 while (ch < end) {
267 auto index = ch - start;
268 auto unichar = utf8_next(&ch, end);
269 if (u_isWhitespace(unichar)) {
270 auto ending = ch - start;
271 for (auto k = index; k < ending; ++k) {
272 whitespaces->emplace_back(k);
273 }
274 }
275 }
276 return true;
277 }
278
279public:
280 ~SkUnicode_icu() override { }
Julia Lavrova1798f4f2020-08-26 14:22:48 +0000281 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
282 SkBidiIterator::Direction dir) override {
283 return SkBidiIterator_icu::makeBidiIterator(text, count, dir);
284 }
285 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[], int count,
286 SkBidiIterator::Direction dir) override {
287 return SkBidiIterator_icu::makeBidiIterator(text, count, dir);
288 }
Julia Lavrova90787fe2020-07-20 17:32:03 +0000289
Julia Lavrova1798f4f2020-08-26 14:22:48 +0000290 bool getBidiRegions(const char utf8[], int utf8Units, TextDirection dir, std::vector<BidiRegion>* results) override {
Julia Lavrova90787fe2020-07-20 17:32:03 +0000291 return extractBidi(utf8, utf8Units, dir, results);
292 }
293
294 bool getLineBreaks(const char utf8[], int utf8Units, std::vector<LineBreakBefore>* results) override {
295
296 return extractPositions(utf8, utf8Units, UBreakType::kLines,
297 [results](int pos, int status) {
298 results->emplace_back(pos,status == UBRK_LINE_HARD
299 ? LineBreakType::kHardLineBreak
300 : LineBreakType::kSoftLineBreak);
301 });
302 }
303
304 bool getWords(const char utf8[], int utf8Units, std::vector<Position>* results) override {
305
306 // Convert to UTF16 since we want the results in utf16
307 std::unique_ptr<uint16_t[]> utf16;
Julia Lavrova05ce2812020-09-01 20:51:05 +0000308 auto utf16Units = convertUtf8ToUtf16(utf8, utf8Units, &utf16);
Julia Lavrova90787fe2020-07-20 17:32:03 +0000309 if (utf16Units < 0) {
310 return false;
311 }
312
313 return extractWords(utf16.get(), utf16Units, results);
314 }
315
316 bool getGraphemes(const char utf8[], int utf8Units, std::vector<Position>* results) override {
317
318 return extractPositions(utf8, utf8Units, UBreakType::kGraphemes,
319 [results](int pos, int status) { results->emplace_back(pos);
320 });
321 }
322
323 bool getWhitespaces(const char utf8[], int utf8Units, std::vector<Position>* results) override {
324
325 return extractWhitespaces(utf8, utf8Units, results);
326 }
327
328 void reorderVisual(const BidiLevel runLevels[], int levelsCount, int32_t logicalFromVisual[]) override {
329 ubidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
330 }
331};
332
Julia Lavrova05ce2812020-09-01 20:51:05 +0000333std::unique_ptr<SkUnicode> SkUnicode::Make() { return std::make_unique<SkUnicode_icu>(); }