blob: 3a515fad3202ef39a1e91fe41192734803ea492c [file] [log] [blame]
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "compile/Pseudolocalizer.h"
Adam Lesinskice5e56e2016-10-21 17:56:45 -070018
Adam Lesinski393b5f02015-12-17 13:03:11 -080019#include "util/Util.h"
20
Adam Lesinskid5083f62017-01-16 15:07:21 -080021using android::StringPiece;
22
Adam Lesinski393b5f02015-12-17 13:03:11 -080023namespace aapt {
24
// Filler words used to lengthen pseudolocalized text. The literal is emitted
// verbatim into generated strings, so its exact spelling must not change.
static const std::string kExpansionString(
    "one two three "
    "four five six seven eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen nineteen twenty");

// Unicode directional formatting characters used by the bidi method:
// right-to-left mark, right-to-left override, pop directional formatting.
static const std::string kRlm("\u200f");
static const std::string kRlo("\u202e");
static const std::string kPdf("\u202c");

// Markers placed around placeholders by the accent method.
static const std::string kPlaceholderOpen("\u00bb");
static const std::string kPlaceholderClose("\u00ab");

// Delimiters of a message argument.
static constexpr char kArgStart = '{';
static constexpr char kArgEnd = '}';
Adam Lesinski393b5f02015-12-17 13:03:11 -080042
43class PseudoMethodNone : public PseudoMethodImpl {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070044 public:
Adam Lesinskid5083f62017-01-16 15:07:21 -080045 std::string Text(const StringPiece& text) override { return text.to_string(); }
46 std::string Placeholder(const StringPiece& text) override { return text.to_string(); }
Adam Lesinski393b5f02015-12-17 13:03:11 -080047};
48
49class PseudoMethodBidi : public PseudoMethodImpl {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070050 public:
Adam Lesinskice5e56e2016-10-21 17:56:45 -070051 std::string Text(const StringPiece& text) override;
52 std::string Placeholder(const StringPiece& text) override;
Adam Lesinski393b5f02015-12-17 13:03:11 -080053};
54
55class PseudoMethodAccent : public PseudoMethodImpl {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070056 public:
Adam Lesinskice5e56e2016-10-21 17:56:45 -070057 PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {}
58 std::string Start() override;
59 std::string End() override;
60 std::string Text(const StringPiece& text) override;
61 std::string Placeholder(const StringPiece& text) override;
Adam Lesinskicacb28f2016-10-19 12:18:14 -070062
63 private:
Adam Lesinskice5e56e2016-10-21 17:56:45 -070064 size_t depth_;
65 size_t word_count_;
66 size_t length_;
Adam Lesinski393b5f02015-12-17 13:03:11 -080067};
68
Adam Lesinskice5e56e2016-10-21 17:56:45 -070069Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
70 SetMethod(method);
Adam Lesinski393b5f02015-12-17 13:03:11 -080071}
72
Adam Lesinskice5e56e2016-10-21 17:56:45 -070073void Pseudolocalizer::SetMethod(Method method) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070074 switch (method) {
Adam Lesinski393b5f02015-12-17 13:03:11 -080075 case Method::kNone:
Adam Lesinskice5e56e2016-10-21 17:56:45 -070076 impl_ = util::make_unique<PseudoMethodNone>();
Adam Lesinskicacb28f2016-10-19 12:18:14 -070077 break;
Adam Lesinski393b5f02015-12-17 13:03:11 -080078 case Method::kAccent:
Adam Lesinskice5e56e2016-10-21 17:56:45 -070079 impl_ = util::make_unique<PseudoMethodAccent>();
Adam Lesinskicacb28f2016-10-19 12:18:14 -070080 break;
Adam Lesinski393b5f02015-12-17 13:03:11 -080081 case Method::kBidi:
Adam Lesinskice5e56e2016-10-21 17:56:45 -070082 impl_ = util::make_unique<PseudoMethodBidi>();
Adam Lesinskicacb28f2016-10-19 12:18:14 -070083 break;
84 }
Adam Lesinski393b5f02015-12-17 13:03:11 -080085}
86
Adam Lesinskice5e56e2016-10-21 17:56:45 -070087std::string Pseudolocalizer::Text(const StringPiece& text) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070088 std::string out;
Adam Lesinskice5e56e2016-10-21 17:56:45 -070089 size_t depth = last_depth_;
Adam Lesinskicacb28f2016-10-19 12:18:14 -070090 size_t lastpos, pos;
91 const size_t length = text.size();
92 const char* str = text.data();
93 bool escaped = false;
94 for (lastpos = pos = 0; pos < length; pos++) {
95 char16_t c = str[pos];
96 if (escaped) {
97 escaped = false;
98 continue;
Adam Lesinski393b5f02015-12-17 13:03:11 -080099 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700100 if (c == '\'') {
101 escaped = true;
102 continue;
103 }
104
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700105 if (c == kArgStart) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700106 depth++;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700107 } else if (c == kArgEnd && depth) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700108 depth--;
109 }
110
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700111 if (last_depth_ != depth || pos == length - 1) {
112 bool pseudo = ((last_depth_ % 2) == 0);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700113 size_t nextpos = pos;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700114 if (!pseudo || depth == last_depth_) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700115 nextpos++;
116 }
117 size_t size = nextpos - lastpos;
118 if (size) {
Adam Lesinskid5083f62017-01-16 15:07:21 -0800119 std::string chunk = text.substr(lastpos, size).to_string();
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700120 if (pseudo) {
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700121 chunk = impl_->Text(chunk);
122 } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
123 chunk = impl_->Placeholder(chunk);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700124 }
125 out.append(chunk);
126 }
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700127 if (pseudo && depth < last_depth_) { // End of message
128 out.append(impl_->End());
129 } else if (!pseudo && depth > last_depth_) { // Start of message
130 out.append(impl_->Start());
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700131 }
132 lastpos = nextpos;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700133 last_depth_ = depth;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700134 }
135 }
136 return out;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800137}
138
/**
 * Returns the replacement for a character that has a pseudolocalized form.
 *
 * ASCII letters and the characters '!', '?' and '$' are mapped to a string
 * literal holding a look-alike character.
 *
 * 'F' was missing from the uppercase table although every other letter
 * (including lowercase 'f') was present; it now maps to U+0191.
 *
 * @param c the character to look up.
 * @return the replacement string, or nullptr when `c` has no replacement.
 */
static const char* PseudolocalizeChar(const char c) {
  switch (c) {
    case 'a': return "\u00e5";
    case 'b': return "\u0253";
    case 'c': return "\u00e7";
    case 'd': return "\u00f0";
    case 'e': return "\u00e9";
    case 'f': return "\u0192";
    case 'g': return "\u011d";
    case 'h': return "\u0125";
    case 'i': return "\u00ee";
    case 'j': return "\u0135";
    case 'k': return "\u0137";
    case 'l': return "\u013c";
    case 'm': return "\u1e3f";
    case 'n': return "\u00f1";
    case 'o': return "\u00f6";
    case 'p': return "\u00fe";
    case 'q': return "\u0051";
    case 'r': return "\u0155";
    case 's': return "\u0161";
    case 't': return "\u0163";
    case 'u': return "\u00fb";
    case 'v': return "\u0056";
    case 'w': return "\u0175";
    case 'x': return "\u0445";
    case 'y': return "\u00fd";
    case 'z': return "\u017e";
    case 'A': return "\u00c5";
    case 'B': return "\u03b2";
    case 'C': return "\u00c7";
    case 'D': return "\u00d0";
    case 'E': return "\u00c9";
    case 'F': return "\u0191";
    case 'G': return "\u011c";
    case 'H': return "\u0124";
    case 'I': return "\u00ce";
    case 'J': return "\u0134";
    case 'K': return "\u0136";
    case 'L': return "\u013b";
    case 'M': return "\u1e3e";
    case 'N': return "\u00d1";
    case 'O': return "\u00d6";
    case 'P': return "\u00de";
    case 'Q': return "\u0071";
    case 'R': return "\u0154";
    case 'S': return "\u0160";
    case 'T': return "\u0162";
    case 'U': return "\u00db";
    case 'V': return "\u03bd";
    case 'W': return "\u0174";
    case 'X': return "\u00d7";
    case 'Y': return "\u00dd";
    case 'Z': return "\u017d";
    case '!': return "\u00a1";
    case '?': return "\u00bf";
    case '$': return "\u20ac";
    default:
      return nullptr;
  }
}
253
// Returns true when `c` is one of the characters accepted as the final
// character of a '%' placeholder, false otherwise.
static bool IsPossibleNormalPlaceholderEnd(const char c) {
  switch (c) {
    case 's': case 'S':
    case 'c': case 'C':
    case 'd':
    case 'o':
    case 'x': case 'X':
    case 'f':
    case 'e': case 'E':
    case 'g': case 'G':
    case 'a': case 'A':
    case 'b': case 'B':
    case 'h': case 'H':
    case '%':
    case 'n':
      return true;
    default:
      return false;
  }
}
302
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700303static std::string PseudoGenerateExpansion(const unsigned int length) {
304 std::string result = kExpansionString;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700305 const char* s = result.data();
306 if (result.size() < length) {
307 result += " ";
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700308 result += PseudoGenerateExpansion(length - result.size());
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700309 } else {
310 int ext = 0;
311 // Should contain only whole words, so looking for a space
312 for (unsigned int i = length + 1; i < result.size(); ++i) {
313 ++ext;
314 if (s[i] == ' ') {
315 break;
316 }
Adam Lesinski393b5f02015-12-17 13:03:11 -0800317 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700318 result = result.substr(0, length + ext);
319 }
320 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800321}
322
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700323std::string PseudoMethodAccent::Start() {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700324 std::string result;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700325 if (depth_ == 0) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700326 result = "[";
327 }
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700328 word_count_ = length_ = 0;
329 depth_++;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700330 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800331}
332
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700333std::string PseudoMethodAccent::End() {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700334 std::string result;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700335 if (length_) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700336 result += " ";
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700337 result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700338 }
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700339 word_count_ = length_ = 0;
340 depth_--;
341 if (depth_ == 0) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700342 result += "]";
343 }
344 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800345}
346
347/**
348 * Converts characters so they look like they've been localized.
349 *
350 * Note: This leaves placeholder syntax untouched.
351 */
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700352std::string PseudoMethodAccent::Text(const StringPiece& source) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700353 const char* s = source.data();
354 std::string result;
355 const size_t I = source.size();
356 bool lastspace = true;
357 for (size_t i = 0; i < I; i++) {
358 char c = s[i];
359 if (c == '%') {
360 // Placeholder syntax, no need to pseudolocalize
361 std::string chunk;
362 bool end = false;
363 chunk.append(&c, 1);
364 while (!end && i + 1 < I) {
365 ++i;
366 c = s[i];
367 chunk.append(&c, 1);
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700368 if (IsPossibleNormalPlaceholderEnd(c)) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700369 end = true;
370 } else if (i + 1 < I && c == 't') {
371 ++i;
372 c = s[i];
373 chunk.append(&c, 1);
374 end = true;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800375 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700376 }
377 // Treat chunk as a placeholder unless it ends with %.
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700378 result += ((c == '%') ? chunk : Placeholder(chunk));
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700379 } else if (c == '<' || c == '&') {
380 // html syntax, no need to pseudolocalize
381 bool tag_closed = false;
382 while (!tag_closed && i < I) {
383 if (c == '&') {
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700384 std::string escape_text;
385 escape_text.append(&c, 1);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700386 bool end = false;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700387 size_t html_code_pos = i;
388 while (!end && html_code_pos < I) {
389 ++html_code_pos;
390 c = s[html_code_pos];
391 escape_text.append(&c, 1);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700392 // Valid html code
393 if (c == ';') {
394 end = true;
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700395 i = html_code_pos;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700396 }
397 // Wrong html code
398 else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
399 (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
400 end = true;
401 }
402 }
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700403 result += escape_text;
404 if (escape_text != "&lt;") {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700405 tag_closed = true;
406 }
407 continue;
408 }
409 if (c == '>') {
410 tag_closed = true;
411 result.append(&c, 1);
412 continue;
413 }
414 result.append(&c, 1);
415 i++;
416 c = s[i];
417 }
418 } else {
419 // This is a pure text that should be pseudolocalized
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700420 const char* p = PseudolocalizeChar(c);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700421 if (p != nullptr) {
422 result += p;
423 } else {
424 bool space = isspace(c);
Adam Lesinski393b5f02015-12-17 13:03:11 -0800425 if (lastspace && !space) {
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700426 word_count_++;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800427 }
428 lastspace = space;
429 result.append(&c, 1);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700430 }
431 // Count only pseudolocalizable chars and delimiters
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700432 length_++;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800433 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700434 }
435 return result;
436}
437
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700438std::string PseudoMethodAccent::Placeholder(const StringPiece& source) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700439 // Surround a placeholder with brackets
Adam Lesinskid5083f62017-01-16 15:07:21 -0800440 return kPlaceholderOpen + source.to_string() + kPlaceholderClose;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700441}
442
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700443std::string PseudoMethodBidi::Text(const StringPiece& source) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700444 const char* s = source.data();
445 std::string result;
446 bool lastspace = true;
447 bool space = true;
Igor Viarheichyk4fb65162017-07-06 15:23:51 -0700448 bool escape = false;
449 const char ESCAPE_CHAR = '\\';
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700450 for (size_t i = 0; i < source.size(); i++) {
451 char c = s[i];
Igor Viarheichyk4fb65162017-07-06 15:23:51 -0700452 if (!escape && c == ESCAPE_CHAR) {
453 escape = true;
454 continue;
455 }
456 space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't'));
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700457 if (lastspace && !space) {
458 // Word start
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700459 result += kRlm + kRlo;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700460 } else if (!lastspace && space) {
461 // Word end
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700462 result += kPdf + kRlm;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800463 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700464 lastspace = space;
Igor Viarheichyk4fb65162017-07-06 15:23:51 -0700465 if (escape) {
466 result.append(&ESCAPE_CHAR, 1);
467 escape=false;
468 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700469 result.append(&c, 1);
470 }
471 if (!lastspace) {
472 // End of last word
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700473 result += kPdf + kRlm;
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700474 }
475 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800476}
477
Adam Lesinskice5e56e2016-10-21 17:56:45 -0700478std::string PseudoMethodBidi::Placeholder(const StringPiece& source) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700479 // Surround a placeholder with directionality change sequence
Adam Lesinskid5083f62017-01-16 15:07:21 -0800480 return kRlm + kRlo + source.to_string() + kPdf + kRlm;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800481}
482
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700483} // namespace aapt