blob: de307737493ebe586e9dc7a6ee56d2e601b64b78 [file] [log] [blame]
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Seigo Nonakaf90c9b62017-10-09 11:10:24 -070017#include <sys/mman.h>
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <unistd.h>
21
22#include <algorithm>
23
Seigo Nonaka44a1df22017-08-31 18:22:38 -070024#include <core_jni_helpers.h>
25#include <minikin/Hyphenator.h>
Seigo Nonaka44a1df22017-08-31 18:22:38 -070026
27namespace android {
28
// Builds the absolute path of the system hyphenation pattern file for the given locale.
//
// The locale tag is lower-cased and wrapped with the system prefix and suffix, so "en-US" becomes
// "/system/usr/hyphen-data/hyph-en-us.hyb". No check is made that the file exists.
static std::string buildFileName(const std::string& locale) {
    constexpr char SYSTEM_HYPHENATOR_PREFIX[] = "/system/usr/hyphen-data/hyph-";
    constexpr char SYSTEM_HYPHENATOR_SUFFIX[] = ".hyb";
    std::string lowerLocale;
    lowerLocale.reserve(locale.size());
    // tolower() is only defined for EOF and for values representable as unsigned char. Receiving
    // each character as unsigned char avoids handing it a negative value when plain char is signed.
    std::transform(locale.begin(), locale.end(), std::back_inserter(lowerLocale),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    return SYSTEM_HYPHENATOR_PREFIX + lowerLocale + SYSTEM_HYPHENATOR_SUFFIX;
}
37
38static const uint8_t* mmapPatternFile(const std::string& locale) {
39 const std::string hyFilePath = buildFileName(locale);
Nick Kralevich4b3a08c2019-01-28 10:39:10 -080040 const int fd = open(hyFilePath.c_str(), O_RDONLY | O_CLOEXEC);
Seigo Nonakaf90c9b62017-10-09 11:10:24 -070041 if (fd == -1) {
42 return nullptr; // Open failed.
43 }
44
45 struct stat st = {};
46 if (fstat(fd, &st) == -1) { // Unlikely to happen.
47 close(fd);
48 return nullptr;
49 }
50
51 void* ptr = mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fd, 0 /* offset */);
52 close(fd);
53 if (ptr == MAP_FAILED) {
54 return nullptr;
55 }
56 return reinterpret_cast<const uint8_t*>(ptr);
57}
58
59static void addHyphenatorWithoutPatternFile(const std::string& locale, int minPrefix,
60 int minSuffix) {
61 minikin::addHyphenator(locale, minikin::Hyphenator::loadBinary(
62 nullptr, minPrefix, minSuffix, locale));
63}
64
65static void addHyphenator(const std::string& locale, int minPrefix, int minSuffix) {
66 const uint8_t* ptr = mmapPatternFile(locale);
67 if (ptr == nullptr) {
68 ALOGE("Unable to find pattern file or unable to map it for %s", locale.c_str());
69 return;
70 }
71 minikin::addHyphenator(locale, minikin::Hyphenator::loadBinary(
72 ptr, minPrefix, minSuffix, locale));
73}
74
75static void addHyphenatorAlias(const std::string& from, const std::string& to) {
76 minikin::addHyphenatorAlias(from, to);
77}
78
79static void init() {
80 // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but that
81 // appears too small.
82 constexpr int INDIC_MIN_PREFIX = 2;
83 constexpr int INDIC_MIN_SUFFIX = 2;
84
Roozbeh Pournader28c5b4d2017-10-24 16:40:21 -070085 addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
86 addHyphenator("be", 2, 2); // Belarusian
87 addHyphenator("bg", 2, 2); // Bulgarian
88 addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
89 addHyphenator("cu", 1, 2); // Church Slavonic
90 addHyphenator("cy", 2, 3); // Welsh
91 addHyphenator("da", 2, 2); // Danish
92 addHyphenator("de-1901", 2, 2); // German 1901 orthography
93 addHyphenator("de-1996", 2, 2); // German 1996 orthography
94 addHyphenator("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
95 addHyphenator("en-GB", 2, 3); // British English
96 addHyphenator("en-US", 2, 3); // American English
97 addHyphenator("es", 2, 2); // Spanish
98 addHyphenator("et", 2, 3); // Estonian
99 addHyphenator("eu", 2, 2); // Basque
100 addHyphenator("fr", 2, 3); // French
101 addHyphenator("ga", 2, 3); // Irish
102 addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
103 addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
104 addHyphenator("hr", 2, 2); // Croatian
105 addHyphenator("hu", 2, 2); // Hungarian
Seigo Nonakaf90c9b62017-10-09 11:10:24 -0700106 // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
107 // Going with a more conservative value of (2, 2) for now.
Roozbeh Pournader28c5b4d2017-10-24 16:40:21 -0700108 addHyphenator("hy", 2, 2); // Armenian
109 addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
110 addHyphenator("la", 2, 2); // Latin
111 addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
112 addHyphenator("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
113 addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
114 addHyphenator("nb", 2, 2); // Norwegian Bokmål
115 addHyphenator("nn", 2, 2); // Norwegian Nynorsk
116 addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
117 addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
118 addHyphenator("pt", 2, 3); // Portuguese
119 addHyphenator("sl", 2, 2); // Slovenian
120 addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
121 addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
122 addHyphenator("tk", 2, 2); // Turkmen
123 addHyphenator("und-Ethi", 1, 1); // Any language in Ethiopic script
Seigo Nonakaf90c9b62017-10-09 11:10:24 -0700124
125 // Following two hyphenators do not have pattern files but there is some special logic based on
126 // language.
127 addHyphenatorWithoutPatternFile("ca", 2, 2); // Catalan
128 addHyphenatorWithoutPatternFile("pl", 2, 2); // Polish
129
130 // English locales that fall back to en-US. The data is from CLDR. It's all English locales,
131 // minus the locales whose parent is en-001 (from supplementalData.xml, under <parentLocales>).
132 // TODO: Figure out how to get this from ICU.
Roozbeh Pournader28c5b4d2017-10-24 16:40:21 -0700133 addHyphenatorAlias("en-AS", "en-US"); // English (American Samoa)
134 addHyphenatorAlias("en-GU", "en-US"); // English (Guam)
135 addHyphenatorAlias("en-MH", "en-US"); // English (Marshall Islands)
136 addHyphenatorAlias("en-MP", "en-US"); // English (Northern Mariana Islands)
137 addHyphenatorAlias("en-PR", "en-US"); // English (Puerto Rico)
138 addHyphenatorAlias("en-UM", "en-US"); // English (United States Minor Outlying Islands)
139 addHyphenatorAlias("en-VI", "en-US"); // English (Virgin Islands)
Seigo Nonakaf90c9b62017-10-09 11:10:24 -0700140
141 // All English locales other than those falling back to en-US are mapped to en-GB.
142 addHyphenatorAlias("en", "en-GB");
143
144 // For German, we're assuming the 1996 (and later) orthography by default.
145 addHyphenatorAlias("de", "de-1996");
146 // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
147 addHyphenatorAlias("de-LI-1901", "de-CH-1901");
148
149 // Norwegian is very probably Norwegian Bokmål.
150 addHyphenatorAlias("no", "nb");
151
152 // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
Roozbeh Pournader28c5b4d2017-10-24 16:40:21 -0700153 addHyphenatorAlias("mn", "mn-Cyrl"); // Mongolian
Seigo Nonakaf90c9b62017-10-09 11:10:24 -0700154
155 // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
156 // Data is from CLDR's likelySubtags.xml.
157 // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
Roozbeh Pournader28c5b4d2017-10-24 16:40:21 -0700158 addHyphenatorAlias("am", "und-Ethi"); // Amharic
159 addHyphenatorAlias("byn", "und-Ethi"); // Blin
160 addHyphenatorAlias("gez", "und-Ethi"); // Geʻez
161 addHyphenatorAlias("ti", "und-Ethi"); // Tigrinya
162 addHyphenatorAlias("wal", "und-Ethi"); // Wolaytta
Seigo Nonakaf90c9b62017-10-09 11:10:24 -0700163
Roozbeh Pournader28c5b4d2017-10-24 16:40:21 -0700164 // Use Hindi as a fallback hyphenator for all languages written in Devanagari, etc. This makes
165 // sense because our Indic patterns are not really linguistic, but script-based.
166 addHyphenatorAlias("und-Beng", "bn"); // Bengali
167 addHyphenatorAlias("und-Deva", "hi"); // Devanagari -> Hindi
168 addHyphenatorAlias("und-Gujr", "gu"); // Gujarati
169 addHyphenatorAlias("und-Guru", "pa"); // Gurmukhi -> Punjabi
170 addHyphenatorAlias("und-Knda", "kn"); // Kannada
171 addHyphenatorAlias("und-Mlym", "ml"); // Malayalam
172 addHyphenatorAlias("und-Orya", "or"); // Oriya
173 addHyphenatorAlias("und-Taml", "ta"); // Tamil
174 addHyphenatorAlias("und-Telu", "te"); // Telugu
Seigo Nonaka44a1df22017-08-31 18:22:38 -0700175}
176
177static const JNINativeMethod gMethods[] = {
Seigo Nonakaf90c9b62017-10-09 11:10:24 -0700178 {"nInit", "()V", (void*) init},
Seigo Nonaka44a1df22017-08-31 18:22:38 -0700179};
180
181int register_android_text_Hyphenator(JNIEnv* env) {
182 return RegisterMethodsOrDie(env, "android/text/Hyphenator", gMethods, NELEM(gMethods));
183}
184
185} // namespace android