blob: 80042957cc188eaeb8a07a77d1ec9f5ca2e7c03f [file] [log] [blame]
/* vi: set sw=4 ts=4: */
/*
 * Unicode support routines.
 *
 * Copyright (C) 2009 Denys Vlasenko
 *
 * Licensed under GPL version 2, see file LICENSE in this tarball for details.
 */
9#include "libbb.h"
Denys Vlasenko28055022010-01-04 20:49:58 +010010#include "unicode.h"
11
/*
 * unicode_status tracks whether Unicode handling is on, off, or not yet
 * determined (see the UNICODE_ON / UNICODE_OFF / UNICODE_UNKNOWN checks
 * below). It is defined as a variable here only if it is not already a
 * macro, i.e. if it's not a constant (presumably decided by unicode.h
 * from the build configuration - confirm there).
 */
#ifndef unicode_status
uint8_t unicode_status;
#endif
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020016
Denys Vlasenkofda8f572009-07-11 22:26:48 +020017size_t FAST_FUNC bb_mbstrlen(const char *string)
18{
19 size_t width = mbstowcs(NULL, string, INT_MAX);
20 if (width == (size_t)-1L)
21 return strlen(string);
22 return width;
23}
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020024
Denys Vlasenko28055022010-01-04 20:49:58 +010025#if ENABLE_LOCALE_SUPPORT
26
27/* Unicode support using libc */
28
29void FAST_FUNC init_unicode(void)
30{
31 /* In unicode, this is a one character string */
32 static const char unicode_0x394[] = { 0xce, 0x94, 0 };
33
34 if (unicode_status != UNICODE_UNKNOWN)
35 return;
36
37 unicode_status = bb_mbstrlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
38}
39
40#else
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020041
/* Crude "locale support" which knows only C and Unicode locales */
43
Denys Vlasenko28055022010-01-04 20:49:58 +010044# if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
45void FAST_FUNC init_unicode(void)
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020046{
47 char *lang;
48
Denys Vlasenko28055022010-01-04 20:49:58 +010049 if (unicode_status != UNICODE_UNKNOWN)
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020050 return;
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020051
Denys Vlasenko28055022010-01-04 20:49:58 +010052 unicode_status = UNICODE_OFF;
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020053 lang = getenv("LANG");
Denys Vlasenkofff73642009-07-16 16:09:25 +020054 if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF")))
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020055 return;
Denys Vlasenko28055022010-01-04 20:49:58 +010056 unicode_status = UNICODE_ON;
Denys Vlasenko42a8fd02009-07-11 21:36:13 +020057}
58# endif
59
/*
 * Encode one wide character as UTF-8.
 *
 * s:  output buffer; up to 6 bytes are stored, no terminating NUL is added
 * wc: character to encode
 * Returns the number of bytes stored (1..6).
 */
static size_t wcrtomb_internal(char *s, wchar_t wc)
{
	uint32_t code = wc;
	uint32_t probe;
	int len, pos;

	/* Plain ASCII is stored as-is */
	if (code <= 0x7f) {
		s[0] = code;
		return 1;
	}

	/* RFC 3629 says that Unicode ends at 10FFFF,
	 * but we cover entire 32 bits */

	/* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
	/* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
	/* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
	/* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
	/* 80-7FF -> 110yyyxx 10xxxxxx */

	/* Sequence length: two bytes hold 11 payload bits, and every
	 * additional byte adds 5 more. Values of 0x80000000 and up would
	 * ask for a 7th byte, so the length is capped at 6. */
	len = 2;
	for (probe = code; probe >= 0x800 && len < 6; probe >>= 5)
		len++;

	/* Continuation bytes, last one first: 10xxxxxx */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = (code & 0x3f) | 0x80;
		code >>= 6;
	}

	/* Lead byte: length marker bits plus whatever payload is left */
	s[0] = code | (uint8_t)(0x3f00 >> len);
	return len;
}
96
97size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
98{
Denys Vlasenko28055022010-01-04 20:49:58 +010099 if (unicode_status != UNICODE_ON) {
Denys Vlasenko42a8fd02009-07-11 21:36:13 +0200100 *s = wc;
101 return 1;
102 }
103
104 return wcrtomb_internal(s, wc);
105}
106
107size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
108{
109 size_t org_n = n;
110
Denys Vlasenko28055022010-01-04 20:49:58 +0100111 if (unicode_status != UNICODE_ON) {
Denys Vlasenko42a8fd02009-07-11 21:36:13 +0200112 while (n) {
113 wchar_t c = *src++;
114 *dest++ = c;
115 if (c == 0)
116 break;
117 n--;
118 }
119 return org_n - n;
120 }
121
122 while (n >= MB_CUR_MAX) {
123 wchar_t wc = *src++;
124 size_t len = wcrtomb_internal(dest, wc);
125
126 if (wc == L'\0')
127 return org_n - n;
128 dest += len;
129 n -= len;
130 }
131 while (n) {
132 char tbuf[MB_CUR_MAX];
133 wchar_t wc = *src++;
134 size_t len = wcrtomb_internal(tbuf, wc);
135
136 if (len > n)
137 len = n;
138 memcpy(dest, tbuf, len);
139 if (wc == L'\0')
140 return org_n - n;
141 dest += len;
142 n -= len;
143 }
144 return org_n - n;
145}
146
147size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
148{
149 size_t org_n = n;
150
Denys Vlasenko28055022010-01-04 20:49:58 +0100151 if (unicode_status != UNICODE_ON) {
Denys Vlasenko42a8fd02009-07-11 21:36:13 +0200152 while (n) {
153 unsigned char c = *src++;
Denys Vlasenkofda8f572009-07-11 22:26:48 +0200154
155 if (dest)
156 *dest++ = c;
Denys Vlasenko42a8fd02009-07-11 21:36:13 +0200157 if (c == 0)
158 break;
159 n--;
160 }
161 return org_n - n;
162 }
163
164 while (n) {
165 int bytes;
166 unsigned c = (unsigned char) *src++;
167
168 if (c <= 0x7f) {
Denys Vlasenkofda8f572009-07-11 22:26:48 +0200169 if (dest)
170 *dest++ = c;
Denys Vlasenko42a8fd02009-07-11 21:36:13 +0200171 if (c == '\0')
172 break;
173 n--;
174 continue;
175 }
176
177 /* 80-7FF -> 110yyyxx 10xxxxxx */
178 /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
179 /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
180 /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
181 /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
182 bytes = 0;
183 do {
184 c <<= 1;
185 bytes++;
186 } while ((c & 0x80) && bytes < 6);
187 if (bytes == 1)
188 return (size_t) -1L;
189 c = (uint8_t)(c) >> bytes;
190
191 while (--bytes) {
192 unsigned ch = (unsigned char) *src++;
193 if ((ch & 0xc0) != 0x80) {
194 return (size_t) -1L;
195 }
196 c = (c << 6) + (ch & 0x3f);
197 }
198
199 /* TODO */
200 /* Need to check that c isn't produced by overlong encoding */
201 /* Example: 11000000 10000000 converts to NUL */
202 /* 11110000 10000000 10000100 10000000 converts to 0x100 */
203 /* correct encoding: 11000100 10000000 */
204 if (c <= 0x7f) { /* crude check */
205 return (size_t) -1L;
206 //or maybe: c = 0xfffd; /* replacement character */
207 }
208
Denys Vlasenkofda8f572009-07-11 22:26:48 +0200209 if (dest)
210 *dest++ = c;
Denys Vlasenko42a8fd02009-07-11 21:36:13 +0200211 n--;
212 }
213
214 return org_n - n;
215}
216
217int FAST_FUNC iswspace(wint_t wc)
218{
219 return (unsigned)wc <= 0x7f && isspace(wc);
220}
221
222int FAST_FUNC iswalnum(wint_t wc)
223{
224 return (unsigned)wc <= 0x7f && isalnum(wc);
225}
226
227int FAST_FUNC iswpunct(wint_t wc)
228{
229 return (unsigned)wc <= 0x7f && ispunct(wc);
230}
231
232#endif