/* $OpenBSD: utf8.c,v 1.8 2018/08/21 13:56:27 schwarze Exp $ */
/*
 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Utility functions for multibyte-character handling,
 * in particular to sanitize untrusted strings for terminal output.
 */

#include "includes.h"

#include <sys/types.h>
#ifdef HAVE_LANGINFO_H
# include <langinfo.h>
#endif
#include <limits.h>
#include <locale.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
# include <vis.h>
#endif
#ifdef HAVE_WCHAR_H
# include <wchar.h>
#endif

#include "utf8.h"

/* Forward declarations of the file-local helpers defined below. */
static int	 dangerous_locale(void);
static int	 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp,
		    size_t need);
static int	 vasnmprintf(char **str, size_t maxsz, int *wp,
		    const char *fmt, va_list ap);


/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 * The comments indicate what nl_langinfo(CODESET)
 * returns for US-ASCII on various operating systems.
 *
 * Returns non-zero if the codeset name reported by nl_langinfo(CODESET)
 * matches none of the names listed below, and zero if it matches one.
 */

static int
dangerous_locale(void) {
	const char	*loc;

	loc = nl_langinfo(CODESET);
	return strcmp(loc, "UTF-8") != 0 &&
	    strcmp(loc, "US-ASCII") != 0 &&		/* OpenBSD */
	    strcmp(loc, "ANSI_X3.4-1968") != 0 &&	/* Linux */
	    strcmp(loc, "ISO8859-1") != 0 &&		/* AIX */
	    strcmp(loc, "646") != 0 &&			/* Solaris, NetBSD */
	    strcmp(loc, "") != 0;			/* Solaris 6 */
}

/*
 * Make sure the buffer *dst, which is *sz bytes long and currently
 * written up to *dp, has room for need more bytes while leaving at
 * least one further byte spare.
 * If it does not, reallocate it 128 bytes larger, but never larger
 * than maxsz bytes, and update *dst, *sz, and *dp to describe the
 * new allocation.
 * Returns 0 on success, or -1 if the reallocation fails; in the latter
 * case, *dst, *sz, and *dp are not modified.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used, tsz;

	/*
	 * Compare byte counts rather than pointers: *dp + need may lie
	 * more than one element past the end of the allocation, and
	 * merely forming such a pointer is undefined behaviour.
	 */
	used = (size_t)(*dp - *dst);
	if (used < *sz && need < *sz - used)
		return 0;
	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL)
		return -1;
	/* Re-base the write position onto the new allocation. */
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}

schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +000092/*
93 * The following two functions limit the number of bytes written,
94 * including the terminating '\0', to sz. Unless wp is NULL,
95 * they limit the number of display columns occupied to *wp.
96 * Whichever is reached first terminates the output string.
97 * To stay close to the standard interfaces, they return the number of
98 * non-NUL bytes that would have been written if both were unlimited.
99 * If wp is NULL, newline, carriage return, and tab are allowed;
100 * otherwise, the actual number of columns occupied by what was
101 * written is returned in *wp.
102 */
103
104static int
105vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
106{
107 char *src; /* Source string returned from vasprintf. */
108 char *sp; /* Pointer into src. */
109 char *dst; /* Destination string to be returned. */
110 char *dp; /* Pointer into dst. */
111 char *tp; /* Temporary pointer for dst. */
112 size_t sz; /* Number of bytes allocated for dst. */
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000113 wchar_t wc; /* Wide character at sp. */
114 int len; /* Number of bytes in the character at sp. */
115 int ret; /* Number of bytes needed to format src. */
116 int width; /* Display width of the character wc. */
117 int total_width, max_width, print;
118
schwarze@openbsd.orgac284a32016-05-30 12:05:56 +0000119 src = NULL;
120 if ((ret = vasprintf(&src, fmt, ap)) <= 0)
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000121 goto fail;
122
schwarze@openbsd.orgcd9e1ea2016-05-30 12:57:21 +0000123 sz = strlen(src) + 1;
schwarze@openbsd.orgac284a32016-05-30 12:05:56 +0000124 if ((dst = malloc(sz)) == NULL) {
125 free(src);
jsg@openbsd.org3ec5fa42017-02-02 10:54:25 +0000126 ret = -1;
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000127 goto fail;
schwarze@openbsd.orgac284a32016-05-30 12:05:56 +0000128 }
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000129
130 if (maxsz > INT_MAX)
131 maxsz = INT_MAX;
132
133 sp = src;
134 dp = dst;
135 ret = 0;
136 print = 1;
137 total_width = 0;
138 max_width = wp == NULL ? INT_MAX : *wp;
139 while (*sp != '\0') {
140 if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
141 (void)mbtowc(NULL, NULL, MB_CUR_MAX);
142 if (dangerous_locale()) {
143 ret = -1;
144 break;
145 }
146 len = 1;
147 width = -1;
148 } else if (wp == NULL &&
149 (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
150 /*
151 * Don't use width uninitialized; the actual
152 * value doesn't matter because total_width
153 * is only returned for wp != NULL.
154 */
155 width = 0;
156 } else if ((width = wcwidth(wc)) == -1 &&
157 dangerous_locale()) {
158 ret = -1;
159 break;
160 }
161
162 /* Valid, printable character. */
163
164 if (width >= 0) {
165 if (print && (dp - dst >= (int)maxsz - len ||
166 total_width > max_width - width))
167 print = 0;
168 if (print) {
schwarze@openbsd.orgcd9e1ea2016-05-30 12:57:21 +0000169 if (grow_dst(&dst, &sz, maxsz,
170 &dp, len) == -1) {
171 ret = -1;
172 break;
173 }
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000174 total_width += width;
175 memcpy(dp, sp, len);
176 dp += len;
177 }
178 sp += len;
179 if (ret >= 0)
180 ret += len;
181 continue;
182 }
183
184 /* Escaping required. */
185
186 while (len > 0) {
187 if (print && (dp - dst >= (int)maxsz - 4 ||
188 total_width > max_width - 4))
189 print = 0;
190 if (print) {
schwarze@openbsd.orgcd9e1ea2016-05-30 12:57:21 +0000191 if (grow_dst(&dst, &sz, maxsz,
192 &dp, 4) == -1) {
193 ret = -1;
194 break;
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000195 }
196 tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
197 width = tp - dp;
198 total_width += width;
199 dp = tp;
200 } else
201 width = 4;
202 len--;
203 sp++;
204 if (ret >= 0)
205 ret += width;
206 }
207 if (len > 0)
208 break;
209 }
210 free(src);
211 *dp = '\0';
212 *str = dst;
213 if (wp != NULL)
214 *wp = total_width;
215
216 /*
217 * If the string was truncated by the width limit but
218 * would have fit into the size limit, the only sane way
219 * to report the problem is using the return value, such
220 * that the usual idiom "if (ret < 0 || ret >= sz) error"
221 * works as expected.
222 */
223
224 if (ret < (int)maxsz && !print)
225 ret = -1;
226 return ret;
227
228fail:
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000229 if (wp != NULL)
230 *wp = 0;
schwarze@openbsd.orgac284a32016-05-30 12:05:56 +0000231 if (ret == 0) {
232 *str = src;
233 return 0;
234 } else {
235 *str = NULL;
236 return -1;
237 }
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +0000238}
239
/*
 * Format into the caller-supplied buffer str of sz bytes.
 * The formatting itself is done by vasnmprintf(), with sz as the size
 * limit and wp as the optional display width limit; the result is then
 * copied into str with strlcpy(3).  If vasnmprintf() hands back no
 * string, str is set to the empty string, provided sz is not zero.
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never store into a buffer that has no room at all. */
		*str = '\0';
	}
	return ret;
}

/*
 * To stay close to the standard interfaces, the following functions
 * return the number of non-NUL bytes written.
 */

/*
 * Format fmt and ap with vasnmprintf(), using INT_MAX as the size
 * limit and no width limit, and write the result to stream with
 * fputs(3).  Returns the value returned by vasnmprintf(), or -1 if
 * that value is negative or if fputs() reports EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() can return a negative value and still
		 * hand back an allocated string; do not leak it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}

/*
 * Variadic front end to vfmprintf(): collect the arguments following
 * fmt into a va_list, pass them on together with stream, and return
 * whatever vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 ap;
	int	 ret;

	va_start(ap, fmt);
	ret = vfmprintf(stream, fmt, ap);
	va_end(ap);
	return ret;
}

/*
 * Like fmprintf(), but always writing to stdout: collect the arguments
 * following fmt into a va_list and return whatever vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 ap;
	int	 ret;

	va_start(ap, fmt);
	ret = vfmprintf(stdout, fmt, ap);
	va_end(ap);
	return ret;
}

/*
 * Set up libc for multibyte output in the user's chosen locale.
 *
 * XXX: we are known to have problems with Turkish (i/I confusion) so we
 * deliberately fall back to the C locale for now. Longer term we should
 * always prefer to select C.[encoding] if possible, but there's no
 * standardisation in locales between systems, so we'll need to survey
 * what's out there first.
 *
 * The environment variables LC_ALL, LC_CTYPE, and LANG are examined
 * in this order, and the first one set to a non-empty value decides.
 * Only the LC_CTYPE category is changed.
 */
void
msetlocale(void)
{
	const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
	char *cp;
	int i;

	/*
	 * We can't yet cope with dotless/dotted I in Turkish locales,
	 * so fall back to the C locale for these.
	 */
	for (i = 0; vars[i] != NULL; i++) {
		/*
		 * Skip variables that are unset or empty.  An empty
		 * value must not end the search: libc ignores it too
		 * and would go on to pick up a Turkish locale from one
		 * of the remaining variables.
		 */
		if ((cp = getenv(vars[i])) == NULL || *cp == '\0')
			continue;
		if (strncasecmp(cp, "TR", 2) != 0)
			break;
		/*
		 * If we're in a UTF-8 locale then prefer to use
		 * the C.UTF-8 locale (or equivalent) if it exists.
		 */
		if ((strcasestr(cp, "UTF-8") != NULL ||
		    strcasestr(cp, "UTF8") != NULL) &&
		    (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
		    setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
			return;
		setlocale(LC_CTYPE, "C");
		return;
	}
	/* We can handle this locale */
	setlocale(LC_CTYPE, "");
}