blob: caf789cee65d6ee7297a5e5e7ebd0e0656132d62 [file] [log] [blame]
schwarze@openbsd.orgac284a32016-05-30 12:05:56 +00001/* $OpenBSD: utf8.c,v 1.2 2016/05/30 12:05:56 schwarze Exp $ */
schwarze@openbsd.org0e059cd2016-05-25 23:48:45 +00002/*
3 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
/*
 * Utility functions for multibyte-character handling,
 * in particular to sanitize untrusted strings for terminal output.
 */
22
23#include <sys/types.h>
24#include <langinfo.h>
25#include <limits.h>
26#include <stdarg.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <vis.h>
31#include <wchar.h>
32
33#include "utf8.h"
34
35static int dangerous_locale(void);
36static int vasnmprintf(char **, size_t, int *, const char *, va_list);
37
38
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encoding, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 */
47
/*
 * Check the codeset of the current locale as reported by
 * nl_langinfo(CODESET).  Returns 0 if it is "US-ASCII" or "UTF-8"
 * and 1 for anything else.
 */
static int
dangerous_locale(void)
{
	const char	*codeset = nl_langinfo(CODESET);

	if (strcmp(codeset, "US-ASCII") == 0)
		return 0;
	if (strcmp(codeset, "UTF-8") == 0)
		return 0;
	return 1;
}
55
/*
 * vasnmprintf() and snmprintf() limit the number of bytes written,
 * including the terminating '\0', to sz.  Unless wp is NULL,
 * they also limit the number of display columns occupied to *wp.
 * Whichever limit is reached first terminates the output string.
 * To stay close to the standard interfaces, they return the number of
 * non-NUL bytes that would have been written if both were unlimited.
 * If wp is NULL, newline, carriage return, and tab are copied
 * unchanged; otherwise, the actual number of columns occupied by
 * what was written is returned in *wp.
 */
67
/*
 * Helper for vasnmprintf():  Make sure that "need" more bytes plus a
 * terminating '\0' fit into the buffer *dst of *sz bytes at the write
 * position *dp.  If they do not, extend the buffer by 128 bytes, but
 * not beyond maxsz, and update *dst, *dp, and *sz to match.
 * The caller must already have checked that the bytes to be written
 * stay below maxsz.
 * Returns 0 on success or -1 if realloc(3) fails; in the latter
 * case, *dst, *dp, and *sz are left untouched and remain valid.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used;	/* Number of bytes already written to *dst. */
	size_t	 tsz;	/* New size of the buffer. */

	/* Compare offsets to avoid forming out-of-range pointers. */
	used = *dp - *dst;
	if (used + need < *sz)
		return 0;

	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;
	*dst = tp;
	*dp = tp + used;
	*sz = tsz;
	return 0;
}

/*
 * Format fmt and ap with vasprintf(3), then copy the result into a
 * newly allocated string, replacing bytes that are invalid or not
 * printable in the current locale with vis(3) octal escapes.
 *
 * str:   always assigned; either NULL or a malloc(3)ed string that
 *        the caller has to free(3).  It can be non-NULL even when
 *        -1 is returned, holding what was converted up to that point.
 * maxsz: limit on the bytes written including the terminating '\0';
 *        values above INT_MAX are clamped to INT_MAX.
 * wp:    if not NULL, *wp is the limit on display columns on entry
 *        and the number of columns actually written on return.
 *
 * Returns the number of non-NUL bytes the complete escaped string
 * would occupy, or -1 if formatting or memory allocation fails, if
 * an invalid or non-printable character is found in a locale where
 * recovery is unsafe, or if the output was cut short by the width
 * limit even though it would have fit into maxsz.
 */
static int
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
{
	char	*src;	/* Source string returned from vasprintf. */
	char	*sp;	/* Pointer into src. */
	char	*dst;	/* Destination string to be returned. */
	char	*dp;	/* Pointer into dst. */
	char	*tp;	/* End of the escape sequence written by vis. */
	size_t	 sz;	/* Number of bytes allocated for dst. */
	wchar_t	 wc;	/* Wide character at sp. */
	int	 len;	/* Number of bytes in the character at sp. */
	int	 ret;	/* Number of bytes needed to format src. */
	int	 width;	/* Display width of the character wc. */
	int	 total_width, max_width, print;

	src = NULL;
	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
		goto fail;

	/* One extra byte for the terminating '\0' written at the end. */
	sz = strlen(src) + 1;
	if ((dst = malloc(sz)) == NULL) {
		free(src);
		ret = -1;
		goto fail;
	}

	if (maxsz > INT_MAX)
		maxsz = INT_MAX;

	sp = src;
	dp = dst;
	ret = 0;
	print = 1;
	total_width = 0;
	max_width = wp == NULL ? INT_MAX : *wp;
	while (*sp != '\0') {
		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
			/* Invalid byte sequence: reset the shift state. */
			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
			if (dangerous_locale()) {
				ret = -1;
				break;
			}
			len = 1;
			width = -1;
		} else if (wp == NULL &&
		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
			/*
			 * Don't use width uninitialized; the actual
			 * value doesn't matter because total_width
			 * is only returned for wp != NULL.
			 */
			width = 0;
		} else if ((width = wcwidth(wc)) == -1 &&
		    dangerous_locale()) {
			ret = -1;
			break;
		}

		/* Valid, printable character. */

		if (width >= 0) {
			if (print && (dp - dst >= (int)maxsz - len ||
			    total_width > max_width - width))
				print = 0;
			if (print) {
				/*
				 * Earlier escapes may have used up more
				 * room than the source bytes they replace,
				 * so the buffer may need to grow here, too.
				 */
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, len) == -1) {
					ret = -1;
					break;
				}
				total_width += width;
				memcpy(dp, sp, len);
				dp += len;
			}
			sp += len;
			if (ret >= 0)
				ret += len;
			continue;
		}

		/* Escaping required; each byte becomes four characters. */

		while (len > 0) {
			if (print && (dp - dst >= (int)maxsz - 4 ||
			    total_width > max_width - 4))
				print = 0;
			if (print) {
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, 4) == -1) {
					ret = -1;
					break;
				}
				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
				width = tp - dp;
				total_width += width;
				dp = tp;
			} else
				width = 4;
			len--;
			sp++;
			if (ret >= 0)
				ret += width;
		}
		/* The inner loop only ends early on allocation failure. */
		if (len > 0)
			break;
	}
	free(src);
	*dp = '\0';
	*str = dst;
	if (wp != NULL)
		*wp = total_width;

	/*
	 * If the string was truncated by the width limit but
	 * would have fit into the size limit, the only sane way
	 * to report the problem is using the return value, such
	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
	 * works as expected.
	 */

	if (ret < (int)maxsz && !print)
		ret = -1;
	return ret;

fail:
	if (wp != NULL)
		*wp = 0;
	if (ret == 0) {
		/* Empty output: hand over the string from vasprintf. */
		*str = src;
		return 0;
	} else {
		*str = NULL;
		return -1;
	}
}
206
/*
 * Format into the caller-provided buffer str of sz bytes, escaping
 * invalid and non-printable characters by way of vasnmprintf().
 * The result is copied with strlcpy(3), so at most sz bytes
 * including the terminating '\0' are written.  If vasnmprintf()
 * provides no string, str is set to the empty string.
 * wp is passed through to vasnmprintf() unchanged.
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never write to a buffer of size zero. */
		*str = '\0';
	}
	return ret;
}
224
/*
 * To stay close to the standard interfaces, vfmprintf(), fmprintf(),
 * and mprintf() return the number of non-NUL bytes written,
 * or -1 on failure.
 */
229
/*
 * Format fmt and ap with vasnmprintf(), using no width limit and
 * a size limit of INT_MAX, and write the result to stream with
 * fputs(3).
 * Returns the value returned by vasnmprintf(), or -1 if that
 * function fails or if fputs(3) reports EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() can fail after allocating a partial
		 * result; free it instead of leaking it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
243
/*
 * Variadic front end to vfmprintf():  collect the arguments
 * following fmt and write the formatted result to stream.
 * Returns whatever vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
255
/*
 * Variadic front end to vfmprintf() that writes to stdout.
 * Returns whatever vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}