Blame - utf8.c - platform/external/openssh

blob: dead79b8a252404451c9a7df356e286a7fc7f75f [file] [log] [blame]

Greg Hartman	9768ca4	2017-06-22 20:49:52 -0700	[diff] [blame]	1	/* $OpenBSD: utf8.c,v 1.5 2017/02/19 00:10:57 djm Exp $ */
				2	/*
				3	* Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
				4	*
				5	* Permission to use, copy, modify, and distribute this software for any
				6	* purpose with or without fee is hereby granted, provided that the above
				7	* copyright notice and this permission notice appear in all copies.
				8	*
				9	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
				10	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
				11	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
				12	* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				13	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				14	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
				15	* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				16	*/
				17
				18	/*
				19	* Utility functions for multibyte-character handling,
				20	* in particular to sanitize untrusted strings for terminal output.
				21	*/
				22
				23	#include "includes.h"
				24
				25	#include <sys/types.h>
				26	#ifdef HAVE_LANGINFO_H
				27	# include <langinfo.h>
				28	#endif
				29	#include <limits.h>
				30	#include <locale.h>
				31	#include <stdarg.h>
				32	#include <stdio.h>
				33	#include <stdlib.h>
				34	#include <string.h>
				35	#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
				36	# include <vis.h>
				37	#endif
				38	#ifdef HAVE_WCHAR_H
				39	# include <wchar.h>
				40	#endif
				41
				42	#include "utf8.h"
				43
				/* Forward declarations of the file-local helpers defined below. */
				static int	 dangerous_locale(void);
				static int	 grow_dst(char **, size_t *, size_t, char **, size_t);
				static int	 vasnmprintf(char **, size_t, int *, const char *, va_list);
				47
				48
				/*
				 * For US-ASCII and UTF-8 encodings, we can safely recover from
				 * encoding errors and from non-printable characters.  For any
				 * other encodings, err to the side of caution and abort parsing:
				 * For state-dependent encodings, recovery is impossible.
				 * For arbitrary encodings, replacement of non-printable
				 * characters would be non-trivial and too fragile.
				 *
				 * Returns 0 if the codeset name reported by nl_langinfo(CODESET) is
				 * one of the names accepted here ("US-ASCII", "UTF-8",
				 * "ANSI_X3.4-1968" or "646"), and non-zero for any other codeset.
				 */

				static int
				dangerous_locale(void)
				{
					char	*loc;

					loc = nl_langinfo(CODESET);
					return strcmp(loc, "US-ASCII") != 0 && strcmp(loc, "UTF-8") != 0 &&
					    strcmp(loc, "ANSI_X3.4-1968") != 0 && strcmp(loc, "646") != 0;
				}
				66
				/*
				 * Ensure that the buffer *dst, currently *sz bytes long, can take
				 * "need" more bytes at the write position *dp while still leaving at
				 * least one byte free after them.
				 *
				 * If there is already enough room, nothing is changed.  Otherwise the
				 * buffer is reallocated to *sz + 128 bytes, capped at maxsz, and *dst,
				 * *dp and *sz are updated to describe the new buffer; *dp keeps its
				 * offset from the start of the buffer.
				 *
				 * Returns 0 on success and -1 if realloc() fails, in which case *dst,
				 * *dp and *sz are left untouched and the old buffer remains valid.
				 */
				static int
				grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
				{
					char	*tp;
					size_t	 used;	/* Offset of the write position in the buffer. */
					size_t	 tsz;	/* Size to grow the buffer to. */

					/*
					 * Work with offsets rather than pointers: the offset stays
					 * meaningful after realloc() has moved the buffer, and no
					 * pointer beyond the end of the allocation is ever formed.
					 */
					used = (size_t)(*dp - *dst);
					if (used + need < *sz)
						return 0;
					tsz = *sz + 128;
					if (tsz > maxsz)
						tsz = maxsz;
					if ((tp = realloc(*dst, tsz)) == NULL)
						return -1;
					*dp = tp + used;
					*dst = tp;
					*sz = tsz;
					return 0;
				}
				85
				86	/*
				87	* The following two functions limit the number of bytes written,
				88	* including the terminating '\0', to sz. Unless wp is NULL,
				89	* they limit the number of display columns occupied to *wp.
				90	* Whichever is reached first terminates the output string.
				91	* To stay close to the standard interfaces, they return the number of
				92	* non-NUL bytes that would have been written if both were unlimited.
				93	* If wp is NULL, newline, carriage return, and tab are allowed;
				94	* otherwise, the actual number of columns occupied by what was
				95	* written is returned in *wp.
				96	*/
				97
				98	static int
				99	vasnmprintf(char *str, size_t maxsz, int wp, const char *fmt, va_list ap)
				100	{
				101	char src; / Source string returned from vasprintf. */
				102	char sp; / Pointer into src. */
				103	char dst; / Destination string to be returned. */
				104	char dp; / Pointer into dst. */
				105	char tp; / Temporary pointer for dst. */
				106	size_t sz; /* Number of bytes allocated for dst. */
				107	wchar_t wc; /* Wide character at sp. */
				108	int len; /* Number of bytes in the character at sp. */
				109	int ret; /* Number of bytes needed to format src. */
				110	int width; /* Display width of the character wc. */
				111	int total_width, max_width, print;
				112
				113	src = NULL;
				114	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
				115	goto fail;
				116
				117	sz = strlen(src) + 1;
				118	if ((dst = malloc(sz)) == NULL) {
				119	free(src);
				120	ret = -1;
				121	goto fail;
				122	}
				123
				124	if (maxsz > INT_MAX)
				125	maxsz = INT_MAX;
				126
				127	sp = src;
				128	dp = dst;
				129	ret = 0;
				130	print = 1;
				131	total_width = 0;
				132	max_width = wp == NULL ? INT_MAX : *wp;
				133	while (*sp != '\0') {
				134	if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
				135	(void)mbtowc(NULL, NULL, MB_CUR_MAX);
				136	if (dangerous_locale()) {
				137	ret = -1;
				138	break;
				139	}
				140	len = 1;
				141	width = -1;
				142	} else if (wp == NULL &&
				143	(wc == L'\n' \|\| wc == L'\r' \|\| wc == L'\t')) {
				144	/*
				145	* Don't use width uninitialized; the actual
				146	* value doesn't matter because total_width
				147	* is only returned for wp != NULL.
				148	*/
				149	width = 0;
				150	} else if ((width = wcwidth(wc)) == -1 &&
				151	dangerous_locale()) {
				152	ret = -1;
				153	break;
				154	}
				155
				156	/* Valid, printable character. */
				157
				158	if (width >= 0) {
				159	if (print && (dp - dst >= (int)maxsz - len \|\|
				160	total_width > max_width - width))
				161	print = 0;
				162	if (print) {
				163	if (grow_dst(&dst, &sz, maxsz,
				164	&dp, len) == -1) {
				165	ret = -1;
				166	break;
				167	}
				168	total_width += width;
				169	memcpy(dp, sp, len);
				170	dp += len;
				171	}
				172	sp += len;
				173	if (ret >= 0)
				174	ret += len;
				175	continue;
				176	}
				177
				178	/* Escaping required. */
				179
				180	while (len > 0) {
				181	if (print && (dp - dst >= (int)maxsz - 4 \|\|
				182	total_width > max_width - 4))
				183	print = 0;
				184	if (print) {
				185	if (grow_dst(&dst, &sz, maxsz,
				186	&dp, 4) == -1) {
				187	ret = -1;
				188	break;
				189	}
				190	tp = vis(dp, *sp, VIS_OCTAL \| VIS_ALL, 0);
				191	width = tp - dp;
				192	total_width += width;
				193	dp = tp;
				194	} else
				195	width = 4;
				196	len--;
				197	sp++;
				198	if (ret >= 0)
				199	ret += width;
				200	}
				201	if (len > 0)
				202	break;
				203	}
				204	free(src);
				205	*dp = '\0';
				206	*str = dst;
				207	if (wp != NULL)
				208	*wp = total_width;
				209
				210	/*
				211	* If the string was truncated by the width limit but
				212	* would have fit into the size limit, the only sane way
				213	* to report the problem is using the return value, such
				214	* that the usual idiom "if (ret < 0 \|\| ret >= sz) error"
				215	* works as expected.
				216	*/
				217
				218	if (ret < (int)maxsz && !print)
				219	ret = -1;
				220	return ret;
				221
				222	fail:
				223	if (wp != NULL)
				224	*wp = 0;
				225	if (ret == 0) {
				226	*str = src;
				227	return 0;
				228	} else {
				229	*str = NULL;
				230	return -1;
				231	}
				232	}
				233
				/*
				 * Format into the caller-supplied buffer str of sz bytes, sanitizing
				 * the output with vasnmprintf().  If wp is not NULL, *wp limits the
				 * number of display columns on entry and receives the number of
				 * columns actually used on return.
				 *
				 * Returns whatever vasnmprintf() returns.  Provided that sz is not 0,
				 * str is always NUL-terminated: it receives the (possibly truncated)
				 * sanitized string, or the empty string if none was produced.
				 */
				int
				snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
				{
					va_list	 ap;
					char	*cp = NULL;
					int	 ret;

					va_start(ap, fmt);
					ret = vasnmprintf(&cp, sz, wp, fmt, ap);
					va_end(ap);
					if (cp != NULL) {
						(void)strlcpy(str, cp, sz);
						free(cp);
					} else if (sz > 0) {
						/* Never write to a buffer that has no room at all. */
						*str = '\0';
					}
					return ret;
				}
				251
				/*
				 * To stay close to the standard interfaces, the following functions
				 * return the number of non-NUL bytes written.
				 *
				 * vfmprintf formats fmt/ap, sanitizes the result with vasnmprintf()
				 * (no column limit, so newline, carriage return and tab pass through)
				 * and writes it to stream.  Returns -1 if sanitizing fails or if
				 * fputs() reports an error.
				 */

				int
				vfmprintf(FILE *stream, const char *fmt, va_list ap)
				{
					char	*str = NULL;
					int	 ret;

					if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
						/*
						 * vasnmprintf() can fail after it has already
						 * allocated the output string and still hand
						 * that string back; release it.
						 */
						free(str);
						return -1;
					}
					if (fputs(str, stream) == EOF)
						ret = -1;
					free(str);
					return ret;
				}
				270
				/*
				 * Variadic front end to vfmprintf(): write sanitized formatted
				 * output to stream and return what vfmprintf() returns.
				 */
				int
				fmprintf(FILE *stream, const char *fmt, ...)
				{
					va_list	 ap;
					int	 ret;

					va_start(ap, fmt);
					ret = vfmprintf(stream, fmt, ap);
					va_end(ap);
					return ret;
				}
				282
				/*
				 * Variadic front end to vfmprintf() for standard output: write
				 * sanitized formatted output to stdout and return what vfmprintf()
				 * returns.
				 */
				int
				mprintf(const char *fmt, ...)
				{
					va_list	 ap;
					int	 ret;

					va_start(ap, fmt);
					ret = vfmprintf(stdout, fmt, ap);
					va_end(ap);
					return ret;
				}
				294
				/*
				 * Set up libc for multibyte output in the user's chosen locale.
				 *
				 * XXX: we are known to have problems with Turkish (i/I confusion) so we
				 * deliberately fall back to the C locale for now. Longer term we should
				 * always prefer to select C.[encoding] if possible, but there's no
				 * standardisation in locales between systems, so we'll need to survey
				 * what's out there first.
				 *
				 * Only the first of LC_ALL, LC_CTYPE and LANG that is set in the
				 * environment is inspected.  Only LC_CTYPE is changed.
				 */
				void
				msetlocale(void)
				{
					const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
					char *cp;
					int i;

					/*
					 * We can't yet cope with dotless/dotted I in Turkish locales,
					 * so fall back to the C locale for these.
					 */
					for (i = 0; vars[i] != NULL; i++) {
						if ((cp = getenv(vars[i])) == NULL)
							continue;
						/* Anything not starting with "tr" (any case) is fine. */
						if (strncasecmp(cp, "TR", 2) != 0)
							break;
						/*
						 * If we're in a UTF-8 locale then prefer to use
						 * the C.UTF-8 locale (or equivalent) if it exists.
						 */
						if ((strcasestr(cp, "UTF-8") != NULL ||
						    strcasestr(cp, "UTF8") != NULL) &&
						    (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
						    setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
							return;
						setlocale(LC_CTYPE, "C");
						return;
					}
					/* We can handle this locale */
					setlocale(LC_CTYPE, "");
				}
				331	}
				332	/* We can handle this locale */
				333	setlocale(LC_CTYPE, "");
				334	}