more fine-grained Unicode support

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 39b173e..878af84 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -216,8 +216,6 @@
 	return org_n - n;
 }
 
-#include "unicode_wcwidth.c"
-
 int FAST_FUNC iswspace(wint_t wc)
 {
 	return (unsigned)wc <= 0x7f && isspace(wc);
@@ -233,6 +231,8 @@
 	return (unsigned)wc <= 0x7f && ispunct(wc);
 }
 
+#include "unicode_wcwidth.c"
+
 #endif /* Homegrown Unicode support */
 
 
@@ -251,8 +251,22 @@
 	char *dst;
 	unsigned dst_len;
 
-	if (unicode_status != UNICODE_ON)
-		return xasprintf("%-*.*s", width, width, src);
+	if (unicode_status != UNICODE_ON) {
+		char *d = dst = xmalloc(width + 1);
+		while ((int)--width >= 0) {
+			unsigned char c = *src;
+			if (c == '\0') {
+				do
+					*d++ = ' ';
+				while ((int)--width >= 0);
+				break;
+			}
+			*d++ = (c >= ' ' && c < 0x7f) ? c : '?';
+			src++;
+		}
+		*d = '\0';
+		return dst;
+	}
 
 	dst = NULL;
 	dst_len = 0;
@@ -260,31 +274,64 @@
 		int w;
 		wchar_t wc;
 
-		dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX);
 #if ENABLE_LOCALE_SUPPORT
 		{
 			mbstate_t mbst = { 0 };
 			ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
-			if (rc <= 0) /* error, or end-of-string */
+			/* If invalid sequence is seen: -1 is returned,
+			 * src points to the invalid sequence, errno = EILSEQ.
+			 * Else number of wchars (excluding terminating L'\0')
+			 * written to dest is returned.
+			 * If len (here: 1) non-L'\0' wchars stored at dest,
+			 * src points to the next char to be converted.
+			 * If string is completely converted: src = NULL.
+			 */
+			if (rc == 0) /* end-of-string */
 				break;
+			if (rc < 0) { /* error */
+				src++;
+				goto subst;
+			}
+			if (!iswprint(wc))
+				goto subst;
 		}
 #else
-		src = mbstowc_internal(&wc, src);
-		if (!src || wc == 0) /* error, or end-of-string */
-			break;
-#endif
-		w = wcwidth(wc);
-		if (w < 0) /* non-printable wchar */
-			break;
-		width -= w;
-		if ((int)width < 0) { /* string is longer than width */
-			width += w;
-			while (width) {
-				dst[dst_len++] = ' ';
-				width--;
+		{
+			const char *src1 = mbstowc_internal(&wc, src);
+			/* src1 = NULL: invalid sequence is seen,
+			 * else: wc is set, src1 points to the next mb char
+			 */
+			if (src1) {/* no error */
+				if (wc == 0) /* end-of-string */
+					break;
+				src = src1;
+			} else { /* error */
+				src++;
+				goto subst;
 			}
+		}
+#endif
+		if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
+			goto subst;
+		w = wcwidth(wc);
+		if ((ENABLE_UNICODE_COMBINING_WCHARS && w < 0) /* non-printable wchar */
+		 || (!ENABLE_UNICODE_COMBINING_WCHARS && w <= 0) /* zero-width or non-printable: test width, not wc */
+		 || (!ENABLE_UNICODE_WIDE_WCHARS && w > 1) /* double-width wchar: test width, not wc */
+		) {
+ subst:
+			wc = CONFIG_SUBST_WCHAR;
+			w = 1;
+		}
+		width -= w;
+		/* Note: if width == 0, we still may add more chars,
+		 * they may be zero-width or combining ones */
+		if ((int)width < 0) {
+			/* can't add this wc, string would become longer than width */
+			width += w;
 			break;
 		}
+
+		dst = xrealloc(dst, dst_len + MB_CUR_MAX);
 #if ENABLE_LOCALE_SUPPORT
 		{
 			mbstate_t mbst = { 0 };
@@ -294,7 +341,14 @@
 		dst_len += wcrtomb_internal(&dst[dst_len], wc);
 #endif
 	}
+
+	/* Pad to remaining width */
+	dst = xrealloc(dst, dst_len + width + 1);
+	while ((int)--width >= 0) {
+		dst[dst_len++] = ' ';
+	}
 	dst[dst_len] = '\0';
+
 	return dst;
 }
 
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
index 8d301f7..ab62b18 100644
--- a/libbb/unicode_wcwidth.c
+++ b/libbb/unicode_wcwidth.c
@@ -59,6 +59,13 @@
  * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  */
 
+#if CONFIG_LAST_SUPPORTED_WCHAR == 0
+# define LAST_SUPPORTED_WCHAR 0x7fffffff /* 0 means "no limit"; literal avoids (1 << 31), which overflows 32-bit int outside #if */
+#else
+# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
+#endif
+
+#if LAST_SUPPORTED_WCHAR >= 0x0300
 struct interval {
 	uint16_t first;
 	uint16_t last;
@@ -111,6 +118,7 @@
 	}
 	return 0;
 }
+#endif
 
 
 /* The following two functions define the column width of an ISO 10646
@@ -146,6 +154,7 @@
  */
 static int wcwidth(unsigned ucs)
 {
+#if LAST_SUPPORTED_WCHAR >= 0x0300
 	/* sorted list of non-overlapping intervals of non-spacing characters */
 	/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
 	static const struct interval combining[] = {
@@ -420,12 +429,15 @@
 #undef BIG_
 #undef PAIR
 	};
+# if LAST_SUPPORTED_WCHAR >= 0x1100
 	static const struct interval combining0x10000[] = {
 		{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
 		{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
 		{ 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
 		{ 0xD242, 0xD244 }
 	};
+# endif
+#endif
 
 	if (ucs == 0)
 		return 0;
@@ -435,6 +447,9 @@
 	if (ucs < 0x0300) /* optimization */
 		return 1;
 
+#if LAST_SUPPORTED_WCHAR < 0x0300
+	return -1;
+#else
 	/* binary search in table of non-spacing characters */
 	if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
 		return 0;
@@ -444,6 +459,9 @@
 	if (ucs < 0x1100) /* optimization */
 		return 1;
 
+# if LAST_SUPPORTED_WCHAR < 0x1100
+	return -1;
+# else
 	/* binary search in table of non-spacing characters, cont. */
 	if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
 		return 0;
@@ -458,8 +476,8 @@
 
 	return 1 +
 		(  (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
-		|| ucs == 0x2329
-		|| ucs == 0x232a
+		|| ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
+		|| ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
 		|| (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
 		|| (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
 		|| (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
@@ -470,4 +488,6 @@
 		|| (ucs >= 0x20000 && ucs <= 0x2fffd)
 		|| (ucs >= 0x30000 && ucs <= 0x3fffd)
 		);
+# endif
+#endif
 }