#8271: The UTF-8 decoder now outputs the correct number of U+FFFD characters when used with the "replace" error handler on invalid UTF-8 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f61f9d0..665f03d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4759,9 +4759,7 @@
goto End;
errmsg = "unexpected end of data";
startinpos = s - starts;
- endinpos = startinpos + 1;
- while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
- endinpos++;
+ endinpos = end - starts;
break;
case 1:
errmsg = "invalid start byte";
@@ -4769,11 +4767,11 @@
endinpos = startinpos + 1;
break;
case 2:
+ case 3:
+ case 4:
errmsg = "invalid continuation byte";
startinpos = s - starts;
- endinpos = startinpos + 1;
- while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
- endinpos++;
+ endinpos = startinpos + ch - 1;
break;
default:
if (unicode_putchar(&unicode, &outpos, ch) < 0)