Issue #3672: Reject surrogates in the utf-8 codec; add the "surrogates"
error handler.
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index a3ea40a..992f3d2 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -169,13 +169,13 @@
                     self.assertEqual(b[start:stop:step], self.type2test(L[start:stop:step]))
 
     def test_encoding(self):
-        sample = "Hello world\n\u1234\u5678\u9abc\udef0"
+        sample = "Hello world\n\u1234\u5678\u9abc"
         for enc in ("utf8", "utf16"):
             b = self.type2test(sample, enc)
             self.assertEqual(b, self.type2test(sample.encode(enc)))
         self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1")
         b = self.type2test(sample, "latin1", "ignore")
-        self.assertEqual(b, self.type2test(sample[:-4], "utf-8"))
+        self.assertEqual(b, self.type2test(sample[:-3], "utf-8"))
 
     def test_decode(self):
         sample = "Hello world\n\u1234\u5678\u9abc\def0\def0"
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 1730dbe..6706507 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -541,6 +541,17 @@
         self.check_state_handling_decode(self.encoding,
                                          u, u.encode(self.encoding))
 
+    def test_lone_surrogates(self):
+        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
+        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
+
+    def test_surrogates_handler(self):
+        self.assertEquals("abc\ud800def".encode("utf-8", "surrogates"),
+                          b"abc\xed\xa0\x80def")
+        self.assertEquals(b"abc\xed\xa0\x80def".decode("utf-8", "surrogates"),
+                          "abc\ud800def")
+        self.assertTrue(codecs.lookup_error("surrogates"))
+
 class UTF7Test(ReadTest):
     encoding = "utf-7"
 
@@ -1023,12 +1034,12 @@
                 # Skipped
                 continue
             # The Unicode strings are given in UTF-8
-            orig = str(orig, "utf-8")
+            orig = str(orig, "utf-8", "surrogates")
             if prepped is None:
                 # Input contains prohibited characters
                 self.assertRaises(UnicodeError, nameprep, orig)
             else:
-                prepped = str(prepped, "utf-8")
+                prepped = str(prepped, "utf-8", "surrogates")
                 try:
                     self.assertEquals(nameprep(orig), prepped)
                 except Exception as e:
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 1fddc06..220a8eb 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -886,10 +886,10 @@
         self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
         self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
         self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
-        self.assertEqual('\ud800'.encode('utf-8'), b'\xed\xa0\x80')
-        self.assertEqual('\udc00'.encode('utf-8'), b'\xed\xb0\x80')
+        self.assertEqual('\ud800'.encode('utf-8', 'surrogates'), b'\xed\xa0\x80')
+        self.assertEqual('\udc00'.encode('utf-8', 'surrogates'), b'\xed\xb0\x80')
         self.assertEqual(
-            ('\ud800\udc02'*1000).encode('utf-8'),
+            ('\ud800\udc02'*1000).encode('utf-8', 'surrogates'),
             b'\xf0\x90\x80\x82'*1000
         )
         self.assertEqual(
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index aed8eaa..b84aaaf 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -13,6 +13,7 @@
 import test.support
 
 encoding = 'utf-8'
+errors = 'surrogates'
 
 
 ### Run tests
@@ -61,7 +62,7 @@
                 (char + 'ABC').title(),
 
                 ]
-            h.update(''.join(data).encode(encoding))
+            h.update(''.join(data).encode(encoding, errors))
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)