bpo-36673: Implement comment/PI parsing support for the TreeBuilder in ElementTree. (#12883)

* bpo-36673: Implement comment/PI parsing support for the TreeBuilder in ElementTree.

* bpo-36673: Rewrite the comment/PI factory handling for the TreeBuilder in "_elementtree" to make it use the same factories as the ElementTree module, and to make it explicit when the comments/PIs are inserted into the tree and when they are not (which is the default).
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index e0d2cb7..8a228b8 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1194,6 +1194,12 @@
             for i in range(0, len(data), chunk_size):
                 parser.feed(data[i:i+chunk_size])
 
+    def assert_events(self, parser, expected):
+        self.assertEqual(
+            [(event, (elem.tag, elem.text))
+             for event, elem in parser.read_events()],
+            expected)
+
     def assert_event_tags(self, parser, expected):
         events = parser.read_events()
         self.assertEqual([(action, elem.tag) for action, elem in events],
@@ -1276,8 +1282,10 @@
         self.assert_event_tags(parser, [])
 
         parser = ET.XMLPullParser(events=('start', 'end'))
-        self._feed(parser, "<!-- comment -->\n")
-        self.assert_event_tags(parser, [])
+        self._feed(parser, "<!-- text here -->\n")
+        self.assert_events(parser, [])
+
+        parser = ET.XMLPullParser(events=('start', 'end'))
         self._feed(parser, "<root>\n")
         self.assert_event_tags(parser, [('start', 'root')])
         self._feed(parser, "<element key='value'>text</element")
@@ -1314,6 +1322,33 @@
         self._feed(parser, "</root>")
         self.assertIsNone(parser.close())
 
+    def test_events_comment(self):
+        parser = ET.XMLPullParser(events=('start', 'comment', 'end'))
+        self._feed(parser, "<!-- text here -->\n")
+        self.assert_events(parser, [('comment', (ET.Comment, ' text here '))])
+        self._feed(parser, "<!-- more text here -->\n")
+        self.assert_events(parser, [('comment', (ET.Comment, ' more text here '))])
+        self._feed(parser, "<root-tag>text")
+        self.assert_event_tags(parser, [('start', 'root-tag')])
+        self._feed(parser, "<!-- inner comment-->\n")
+        self.assert_events(parser, [('comment', (ET.Comment, ' inner comment'))])
+        self._feed(parser, "</root-tag>\n")
+        self.assert_event_tags(parser, [('end', 'root-tag')])
+        self._feed(parser, "<!-- outer comment -->\n")
+        self.assert_events(parser, [('comment', (ET.Comment, ' outer comment '))])
+
+        parser = ET.XMLPullParser(events=('comment',))
+        self._feed(parser, "<!-- text here -->\n")
+        self.assert_events(parser, [('comment', (ET.Comment, ' text here '))])
+
+    def test_events_pi(self):
+        parser = ET.XMLPullParser(events=('start', 'pi', 'end'))
+        self._feed(parser, "<?pitarget?>\n")
+        self.assert_events(parser, [('pi', (ET.PI, 'pitarget'))])
+        parser = ET.XMLPullParser(events=('pi',))
+        self._feed(parser, "<?pitarget some text ?>\n")
+        self.assert_events(parser, [('pi', (ET.PI, 'pitarget some text '))])
+
     def test_events_sequence(self):
         # Test that events can be some sequence that's not just a tuple or list
         eventset = {'end', 'start'}
@@ -1333,7 +1368,6 @@
         self._feed(parser, "<foo>bar</foo>")
         self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')])
 
-
     def test_unknown_event(self):
         with self.assertRaises(ValueError):
             ET.XMLPullParser(events=('start', 'end', 'bogus'))
@@ -2741,6 +2775,33 @@
         parser.feed(self.sample1)
         self.assertIsNone(parser.close())
 
+    def test_treebuilder_comment(self):
+        b = ET.TreeBuilder()
+        self.assertEqual(b.comment('ctext').tag, ET.Comment)
+        self.assertEqual(b.comment('ctext').text, 'ctext')
+
+        b = ET.TreeBuilder(comment_factory=ET.Comment)
+        self.assertEqual(b.comment('ctext').tag, ET.Comment)
+        self.assertEqual(b.comment('ctext').text, 'ctext')
+
+        b = ET.TreeBuilder(comment_factory=len)
+        self.assertEqual(b.comment('ctext'), len('ctext'))
+
+    def test_treebuilder_pi(self):
+        b = ET.TreeBuilder()
+        self.assertEqual(b.pi('target', None).tag, ET.PI)
+        self.assertEqual(b.pi('target', None).text, 'target')
+
+        b = ET.TreeBuilder(pi_factory=ET.PI)
+        self.assertEqual(b.pi('target').tag, ET.PI)
+        self.assertEqual(b.pi('target').text, "target")
+        self.assertEqual(b.pi('pitarget', ' text ').tag, ET.PI)
+        self.assertEqual(b.pi('pitarget', ' text ').text, "pitarget  text ")
+
+        b = ET.TreeBuilder(pi_factory=lambda target, text: (len(target), text))
+        self.assertEqual(b.pi('target'), (len('target'), None))
+        self.assertEqual(b.pi('pitarget', ' text '), (len('pitarget'), ' text '))
+
     def test_treebuilder_elementfactory_none(self):
         parser = ET.XMLParser(target=ET.TreeBuilder(element_factory=None))
         parser.feed(self.sample1)
@@ -2761,6 +2822,21 @@
         e = parser.close()
         self._check_sample1_element(e)
 
+    def test_subclass_comment_pi(self):
+        class MyTreeBuilder(ET.TreeBuilder):
+            def foobar(self, x):
+                return x * 2
+
+        tb = MyTreeBuilder(comment_factory=ET.Comment, pi_factory=ET.PI)
+        self.assertEqual(tb.foobar(10), 20)
+
+        parser = ET.XMLParser(target=tb)
+        parser.feed(self.sample1)
+        parser.feed('<!-- a comment--><?and a pi?>')
+
+        e = parser.close()
+        self._check_sample1_element(e)
+
     def test_element_factory(self):
         lst = []
         def myfactory(tag, attrib):
@@ -3418,6 +3494,12 @@
     # Copy the path cache (should be empty)
     path_cache = ElementPath._cache
     ElementPath._cache = path_cache.copy()
+    # Align the Comment/PI factories.
+    if hasattr(ET, '_set_factories'):
+        old_factories = ET._set_factories(ET.Comment, ET.PI)
+    else:
+        old_factories = None
+
     try:
         support.run_unittest(*test_classes)
     finally:
@@ -3426,6 +3508,8 @@
         nsmap.clear()
         nsmap.update(nsmap_copy)
         ElementPath._cache = path_cache
+        if old_factories is not None:
+            ET._set_factories(*old_factories)
         # don't interfere with subsequent tests
         ET = pyET = None
 
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
index c9e2f36..c640048 100644
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -1374,12 +1374,30 @@
     *element_factory* is an optional element factory which is called
     to create new Element instances, as necessary.
 
+    *comment_factory* is a factory to create comments to be used instead of
+    the standard factory.  If *insert_comments* is false (the default),
+    comments will not be inserted into the tree.
+
+    *pi_factory* is a factory to create processing instructions to be used
+    instead of the standard factory.  If *insert_pis* is false (the default),
+    processing instructions will not be inserted into the tree.
     """
-    def __init__(self, element_factory=None):
+    def __init__(self, element_factory=None, *,
+                 comment_factory=None, pi_factory=None,
+                 insert_comments=False, insert_pis=False):
         self._data = [] # data collector
         self._elem = [] # element stack
         self._last = None # last element
+        self._root = None # root element
         self._tail = None # true if we're after an end tag
+        if comment_factory is None:
+            comment_factory = Comment
+        self._comment_factory = comment_factory
+        self.insert_comments = insert_comments
+        if pi_factory is None:
+            pi_factory = ProcessingInstruction
+        self._pi_factory = pi_factory
+        self.insert_pis = insert_pis
         if element_factory is None:
             element_factory = Element
         self._factory = element_factory
@@ -1387,8 +1405,8 @@
     def close(self):
         """Flush builder buffers and return toplevel document Element."""
         assert len(self._elem) == 0, "missing end tags"
-        assert self._last is not None, "missing toplevel element"
-        return self._last
+        assert self._root is not None, "missing toplevel element"
+        return self._root
 
     def _flush(self):
         if self._data:
@@ -1417,6 +1435,8 @@
         self._last = elem = self._factory(tag, attrs)
         if self._elem:
             self._elem[-1].append(elem)
+        elif self._root is None:
+            self._root = elem
         self._elem.append(elem)
         self._tail = 0
         return elem
@@ -1435,6 +1455,33 @@
         self._tail = 1
         return self._last
 
+    def comment(self, text):
+        """Create a comment using the comment_factory.
+
+        *text* is the text of the comment.
+        """
+        return self._handle_single(
+            self._comment_factory, self.insert_comments, text)
+
+    def pi(self, target, text=None):
+        """Create a processing instruction using the pi_factory.
+
+        *target* is the target name of the processing instruction.
+        *text* is the data of the processing instruction, or ''.
+        """
+        return self._handle_single(
+            self._pi_factory, self.insert_pis, target, text)
+
+    def _handle_single(self, factory, insert, *args):
+        elem = factory(*args)
+        if insert:
+            self._flush()
+            self._last = elem
+            if self._elem:
+                self._elem[-1].append(elem)
+            self._tail = 1
+        return elem
+
 
 # also see ElementTree and TreeBuilder
 class XMLParser:
@@ -1519,6 +1566,15 @@
                 def handler(prefix, event=event_name, append=append):
                     append((event, None))
                 parser.EndNamespaceDeclHandler = handler
+            elif event_name == 'comment':
+                def handler(text, event=event_name, append=append, self=self):
+                    append((event, self.target.comment(text)))
+                parser.CommentHandler = handler
+            elif event_name == 'pi':
+                def handler(pi_target, data, event=event_name, append=append,
+                            self=self):
+                    append((event, self.target.pi(pi_target, data)))
+                parser.ProcessingInstructionHandler = handler
             else:
                 raise ValueError("unknown event %r" % event_name)
 
@@ -1640,7 +1696,10 @@
     # (see tests)
     _Element_Py = Element
 
-    # Element, SubElement, ParseError, TreeBuilder, XMLParser
+    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
     from _elementtree import *
+    from _elementtree import _set_factories
 except ImportError:
     pass
+else:
+    _set_factories(Comment, ProcessingInstruction)