bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2294)

The current regex-based splitting produces a wrong result. For example::

    http://abc#@def

Web browsers parse that URL as ``http://abc/#@def``, that is, the host is ``abc``, the path is ``/``, and the fragment is ``#@def``.

(cherry picked from commit 90e01e50ef8a9e6c91f30d965563c378a4ad26de)

commit: d4324baca4c03eb8d55446cd1b74b32ec5633af5 [log] [tgz]
author: Victor Stinner <victor.stinner@gmail.com> Tue Jun 20 16:20:36 2017 +0200
committer: GitHub <noreply@github.com> Tue Jun 20 16:20:36 2017 +0200
tree: 4144e9fae2d0d6913340f9c0ff9c96612abd0f26
parent: b39a7481ee7e6166d6d2b252a7a514b1f6553dfa [diff]
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index 14de91e..1ce9201 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py

@@ -879,6 +879,26 @@
         self.assertEqual(splithost('/foo/bar/baz.html'),
                          (None, '/foo/bar/baz.html'))
 
+        # bpo-30500: # starts a fragment.
+        self.assertEqual(splithost('//127.0.0.1#@host.com'),
+                         ('127.0.0.1', '/#@host.com'))
+        self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
+                         ('127.0.0.1', '/#@host.com:80'))
+        self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
+                         ('127.0.0.1:80', '/#@host.com'))
+
+        # Empty host is returned as empty string.
+        self.assertEqual(splithost("///file"),
+                         ('', '/file'))
+
+        # Trailing semicolon, question mark and hash symbol are kept.
+        self.assertEqual(splithost("//example.net/file;"),
+                         ('example.net', '/file;'))
+        self.assertEqual(splithost("//example.net/file?"),
+                         ('example.net', '/file?'))
+        self.assertEqual(splithost("//example.net/file#"),
+                         ('example.net', '/file#'))
+
     def test_splituser(self):
         splituser = urllib.splituser
         self.assertEqual(splituser('User:Pass@www.python.org:080'),

diff --git a/Lib/urllib.py b/Lib/urllib.py
index c3c8ef4..d85504a 100644
--- a/Lib/urllib.py
+++ b/Lib/urllib.py

@@ -1093,8 +1093,7 @@
     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
     global _hostprog
     if _hostprog is None:
-        import re
-        _hostprog = re.compile('^//([^/?]*)(.*)$')
+        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
 
     match = _hostprog.match(url)
     if match:
commit	d4324baca4c03eb8d55446cd1b74b32ec5633af5	[log] [tgz]
author	Victor Stinner <victor.stinner@gmail.com>	Tue Jun 20 16:20:36 2017 +0200
committer	GitHub <noreply@github.com>	Tue Jun 20 16:20:36 2017 +0200
tree	4144e9fae2d0d6913340f9c0ff9c96612abd0f26
parent	b39a7481ee7e6166d6d2b252a7a514b1f6553dfa [diff]