bpo-31415: Add `-X importtime` option (GH-3490)

It shows show import time of each module.
It's useful for optimizing startup time.

Typical usage: python -X importtime -c 'import requests'
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index 5adad15..9c1c269 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -407,6 +407,10 @@
    * ``-X showalloccount`` to output the total count of allocated objects for
      each type when the program finishes. This only works when Python was built with
      ``COUNT_ALLOCS`` defined.
+   * ``-X importtime`` to show how long each import takes. It shows module name,
+     cumulative time (including nested imports) and self time (exluding nested
+     imports).  Note that its output may be broken in multi threaded application.
+     Typical usage is ``python3 -X importtime -c 'import asyncio'``.
 
    It also allows passing arbitrary values and retrieving them through the
    :data:`sys._xoptions` dictionary.
@@ -423,6 +427,9 @@
    .. versionadded:: 3.6
       The ``-X showalloccount`` option.
 
+   .. versionadded:: 3.7
+      The ``-X importtime`` option.
+
 
 Options you shouldn't use
 ~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-09-11-14-28-56.bpo-31415.GBdz7o.rst b/Misc/NEWS.d/next/Core and Builtins/2017-09-11-14-28-56.bpo-31415.GBdz7o.rst
new file mode 100644
index 0000000..52aa16a
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2017-09-11-14-28-56.bpo-31415.GBdz7o.rst
@@ -0,0 +1,2 @@
+Add ``-X importtime`` option to show how long each import takes. It can
+be used to optimize application's startup time.
diff --git a/Python/import.c b/Python/import.c
index e50ea4d..7554bf8 100644
--- a/Python/import.c
+++ b/Python/import.c
@@ -1667,8 +1667,38 @@
         }
     }
     else {
+        static int ximporttime = 0;
+        static int import_level;
+        static _PyTime_t accumulated;
+        _Py_IDENTIFIER(importtime);
+
+        _PyTime_t t1 = 0, accumulated_copy = accumulated;
+
         Py_XDECREF(mod);
 
+        /* XOptions is initialized after first some imports.
+         * So we can't have negative cache.
+         * Anyway, importlib.__find_and_load is much slower than
+         * _PyDict_GetItemId()
+         */
+        if (ximporttime == 0) {
+            PyObject *xoptions = PySys_GetXOptions();
+            if (xoptions) {
+                PyObject *value = _PyDict_GetItemId(xoptions, &PyId_importtime);
+                ximporttime = (value == Py_True);
+            }
+            if (ximporttime) {
+                fputs("import time: self [us] | cumulative | imported package\n",
+                      stderr);
+            }
+        }
+
+        if (ximporttime) {
+            import_level++;
+            t1 = _PyTime_GetMonotonicClock();
+            accumulated = 0;
+        }
+
         if (PyDTrace_IMPORT_FIND_LOAD_START_ENABLED())
             PyDTrace_IMPORT_FIND_LOAD_START(PyUnicode_AsUTF8(abs_name));
 
@@ -1680,6 +1710,18 @@
             PyDTrace_IMPORT_FIND_LOAD_DONE(PyUnicode_AsUTF8(abs_name),
                                            mod != NULL);
 
+        if (ximporttime) {
+            _PyTime_t cum = _PyTime_GetMonotonicClock() - t1;
+
+            import_level--;
+            fprintf(stderr, "import time: %9ld | %10ld | %*s%s\n",
+                    (long)_PyTime_AsMicroseconds(cum - accumulated, _PyTime_ROUND_CEILING),
+                    (long)_PyTime_AsMicroseconds(cum, _PyTime_ROUND_CEILING),
+                    import_level*2, "", PyUnicode_AsUTF8(abs_name));
+
+            accumulated = accumulated_copy + cum;
+        }
+
         if (mod == NULL) {
             goto error;
         }