Issue #15038: Optimize python Locks on Windows
Extract cross-platform condition variable support into a separate file and
provide user-mode non-recursive locks for Windows.
diff --git a/Python/ceval_gil.h b/Python/ceval_gil.h
index e7764f2..2702d5c 100644
--- a/Python/ceval_gil.h
+++ b/Python/ceval_gil.h
@@ -59,213 +59,49 @@
      (Note: this mechanism is enabled with FORCE_SWITCHING above)
 */
 
-#ifndef _POSIX_THREADS
-/* This means pthreads are not implemented in libc headers, hence the macro
-   not present in unistd.h. But they still can be implemented as an external
-   library (e.g. gnu pth in pthread emulation) */
-# ifdef HAVE_PTHREAD_H
-#  include <pthread.h> /* _POSIX_THREADS */
-# endif
+#include "condvar.h"
+#ifndef Py_HAVE_CONDVAR
+#error You need either a POSIX-compatible or a Windows system!
 #endif
 
-
-#ifdef _POSIX_THREADS
-
-/*
- * POSIX support
- */
-
-#include <pthread.h>
-
-#define ADD_MICROSECONDS(tv, interval) \
-do { \
-    tv.tv_usec += (long) interval; \
-    tv.tv_sec += tv.tv_usec / 1000000; \
-    tv.tv_usec %= 1000000; \
-} while (0)
-
-/* We assume all modern POSIX systems have gettimeofday() */
-#ifdef GETTIMEOFDAY_NO_TZ
-#define GETTIMEOFDAY(ptv) gettimeofday(ptv)
-#else
-#define GETTIMEOFDAY(ptv) gettimeofday(ptv, (struct timezone *)NULL)
-#endif
-
-#define MUTEX_T pthread_mutex_t
+#define MUTEX_T PyMUTEX_T
 #define MUTEX_INIT(mut) \
-    if (pthread_mutex_init(&mut, NULL)) { \
-        Py_FatalError("pthread_mutex_init(" #mut ") failed"); };
+    if (PyMUTEX_INIT(&(mut))) { \
+        Py_FatalError("PyMUTEX_INIT(" #mut ") failed"); };
 #define MUTEX_FINI(mut) \
-    if (pthread_mutex_destroy(&mut)) { \
-        Py_FatalError("pthread_mutex_destroy(" #mut ") failed"); };
+    if (PyMUTEX_FINI(&(mut))) { \
+        Py_FatalError("PyMUTEX_FINI(" #mut ") failed"); };
 #define MUTEX_LOCK(mut) \
-    if (pthread_mutex_lock(&mut)) { \
-        Py_FatalError("pthread_mutex_lock(" #mut ") failed"); };
+    if (PyMUTEX_LOCK(&(mut))) { \
+        Py_FatalError("PyMUTEX_LOCK(" #mut ") failed"); };
 #define MUTEX_UNLOCK(mut) \
-    if (pthread_mutex_unlock(&mut)) { \
-        Py_FatalError("pthread_mutex_unlock(" #mut ") failed"); };
+    if (PyMUTEX_UNLOCK(&(mut))) { \
+        Py_FatalError("PyMUTEX_UNLOCK(" #mut ") failed"); };
 
-#define COND_T pthread_cond_t
+#define COND_T PyCOND_T
 #define COND_INIT(cond) \
-    if (pthread_cond_init(&cond, NULL)) { \
-        Py_FatalError("pthread_cond_init(" #cond ") failed"); };
+    if (PyCOND_INIT(&(cond))) { \
+        Py_FatalError("PyCOND_INIT(" #cond ") failed"); };
 #define COND_FINI(cond) \
-    if (pthread_cond_destroy(&cond)) { \
-        Py_FatalError("pthread_cond_destroy(" #cond ") failed"); };
+    if (PyCOND_FINI(&(cond))) { \
+        Py_FatalError("PyCOND_FINI(" #cond ") failed"); };
 #define COND_SIGNAL(cond) \
-    if (pthread_cond_signal(&cond)) { \
-        Py_FatalError("pthread_cond_signal(" #cond ") failed"); };
+    if (PyCOND_SIGNAL(&(cond))) { \
+        Py_FatalError("PyCOND_SIGNAL(" #cond ") failed"); };
 #define COND_WAIT(cond, mut) \
-    if (pthread_cond_wait(&cond, &mut)) { \
-        Py_FatalError("pthread_cond_wait(" #cond ") failed"); };
+    if (PyCOND_WAIT(&(cond), &(mut))) { \
+        Py_FatalError("PyCOND_WAIT(" #cond ") failed"); };
 #define COND_TIMED_WAIT(cond, mut, microseconds, timeout_result) \
     { \
-        int r; \
-        struct timespec ts; \
-        struct timeval deadline; \
-        \
-        GETTIMEOFDAY(&deadline); \
-        ADD_MICROSECONDS(deadline, microseconds); \
-        ts.tv_sec = deadline.tv_sec; \
-        ts.tv_nsec = deadline.tv_usec * 1000; \
-        \
-        r = pthread_cond_timedwait(&cond, &mut, &ts); \
-        if (r == ETIMEDOUT) \
+        int r = PyCOND_TIMEDWAIT(&(cond), &(mut), (microseconds)); \
+        if (r < 0) \
+            Py_FatalError("PyCOND_TIMEDWAIT(" #cond ") failed"); \
+        if (r) /* 1 == timeout, 2 == impl. can't say, so assume timeout */ \
             timeout_result = 1; \
-        else if (r) \
-            Py_FatalError("pthread_cond_timedwait(" #cond ") failed"); \
         else \
             timeout_result = 0; \
     } \
 
-#elif defined(NT_THREADS)
-
-/*
- * Windows (2000 and later, as well as (hopefully) CE) support
- */
-
-#include <windows.h>
-
-#define MUTEX_T CRITICAL_SECTION
-#define MUTEX_INIT(mut) do { \
-    if (!(InitializeCriticalSectionAndSpinCount(&(mut), 4000))) \
-        Py_FatalError("CreateMutex(" #mut ") failed"); \
-} while (0)
-#define MUTEX_FINI(mut) \
-    DeleteCriticalSection(&(mut))
-#define MUTEX_LOCK(mut) \
-    EnterCriticalSection(&(mut))
-#define MUTEX_UNLOCK(mut) \
-    LeaveCriticalSection(&(mut))
-
-/* We emulate condition variables with a semaphore.
-   We use a Semaphore rather than an auto-reset event, because although
-   an auto-resent event might appear to solve the lost-wakeup bug (race
-   condition between releasing the outer lock and waiting) because it
-   maintains state even though a wait hasn't happened, there is still
-   a lost wakeup problem if more than one thread are interrupted in the
-   critical place.  A semaphore solves that.
-   Because it is ok to signal a condition variable with no one
-   waiting, we need to keep track of the number of
-   waiting threads.  Otherwise, the semaphore's state could rise
-   without bound.
-
-   Generic emulations of the pthread_cond_* API using
-   Win32 functions can be found on the Web.
-   The following read can be edificating (or not):
-   http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
-*/
-typedef struct COND_T
-{
-    HANDLE sem;    /* the semaphore */
-    int n_waiting; /* how many are unreleased */
-} COND_T;
-
-__inline static void _cond_init(COND_T *cond)
-{
-    /* A semaphore with a large max value,  The positive value
-     * is only needed to catch those "lost wakeup" events and
-     * race conditions when a timed wait elapses.
-     */
-    if (!(cond->sem = CreateSemaphore(NULL, 0, 1000, NULL)))
-        Py_FatalError("CreateSemaphore() failed");
-    cond->n_waiting = 0;
-}
-
-__inline static void _cond_fini(COND_T *cond)
-{
-    BOOL ok = CloseHandle(cond->sem);
-    if (!ok)
-        Py_FatalError("CloseHandle() failed");
-}
-
-__inline static void _cond_wait(COND_T *cond, MUTEX_T *mut)
-{
-    ++cond->n_waiting;
-    MUTEX_UNLOCK(*mut);
-    /* "lost wakeup bug" would occur if the caller were interrupted here,
-     * but we are safe because we are using a semaphore wich has an internal
-     * count.
-     */
-    if (WaitForSingleObject(cond->sem, INFINITE) == WAIT_FAILED)
-        Py_FatalError("WaitForSingleObject() failed");
-    MUTEX_LOCK(*mut);
-}
-
-__inline static int _cond_timed_wait(COND_T *cond, MUTEX_T *mut,
-                              int us)
-{
-    DWORD r;
-    ++cond->n_waiting;
-    MUTEX_UNLOCK(*mut);
-    r = WaitForSingleObject(cond->sem, us / 1000);
-    if (r == WAIT_FAILED)
-        Py_FatalError("WaitForSingleObject() failed");
-    MUTEX_LOCK(*mut);
-    if (r == WAIT_TIMEOUT)
-        --cond->n_waiting;
-        /* Here we have a benign race condition with _cond_signal.  If the
-         * wait operation has timed out, but before we can acquire the
-         * mutex again to decrement n_waiting, a thread holding the mutex
-         * still sees a positive n_waiting value and may call
-         * ReleaseSemaphore and decrement n_waiting.
-         * This will cause n_waiting to be decremented twice.
-         * This is benign, though, because ReleaseSemaphore will also have
-         * been called, leaving the semaphore state positive.  We may
-         * thus end up with semaphore in state 1, and n_waiting == -1, and
-         * the next time someone calls _cond_wait(), that thread will
-         * pass right through, decrementing the semaphore state and
-         * incrementing n_waiting, thus correcting the extra _cond_signal.
-         */
-    return r == WAIT_TIMEOUT;
-}
-
-__inline static void _cond_signal(COND_T  *cond) {
-    /* NOTE: This must be called with the mutex held */
-    if (cond->n_waiting > 0) {
-        if (!ReleaseSemaphore(cond->sem, 1, NULL))
-            Py_FatalError("ReleaseSemaphore() failed");
-        --cond->n_waiting;
-    }
-}
-
-#define COND_INIT(cond) \
-    _cond_init(&(cond))
-#define COND_FINI(cond) \
-    _cond_fini(&(cond))
-#define COND_SIGNAL(cond) \
-    _cond_signal(&(cond))
-#define COND_WAIT(cond, mut) \
-    _cond_wait(&(cond), &(mut))
-#define COND_TIMED_WAIT(cond, mut, us, timeout_result) do { \
-    (timeout_result) = _cond_timed_wait(&(cond), &(mut), us); \
-} while (0)
-
-#else
-
-#error You need either a POSIX-compatible or a Windows system!
-
-#endif /* _POSIX_THREADS, NT_THREADS */
 
 
 /* Whether the GIL is already taken (-1 if uninitialized). This is atomic
@@ -356,13 +192,13 @@
         MUTEX_LOCK(switch_mutex);
         /* Not switched yet => wait */
         if (_Py_atomic_load_relaxed(&gil_last_holder) == tstate) {
-	    RESET_GIL_DROP_REQUEST();
+        RESET_GIL_DROP_REQUEST();
             /* NOTE: if COND_WAIT does not atomically start waiting when
                releasing the mutex, another thread can run through, take
                the GIL and drop it again, and reset the condition
                before we even had a chance to wait for it. */
             COND_WAIT(switch_cond, switch_mutex);
-	}
+    }
         MUTEX_UNLOCK(switch_mutex);
     }
 #endif
diff --git a/Python/condvar.h b/Python/condvar.h
new file mode 100644
index 0000000..8d3c595
--- /dev/null
+++ b/Python/condvar.h
@@ -0,0 +1,353 @@
+/* 
+ * Portable condition variable support for windows and pthreads.
+ * Everything is inline, this header can be included where needed.
+ *
+ * APIs generally return 0 on success and non-zero on error,
+ * and the caller needs to use its platform's error mechanism to
+ * discover the error (errno, or GetLastError())
+ *
+ * Note that some implementations cannot distinguish between a
+ * condition variable wait time-out and successful wait. Most often
+ * the difference is moot anyway since the wait condition must be
+ * re-checked.
+ * PyCOND_TIMEDWAIT, in addition to returning negative on error,
+ * thus returns 0 on regular success, 1 on timeout
+ * or 2 if it can't tell.
+ */
+
+#ifndef _CONDVAR_H_
+#define _CONDVAR_H_
+
+#include "Python.h"
+
+#ifndef _POSIX_THREADS
+/* This means pthreads are not implemented in libc headers, hence the macro
+   not present in unistd.h. But they still can be implemented as an external
+   library (e.g. gnu pth in pthread emulation) */
+# ifdef HAVE_PTHREAD_H
+#  include <pthread.h> /* _POSIX_THREADS */
+# endif
+#endif
+
+#ifdef _POSIX_THREADS
+/*
+ * POSIX support
+ */
+#define Py_HAVE_CONDVAR
+
+#include <pthread.h>
+
+#define PyCOND_ADD_MICROSECONDS(tv, interval) /* tv: struct timeval lvalue */ \
+do { \
+    (tv).tv_usec += (long) (interval); \
+    (tv).tv_sec += (tv).tv_usec / 1000000; /* carry whole seconds */ \
+    (tv).tv_usec %= 1000000; \
+} while (0)
+
+/* We assume all modern POSIX systems have gettimeofday() */
+#ifdef GETTIMEOFDAY_NO_TZ
+#define PyCOND_GETTIMEOFDAY(ptv) gettimeofday(ptv)
+#else
+#define PyCOND_GETTIMEOFDAY(ptv) gettimeofday(ptv, (struct timezone *)NULL)
+#endif
+
+/* The following functions return 0 on success, nonzero on error */
+#define PyMUTEX_T pthread_mutex_t
+#define PyMUTEX_INIT(mut)       pthread_mutex_init((mut), NULL)
+#define PyMUTEX_FINI(mut)       pthread_mutex_destroy(mut)
+#define PyMUTEX_LOCK(mut)       pthread_mutex_lock(mut)
+#define PyMUTEX_UNLOCK(mut)     pthread_mutex_unlock(mut)
+
+#define PyCOND_T pthread_cond_t
+#define PyCOND_INIT(cond)       pthread_cond_init((cond), NULL)
+#define PyCOND_FINI(cond)       pthread_cond_destroy(cond)
+#define PyCOND_SIGNAL(cond)     pthread_cond_signal(cond)
+#define PyCOND_BROADCAST(cond)  pthread_cond_broadcast(cond)
+#define PyCOND_WAIT(cond, mut)  pthread_cond_wait((cond), (mut))
+
+/* Wait on cond for at most `us` microseconds, measured from now.
+ * Yields 0 on success, 1 if the deadline passed, -1 on any other error.
+ */
+Py_LOCAL_INLINE(int)
+PyCOND_TIMEDWAIT(PyCOND_T *cond, PyMUTEX_T *mut, long us)
+{
+    struct timeval now_plus_us;
+    struct timespec abstime;
+    int status;
+
+    /* pthread_cond_timedwait() takes an absolute deadline */
+    PyCOND_GETTIMEOFDAY(&now_plus_us);
+    PyCOND_ADD_MICROSECONDS(now_plus_us, us);
+    abstime.tv_sec = now_plus_us.tv_sec;
+    abstime.tv_nsec = now_plus_us.tv_usec * 1000;
+
+    status = pthread_cond_timedwait(cond, mut, &abstime);
+    if (status == 0)
+        return 0;
+    return (status == ETIMEDOUT) ? 1 : -1;
+}
+
+#elif defined(NT_THREADS)
+/*
+ * Windows (XP, 2003 server and later, as well as (hopefully) CE) support
+ *
+ * Emulated condition variables that work with XP and later, plus
+ * example native support for Vista and onwards.
+ */
+#define Py_HAVE_CONDVAR
+
+
+/* include windows if it hasn't been done before */
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+/* options */
+/* non-emulated condition variables are provided for those that want
+ * to target Windows Vista.  Modify this macro to enable them.
+ */
+#ifndef _PY_EMULATED_WIN_CV
+#define _PY_EMULATED_WIN_CV 1  /* use emulated condition variables */
+#endif
+
+/* fall back to emulation if not targeting Vista */
+#if !defined NTDDI_VISTA || NTDDI_VERSION < NTDDI_VISTA
+#undef _PY_EMULATED_WIN_CV
+#define _PY_EMULATED_WIN_CV 1
+#endif
+
+
+#if _PY_EMULATED_WIN_CV
+
+/* The mutex is a CriticalSection object and
+   the condition variable is emulated with the help of a semaphore.
+   Semaphores are available on Windows XP (2003 server) and later.
+   We use a Semaphore rather than an auto-reset event, because although
+   an auto-reset event might appear to solve the lost-wakeup bug (race
+   condition between releasing the outer lock and waiting) because it
+   maintains state even though a wait hasn't happened, there is still
+   a lost wakeup problem if more than one thread are interrupted in the
+   critical place.  A semaphore solves that, because its state is counted,
+   not Boolean.
+   Because it is ok to signal a condition variable with no one
+   waiting, we need to keep track of the number of
+   waiting threads.  Otherwise, the semaphore's state could rise
+   without bound.  This also helps reduce the number of "spurious wakeups"
+   that would otherwise happen.
+
+   Generic emulations of the pthread_cond_* API using
+   earlier Win32 functions can be found on the Web.
+   The following read can be edifying (or not):
+   http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+*/
+
+typedef CRITICAL_SECTION PyMUTEX_T;
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_INIT(PyMUTEX_T *cs)
+{
+    InitializeCriticalSection(cs);
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_FINI(PyMUTEX_T *cs)
+{
+    DeleteCriticalSection(cs);
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_LOCK(PyMUTEX_T *cs)
+{
+    EnterCriticalSection(cs);
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_UNLOCK(PyMUTEX_T *cs)
+{
+    LeaveCriticalSection(cs);
+    return 0;
+}
+
+/* The ConditionVariable object.  From XP onwards it is easily emulated with
+ * a Semaphore
+ */
+
+typedef struct _PyCOND_T
+{
+    HANDLE sem;
+    int waiting;
+} PyCOND_T;
+
+Py_LOCAL_INLINE(int)
+PyCOND_INIT(PyCOND_T *cv)
+{
+    /* A semaphore with a "large" max value,  The positive value
+     * is only needed to catch those "lost wakeup" events and
+     * race conditions when a timed wait elapses.
+     */
+    cv->sem = CreateSemaphore(NULL, 0, 100000, NULL);
+    if (cv->sem==NULL)
+        return -1;
+    cv->waiting = 0;
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_FINI(PyCOND_T *cv)
+{
+    return CloseHandle(cv->sem) ? 0 : -1;
+}
+
+/* Wait up to ms milliseconds with cs released, re-locking it afterwards.
+ * Returns 0 if the semaphore was acquired, 1 on timeout, -1 on error.
+ */
+Py_LOCAL_INLINE(int)
+_PyCOND_WAIT_MS(PyCOND_T *cv, PyMUTEX_T *cs, DWORD ms)
+{
+    DWORD wait;
+    cv->waiting++;
+    PyMUTEX_UNLOCK(cs);
+    /* "lost wakeup bug" would occur if the caller were interrupted here,
+     * but we are safe because we are using a semaphore which has an internal
+     * count.
+     */
+    wait = WaitForSingleObject(cv->sem, ms);
+    PyMUTEX_LOCK(cs);
+    if (wait != WAIT_OBJECT_0)
+        --cv->waiting;
+        /* Here we have a benign race condition with PyCOND_SIGNAL.
+         * When failure occurs or timeout, it is possible that
+         * PyCOND_SIGNAL also decrements this value
+         * and releases the semaphore.  This is benign because it
+         * just means an extra spurious wakeup for a waiting thread.
+         */
+         
+    if (wait == WAIT_FAILED)
+        return -1;
+    /* return 0 on success, 1 on timeout */
+    return wait != WAIT_OBJECT_0;
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_WAIT(PyCOND_T *cv, PyMUTEX_T *cs)
+{
+    int result = _PyCOND_WAIT_MS(cv, cs, INFINITE);
+    return result >= 0 ? 0 : result;
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_TIMEDWAIT(PyCOND_T *cv, PyMUTEX_T *cs, long us)
+{
+    return _PyCOND_WAIT_MS(cv, cs, us/1000);
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_SIGNAL(PyCOND_T *cv)
+{
+    if (cv->waiting > 0) {
+        /* Only signal when a waiter is counted: the count can dip below
+         * zero after a timed-out wait races with a signal.  Decrementing
+         * here keeps a slow wakeup from triggering extra releases.
+         */
+        cv->waiting--;
+        return ReleaseSemaphore(cv->sem, 1, NULL) ? 0 : -1;
+    }
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_BROADCAST(PyCOND_T *cv)
+{
+    int waiting = cv->waiting;
+    if (waiting <= 0)
+        return 0;  /* nobody is counted as waiting: nothing to release */
+    cv->waiting = 0;  /* must happen before the return, not after it */
+    return ReleaseSemaphore(cv->sem, waiting, NULL) ? 0 : -1;
+}
+
+#else
+
+/* Use native Vista primitives if build target is Vista or higher */
+
+/* SRWLOCK is faster and better than CriticalSection */
+typedef SRWLOCK PyMUTEX_T;
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_INIT(PyMUTEX_T *cs)
+{
+    InitializeSRWLock(cs);
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_FINI(PyMUTEX_T *cs)
+{
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_LOCK(PyMUTEX_T *cs)
+{
+    AcquireSRWLockExclusive(cs);
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyMUTEX_UNLOCK(PyMUTEX_T *cs)
+{
+    ReleaseSRWLockExclusive(cs);
+    return 0;
+}
+
+
+typedef CONDITION_VARIABLE  PyCOND_T;
+
+Py_LOCAL_INLINE(int)
+PyCOND_INIT(PyCOND_T *cv)
+{
+    InitializeConditionVariable(cv);
+    return 0;
+}
+Py_LOCAL_INLINE(int)
+PyCOND_FINI(PyCOND_T *cv)
+{
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_WAIT(PyCOND_T *cv, PyMUTEX_T *cs)
+{
+    return SleepConditionVariableSRW(cv, cs, INFINITE, 0) ? 0 : -1;
+}
+
+/* Returns 0 when woken, 1 on timeout and -1 on any other failure. */
+Py_LOCAL_INLINE(int)
+PyCOND_TIMEDWAIT(PyCOND_T *cv, PyMUTEX_T *cs, long us)
+{
+    if (SleepConditionVariableSRW(cv, cs, us/1000, 0))
+        return 0;
+    return GetLastError() == ERROR_TIMEOUT ? 1 : -1;  /* timeout returns FALSE too */
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_SIGNAL(PyCOND_T *cv)
+{
+     WakeConditionVariable(cv);
+     return 0;
+}
+
+Py_LOCAL_INLINE(int)
+PyCOND_BROADCAST(PyCOND_T *cv)
+{
+     WakeAllConditionVariable(cv);
+     return 0;
+}
+
+
+#endif /* _PY_EMULATED_WIN_CV */
+
+#endif /* _POSIX_THREADS, NT_THREADS */
+
+#endif /* _CONDVAR_H_ */
diff --git a/Python/thread_nt.h b/Python/thread_nt.h
index d1bb0e5..938bf1e 100644
--- a/Python/thread_nt.h
+++ b/Python/thread_nt.h
@@ -9,6 +9,109 @@
 #include <process.h>
 #endif
 
+/* options */
+#ifndef _PY_USE_CV_LOCKS
+#define _PY_USE_CV_LOCKS 1     /* use locks based on cond vars */
+#endif
+
+/* Now, define a non-recursive mutex using either condition variables
+ * and critical sections (fast) or using operating system mutexes
+ * (slow)
+ */
+
+#if _PY_USE_CV_LOCKS
+
+#include "condvar.h"
+
+typedef struct _NRMUTEX
+{
+    PyMUTEX_T cs;
+    PyCOND_T cv;
+    int locked;
+} NRMUTEX;
+typedef NRMUTEX *PNRMUTEX;
+
+/* Allocate an unlocked non-recursive mutex; returns NULL on any failure. */
+PNRMUTEX
+AllocNonRecursiveMutex()
+{
+    PNRMUTEX nrm = (PNRMUTEX)malloc(sizeof(NRMUTEX));
+    if (nrm == NULL)
+        return NULL;
+    if (PyCOND_INIT(&nrm->cv) == 0) {
+        if (PyMUTEX_INIT(&nrm->cs) == 0) {
+            nrm->locked = 0;
+            return nrm;
+        }
+        PyCOND_FINI(&nrm->cv);
+    }
+    /* an initialisation step failed: give the memory back */
+    free(nrm);
+    return NULL;
+}
+
+VOID
+FreeNonRecursiveMutex(PNRMUTEX mutex)
+{
+    if (mutex == NULL)  /* nothing to release */
+        return;
+    PyCOND_FINI(&mutex->cv);
+    PyMUTEX_FINI(&mutex->cs);
+    free(mutex);
+}
+
+/* Returns WAIT_OBJECT_0 once acquired, else WAIT_TIMEOUT or WAIT_FAILED. */
+DWORD
+EnterNonRecursiveMutex(PNRMUTEX mutex, DWORD milliseconds)
+{
+    DWORD result = WAIT_OBJECT_0;
+    if (PyMUTEX_LOCK(&mutex->cs))
+        return WAIT_FAILED;
+    if (milliseconds == INFINITE) {
+        while (mutex->locked && result == WAIT_OBJECT_0) {
+            if (PyCOND_WAIT(&mutex->cv, &mutex->cs))
+                result = WAIT_FAILED;
+        }
+    } else if (milliseconds != 0) {
+        /* Unsigned tick differences stay valid when GetTickCount() wraps. */
+        DWORD elapsed, total = milliseconds, start = GetTickCount();
+        while (mutex->locked) {
+            /* Cap each wait so its microsecond count fits in a long. */
+            long us = (long)(milliseconds < 2000000 ? milliseconds : 2000000) * 1000;
+            if (PyCOND_TIMEDWAIT(&mutex->cv, &mutex->cs, us) < 0) {
+                result = WAIT_FAILED;
+                break;
+            }
+            elapsed = GetTickCount() - start;
+            if (elapsed >= total)
+                break;
+            milliseconds = total - elapsed;
+        }
+    }
+    if (!mutex->locked) {
+        mutex->locked = 1;
+        result = WAIT_OBJECT_0;
+    } else if (result == WAIT_OBJECT_0)
+        result = WAIT_TIMEOUT;  /* otherwise it stays WAIT_FAILED */
+    PyMUTEX_UNLOCK(&mutex->cs); /* must ignore result here */
+    return result;
+}
+
+BOOL
+LeaveNonRecursiveMutex(PNRMUTEX mutex)
+{
+    BOOL result;
+    if (PyMUTEX_LOCK(&mutex->cs))
+        return FALSE;
+    mutex->locked = 0;
+    result = !PyCOND_SIGNAL(&mutex->cv);    /* condvar API: 0 == success, */
+    result &= !PyMUTEX_UNLOCK(&mutex->cs);  /* so negate to get TRUE on success */
+    return result;
+}
+
+#else /* if ! _PY_USE_CV_LOCKS */
+
+/* NR-locks based on a kernel mutex */
 #define PNRMUTEX HANDLE
 
 PNRMUTEX
@@ -35,6 +138,7 @@
 {
     return ReleaseSemaphore(mutex, 1, NULL);
 }
+#endif /* _PY_USE_CV_LOCKS */
 
 long PyThread_get_thread_ident(void);