Add command-line option --fair-sched=[no|yes|try]. Use --fair-sched=try
when running the annotate_hbefore regression test. Closes #270006.

To do: update manual.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@12280 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/configure.in b/configure.in
index 9a635c2..8c5bc13 100644
--- a/configure.in
+++ b/configure.in
@@ -1612,10 +1612,12 @@
 ]], [[
   return FUTEX_WAIT;
 ]])], [
+ac_have_usable_linux_futex_h=yes
 AC_DEFINE([HAVE_USABLE_LINUX_FUTEX_H], 1,
           [Define to 1 if you have a usable <linux/futex.h> header file.])
 AC_MSG_RESULT([yes])
 ], [
+ac_have_usable_linux_futex_h=no
 AC_MSG_RESULT([no])
 ])
 
@@ -1855,8 +1857,9 @@
 AM_CONDITIONAL([HAVE_OPENMP], [test x$ac_have_openmp = xyes])
 
 
-# does this compiler have built-in functions for atomic memory access ?
-AC_MSG_CHECKING([if gcc supports __sync_bool_compare_and_swap])
+# does this compiler have built-in functions for atomic memory access for the
+# primary target ?
+AC_MSG_CHECKING([if gcc supports __sync_add_and_fetch for the primary target])
 
 safe_CFLAGS=$CFLAGS
 CFLAGS="$mflag_primary"
@@ -1868,7 +1871,7 @@
 ]])], [
   ac_have_builtin_atomic=yes
   AC_MSG_RESULT([yes])
-  AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Define to 1 if gcc supports __sync_bool_compare_and_swap() and __sync_add_and_fetch()])
+  AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Define to 1 if gcc supports __sync_bool_compare_and_swap() and __sync_add_and_fetch() for the primary target])
 ], [
   ac_have_builtin_atomic=no
   AC_MSG_RESULT([no])
@@ -1878,8 +1881,38 @@
 
 AM_CONDITIONAL([HAVE_BUILTIN_ATOMIC], [test x$ac_have_builtin_atomic = xyes])
 
+
+# does this compiler have built-in functions for atomic memory access for the
+# secondary target ?
+
+if test x$VGCONF_PLATFORM_SEC_CAPS != x; then
+
+AC_MSG_CHECKING([if gcc supports __sync_add_and_fetch for the secondary target])
+
+safe_CFLAGS=$CFLAGS
+CFLAGS="$mflag_secondary"
+
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[
+  int variable = 1;
+  return (__sync_add_and_fetch(&variable, 1) ? 1 : 0)
+]])], [
+  ac_have_builtin_atomic_secondary=yes
+  AC_MSG_RESULT([yes])
+], [
+  ac_have_builtin_atomic_secondary=no
+  AC_MSG_RESULT([no])
+])
+
+CFLAGS=$safe_CFLAGS
+
+fi
+
+AM_CONDITIONAL([HAVE_BUILTIN_ATOMIC_SECONDARY],
+               [test x$ac_have_builtin_atomic_secondary = xyes])
+
+
 # does g++ have built-in functions for atomic memory access ?
-AC_MSG_CHECKING([if g++ supports __sync_bool_compare_and_swap])
+AC_MSG_CHECKING([if g++ supports __sync_add_and_fetch])
 
 safe_CXXFLAGS=$CXXFLAGS
 CXXFLAGS="$mflag_primary"
@@ -1903,6 +1936,23 @@
 
 AM_CONDITIONAL([HAVE_BUILTIN_ATOMIC_CXX], [test x$ac_have_builtin_atomic_cxx = xyes])
 
+
+if test x$ac_have_usable_linux_futex_h = xyes \
+   && test x$ac_have_builtin_atomic = xyes; then
+  ac_enable_linux_ticket_lock_primary=yes   # futex.h and gcc atomics both usable
+fi
+AM_CONDITIONAL([ENABLE_LINUX_TICKET_LOCK_PRIMARY],
+               [test x$ac_enable_linux_ticket_lock_primary = xyes])
+
+if test x$VGCONF_PLATFORM_SEC_CAPS != x \
+   && test x$ac_have_usable_linux_futex_h = xyes \
+   && test x$ac_have_builtin_atomic_secondary = xyes; then
+  ac_enable_linux_ticket_lock_secondary=yes   # secondary target can use it too
+fi
+AM_CONDITIONAL([ENABLE_LINUX_TICKET_LOCK_SECONDARY],
+               [test x$ac_enable_linux_ticket_lock_secondary = xyes])
+
+
 #----------------------------------------------------------------------------
 # Ok.  We're done checking.
 #----------------------------------------------------------------------------
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index 15aa4ed..6074ebc 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -214,6 +214,8 @@
 	m_initimg/priv_initimg_pathscan.h \
 	m_initimg/simple_huffman.c \
 	m_scheduler/priv_sema.h \
+	m_scheduler/priv_sched-lock.h \
+	m_scheduler/priv_sched-lock-impl.h \
 	m_syswrap/priv_types_n_macros.h \
 	m_syswrap/priv_syswrap-generic.h \
 	m_syswrap/priv_syswrap-linux.h \
@@ -327,6 +329,8 @@
 	m_replacemalloc/replacemalloc_core.c \
 	m_scheduler/scheduler.c \
 	m_scheduler/sema.c \
+	m_scheduler/sched-lock.c \
+	m_scheduler/sched-lock-generic.c \
 	m_sigframe/sigframe-x86-linux.c \
 	m_sigframe/sigframe-amd64-linux.c \
 	m_sigframe/sigframe-ppc32-linux.c \
@@ -371,6 +375,13 @@
     $(AM_CFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
 libcoregrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_a_CCASFLAGS = \
     $(AM_CCASFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+if ENABLE_LINUX_TICKET_LOCK_PRIMARY
+libcoregrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_a_SOURCES += \
+    m_scheduler/ticket-lock-linux.c
+libcoregrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_a_CFLAGS += \
+    -DENABLE_LINUX_TICKET_LOCK
+endif
+
 if VGCONF_HAVE_PLATFORM_SEC
 libcoregrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_a_SOURCES = \
     $(COREGRIND_SOURCES_COMMON)
@@ -382,6 +393,12 @@
     $(AM_CFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
 libcoregrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_a_CCASFLAGS = \
     $(AM_CCASFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+if ENABLE_LINUX_TICKET_LOCK_SECONDARY
+libcoregrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_a_SOURCES += \
+    m_scheduler/ticket-lock-linux.c
+libcoregrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_a_CFLAGS += \
+    -DENABLE_LINUX_TICKET_LOCK
+endif
 endif
 
 #----------------------------------------------------------------------------
diff --git a/coregrind/m_main.c b/coregrind/m_main.c
index d93888d..bb0b096 100644
--- a/coregrind/m_main.c
+++ b/coregrind/m_main.c
@@ -180,6 +180,7 @@
 "    --run-libc-freeres=no|yes free up glibc memory at exit on Linux? [yes]\n"
 "    --sim-hints=hint1,hint2,...  known hints:\n"
 "                                 lax-ioctls, enable-outer, fuse-compatible [none]\n"
+"    --fair-sched=no|yes|try   schedule threads fairly on multicore systems [no]\n"
 "    --kernel-variant=variant1,variant2,...  known variants: bproc [none]\n"
 "                              handle non-standard kernel variants\n"
 "    --show-emwarns=no|yes     show warnings about emulation limits? [no]\n"
@@ -486,6 +487,17 @@
       else if VG_BOOL_CLO(arg, "--trace-children",   VG_(clo_trace_children)) {}
       else if VG_BOOL_CLO(arg, "--child-silent-after-fork",
                             VG_(clo_child_silent_after_fork)) {}
+      else if VG_STR_CLO(arg, "--fair-sched",        tmp_str) {
+         if (VG_(strcmp)(tmp_str, "yes") == 0)
+            VG_(clo_fair_sched) = enable_fair_sched;
+         else if (VG_(strcmp)(tmp_str, "try") == 0)
+            VG_(clo_fair_sched) = try_fair_sched;
+         else if (VG_(strcmp)(tmp_str, "no") == 0)
+            VG_(clo_fair_sched) = disable_fair_sched;
+         else
+            VG_(fmsg_bad_option)(arg, "");
+
+      }
       else if VG_BOOL_CLO(arg, "--trace-sched",      VG_(clo_trace_sched)) {}
       else if VG_BOOL_CLO(arg, "--trace-signals",    VG_(clo_trace_signals)) {}
       else if VG_BOOL_CLO(arg, "--trace-symtab",     VG_(clo_trace_symtab)) {}
diff --git a/coregrind/m_options.c b/coregrind/m_options.c
index 9b89fe9..df00193 100644
--- a/coregrind/m_options.c
+++ b/coregrind/m_options.c
@@ -90,6 +90,8 @@
 Bool   VG_(clo_debug_dump_line) = False;
 Bool   VG_(clo_debug_dump_frames) = False;
 Bool   VG_(clo_trace_redir)    = False;
+enum FairSchedType
+       VG_(clo_fair_sched)     = disable_fair_sched;
 Bool   VG_(clo_trace_sched)    = False;
 Bool   VG_(clo_profile_heap)   = False;
 Int    VG_(clo_dump_error)     = 0;
diff --git a/coregrind/m_scheduler/priv_sched-lock-impl.h b/coregrind/m_scheduler/priv_sched-lock-impl.h
new file mode 100644
index 0000000..2eb1b89
--- /dev/null
+++ b/coregrind/m_scheduler/priv_sched-lock-impl.h
@@ -0,0 +1,51 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Private scheduler lock header.        priv_sched-lock-impl.h ---*/
+/*---                                                              ---*/
+/*--- Scheduler lock implementation details.                       ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2011 Bart Van Assche <bvanassche@acm.org>.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#ifndef __PRIV_SCHED_LOCK_IMPL_H
+#define __PRIV_SCHED_LOCK_IMPL_H
+
+struct sched_lock_ops {   // operations a scheduler lock implementation provides
+   const Char *(*get_sched_lock_name)(void);          // name of the implementation
+   struct sched_lock *(*create_sched_lock)(void);     // allocate and initialize a lock
+   void (*destroy_sched_lock)(struct sched_lock *p);  // tear down and free a lock
+   int (*get_sched_lock_owner)(struct sched_lock *p); // id recorded for the holder
+   void (*acquire_sched_lock)(struct sched_lock *p);  // block until the lock is held
+   void (*release_sched_lock)(struct sched_lock *p);  // give the lock up
+};
+
+extern const struct sched_lock_ops ML_(generic_sched_lock_ops);
+extern const struct sched_lock_ops ML_(linux_ticket_lock_ops);
+
+#endif   // __PRIV_SCHED_LOCK_IMPL_H
+
+/*--------------------------------------------------------------------*/
+/*--- end                                                          ---*/
+/*--------------------------------------------------------------------*/
diff --git a/coregrind/m_scheduler/priv_sched-lock.h b/coregrind/m_scheduler/priv_sched-lock.h
new file mode 100644
index 0000000..c411735
--- /dev/null
+++ b/coregrind/m_scheduler/priv_sched-lock.h
@@ -0,0 +1,51 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Private scheduler lock header.             priv_sched-lock.h ---*/
+/*---                                                              ---*/
+/*--- Scheduler lock API.                                          ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2011 Bart Van Assche <bvanassche@acm.org>.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#ifndef __PRIV_SCHED_LOCK_H
+#define __PRIV_SCHED_LOCK_H
+
+struct sched_lock;
+
+enum SchedLockType { sched_lock_generic, sched_lock_ticket };
+
+Bool ML_(set_sched_lock_impl)(const enum SchedLockType t);
+const Char *ML_(get_sched_lock_name)(void);
+struct sched_lock *ML_(create_sched_lock)(void);
+void ML_(destroy_sched_lock)(struct sched_lock *p);
+int ML_(get_sched_lock_owner)(struct sched_lock *p);
+void ML_(acquire_sched_lock)(struct sched_lock *p);
+void ML_(release_sched_lock)(struct sched_lock *p);
+
+#endif   // __PRIV_SCHED_LOCK_H
+
+/*--------------------------------------------------------------------*/
+/*--- end                                                          ---*/
+/*--------------------------------------------------------------------*/
diff --git a/coregrind/m_scheduler/sched-lock-generic.c b/coregrind/m_scheduler/sched-lock-generic.c
new file mode 100644
index 0000000..be01fff
--- /dev/null
+++ b/coregrind/m_scheduler/sched-lock-generic.c
@@ -0,0 +1,87 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Generic scheduler lock implementation   sched-lock-generic.c ---*/
+/*---                                                              ---*/
+/*--- This implementation does not guarantee fair scheduling on    ---*/
+/*--- multicore systems but is sufficient to make the Valgrind     ---*/
+/*--- scheduler work reasonably.                                   ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2011 Bart Van Assche <bvanassche@acm.org>.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "pub_core_basics.h"
+#include "pub_tool_mallocfree.h"
+#include "priv_sema.h"
+#include "priv_sched-lock.h"
+#include "priv_sched-lock-impl.h"
+
+struct sched_lock {
+   vg_sema_t sema;
+};
+
+static const Char *get_sched_lock_name(void)
+{
+   return "generic";
+}
+
+static struct sched_lock *create_sched_lock(void)
+{
+   struct sched_lock *lock = VG_(malloc)("sched_lock", sizeof(*lock));
+
+   /* Initialize the embedded semaphore unless the allocation came back NULL. */
+   if (lock != NULL)
+      ML_(sema_init)(&lock->sema);
+   return lock;
+}
+
+static void destroy_sched_lock(struct sched_lock *p)
+{
+   ML_(sema_deinit)(&p->sema);
+   VG_(free)(p);
+}
+
+static int get_sched_lock_owner(struct sched_lock *p)
+{
+   return p->sema.owner_lwpid;
+}
+
+static void acquire_sched_lock(struct sched_lock *p)
+{
+   ML_(sema_down)(&p->sema, False);
+}
+
+static void release_sched_lock(struct sched_lock *p)
+{
+   ML_(sema_up)(&p->sema, False);
+}
+
+const struct sched_lock_ops ML_(generic_sched_lock_ops) = {
+   .get_sched_lock_name  = get_sched_lock_name,
+   .create_sched_lock    = create_sched_lock,
+   .destroy_sched_lock   = destroy_sched_lock,
+   .get_sched_lock_owner = get_sched_lock_owner,
+   .acquire_sched_lock   = acquire_sched_lock,
+   .release_sched_lock   = release_sched_lock,
+};
diff --git a/coregrind/m_scheduler/sched-lock.c b/coregrind/m_scheduler/sched-lock.c
new file mode 100644
index 0000000..f6fdb4d
--- /dev/null
+++ b/coregrind/m_scheduler/sched-lock.c
@@ -0,0 +1,96 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Scheduler lock support functions                sched-lock.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2011 Bart Van Assche <bvanassche@acm.org>.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "config.h"
+#include "pub_core_basics.h"
+#include "pub_tool_libcbase.h"
+#include "pub_tool_mallocfree.h"
+#include "priv_sema.h"
+#include "priv_sched-lock.h"
+#include "priv_sched-lock-impl.h"
+
+static struct sched_lock_ops const *sched_lock_ops =
+   &ML_(generic_sched_lock_ops);
+
+static struct sched_lock_ops const *const sched_lock_impl[] = {
+   [sched_lock_generic] = &ML_(generic_sched_lock_ops),
+#ifdef ENABLE_LINUX_TICKET_LOCK
+   [sched_lock_ticket]  = &ML_(linux_ticket_lock_ops),
+#endif
+};
+
+/**
+ * Define which scheduler lock implementation to use.
+ *
+ * @param[in] t Scheduler lock type.
+ *
+ * @return True if and only if this function succeeded.
+ *
+ * @note Must be called before any other sched_lock*() function is invoked.
+ */
+Bool ML_(set_sched_lock_impl)(const enum SchedLockType t)
+{
+   const unsigned count = sizeof(sched_lock_impl) / sizeof(sched_lock_impl[0]);
+   struct sched_lock_ops const *ops;
+   /* NULL when t is out of range or its slot in the table is empty. */
+   ops = (unsigned)t < count ? sched_lock_impl[t] : NULL;
+   if (ops != NULL)
+      sched_lock_ops = ops;
+   return ops != NULL;
+}
+
+const Char *ML_(get_sched_lock_name)(void)
+{
+   return (sched_lock_ops->get_sched_lock_name)();
+}
+
+struct sched_lock *ML_(create_sched_lock)(void)
+{
+   return (sched_lock_ops->create_sched_lock)();
+}
+
+void ML_(destroy_sched_lock)(struct sched_lock *p)
+{
+   (sched_lock_ops->destroy_sched_lock)(p);   /* forward to the selected impl */
+}
+
+int ML_(get_sched_lock_owner)(struct sched_lock *p)
+{
+   return (sched_lock_ops->get_sched_lock_owner)(p);
+}
+
+void ML_(acquire_sched_lock)(struct sched_lock *p)
+{
+   (sched_lock_ops->acquire_sched_lock)(p);   /* forward to the selected impl */
+}
+
+void ML_(release_sched_lock)(struct sched_lock *p)
+{
+   (sched_lock_ops->release_sched_lock)(p);   /* forward to the selected impl */
+}
diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c
index 995f44c..4548db0 100644
--- a/coregrind/m_scheduler/scheduler.c
+++ b/coregrind/m_scheduler/scheduler.c
@@ -89,7 +89,7 @@
 #include "pub_core_translate.h"     // For VG_(translate)()
 #include "pub_core_transtab.h"
 #include "pub_core_debuginfo.h"     // VG_(di_notify_pdb_debuginfo)
-#include "priv_sema.h"
+#include "priv_sched-lock.h"
 #include "pub_core_scheduler.h"     // self
 #include "pub_core_redir.h"
 
@@ -146,8 +146,10 @@
                 sanity_fast_count, sanity_slow_count );
 }
 
-/* CPU semaphore, so that threads can run exclusively */
-static vg_sema_t the_BigLock;
+/*
+ * Mutual exclusion object used to serialize threads.
+ */
+static struct sched_lock *the_BigLock;
 
 
 /* ---------------------------------------------------------------------
@@ -241,7 +243,7 @@
    /* First, acquire the_BigLock.  We can't do anything else safely
       prior to this point.  Even doing debug printing prior to this
       point is, technically, wrong. */
-   ML_(sema_down)(&the_BigLock, False/*not LL*/);
+   VG_(acquire_BigLock_LL)(NULL);
 
    tst = VG_(get_ThreadState)(tid);
 
@@ -297,19 +299,31 @@
 
    /* Release the_BigLock; this will reschedule any runnable
       thread. */
-   ML_(sema_up)(&the_BigLock, False/*not LL*/);
+   VG_(release_BigLock_LL)(NULL);
+}
+
+static void init_BigLock(void)
+{
+   vg_assert(!the_BigLock);
+   the_BigLock = ML_(create_sched_lock)();
+}
+
+static void deinit_BigLock(void)
+{
+   ML_(destroy_sched_lock)(the_BigLock);
+   the_BigLock = NULL;
 }
 
 /* See pub_core_scheduler.h for description */
 void VG_(acquire_BigLock_LL) ( HChar* who )
 {
-  ML_(sema_down)(&the_BigLock, True/*LL*/);
+   ML_(acquire_sched_lock)(the_BigLock);
 }
 
 /* See pub_core_scheduler.h for description */
 void VG_(release_BigLock_LL) ( HChar* who )
 {
-   ML_(sema_up)(&the_BigLock, True/*LL*/);
+   ML_(release_sched_lock)(the_BigLock);
 }
 
 
@@ -331,7 +345,7 @@
    if (VG_(clo_trace_sched))
       print_sched_event(tid, "release lock in VG_(exit_thread)");
 
-   ML_(sema_up)(&the_BigLock, False/*not LL*/);
+   VG_(release_BigLock_LL)(NULL);
 }
 
 /* If 'tid' is blocked in a syscall, send it SIGVGKILL so as to get it
@@ -518,9 +532,9 @@
    }
 
    /* re-init and take the sema */
-   ML_(sema_deinit)(&the_BigLock);
-   ML_(sema_init)(&the_BigLock);
-   ML_(sema_down)(&the_BigLock, False/*not LL*/);
+   deinit_BigLock();
+   init_BigLock();
+   VG_(acquire_BigLock_LL)(NULL);
 }
 
 
@@ -535,7 +549,21 @@
 
    VG_(debugLog)(1,"sched","sched_init_phase1\n");
 
-   ML_(sema_init)(&the_BigLock);
+   if (VG_(clo_fair_sched) != disable_fair_sched
+       && !ML_(set_sched_lock_impl)(sched_lock_ticket)
+       && VG_(clo_fair_sched) == enable_fair_sched)
+   {
+      VG_(printf)("Error: fair scheduling is not supported on this system.\n");
+      VG_(exit)(1);
+   }
+
+   if (VG_(clo_verbosity) > 1) {
+      VG_(message)(Vg_DebugMsg,
+                   "Scheduler: using %s scheduler lock implementation.\n",
+                   ML_(get_sched_lock_name)());
+   }
+
+   init_BigLock();
 
    for (i = 0 /* NB; not 1 */; i < VG_N_THREADS; i++) {
       /* Paranoia .. completely zero it out. */
@@ -1758,15 +1786,12 @@
       bad = True;
    }
 
-#if !defined(VGO_darwin)
-   // GrP fixme
-   if (lwpid != the_BigLock.owner_lwpid) {
+   if (lwpid != ML_(get_sched_lock_owner)(the_BigLock)) {
       VG_(message)(Vg_DebugMsg,
                    "Thread (LWPID) %d doesn't own the_BigLock\n",
                    tid);
       bad = True;
    }
-#endif
 
    /* Periodically show the state of all threads, for debugging
       purposes. */
diff --git a/coregrind/m_scheduler/ticket-lock-linux.c b/coregrind/m_scheduler/ticket-lock-linux.c
new file mode 100644
index 0000000..64b5d4d
--- /dev/null
+++ b/coregrind/m_scheduler/ticket-lock-linux.c
@@ -0,0 +1,185 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Linux ticket lock implementation         ticket-lock-linux.c ---*/
+/*---                                                              ---*/
+/*--- Guarantees fair scheduling even if multiple threads are      ---*/
+/*--- runnable at the same time on a multicore system. Has been    ---*/
+/*--- observed to cause a slow-down compared to the generic        ---*/
+/*--- scheduler lock with CPU frequency scaling enabled. Makes     ---*/
+/*--- Valgrind slightly faster if CPU frequency scaling has been   ---*/
+/*--- disabled. See also http://bugs.kde.org/show_bug.cgi?id=270006---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2011 Bart Van Assche <bvanassche@acm.org>.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "pub_core_basics.h"
+#include "pub_core_libcassert.h"
+#include "pub_core_libcbase.h"     // VG_(memset)()
+#include "pub_core_libcprint.h"
+#include "pub_core_syscall.h"
+#include "pub_core_vki.h"
+#include "pub_core_vkiscnums.h"    // __NR_futex
+#include "pub_tool_libcproc.h"
+#include "pub_tool_mallocfree.h"
+#include "pub_tool_threadstate.h"
+#include "priv_sched-lock.h"
+#include "priv_sched-lock-impl.h"
+
+#define TL_FUTEX_COUNT_LOG2 4
+#define TL_FUTEX_COUNT (1U << TL_FUTEX_COUNT_LOG2)
+#define TL_FUTEX_MASK (TL_FUTEX_COUNT - 1)
+
+struct sched_lock {
+   volatile unsigned head;   // ticket that may hold the lock; bumped on release
+   volatile unsigned tail;   // next ticket to hand out; bumped on acquire
+   volatile unsigned futex[TL_FUTEX_COUNT];   // wait words, slot = ticket & TL_FUTEX_MASK
+   int owner;                // VG_(gettid)() of the holder, 0 while not held
+};
+
+#if 1
+static Bool s_debug;
+#else
+static Bool s_debug = True;
+#endif
+
+static const Char *get_sched_lock_name(void)
+{
+   return "ticket lock";
+}
+
+static struct sched_lock *create_sched_lock(void)
+{
+   struct sched_lock *lock = VG_(malloc)("sched_lock", sizeof(*lock));
+
+   if (!lock)
+      return lock;
+
+   // The futex syscall requires that a futex takes four bytes.
+   vg_assert(sizeof(lock->futex[0]) == 4);
+   // Start with an empty queue, all futex words zero and no owner.
+   lock->head = 0;
+   lock->tail = 0;
+   VG_(memset)((void*)lock->futex, 0, sizeof(lock->futex));
+   lock->owner = 0;
+   return lock;
+}
+
+static void destroy_sched_lock(struct sched_lock *p)
+{
+   VG_(free)(p);
+}
+
+static int get_sched_lock_owner(struct sched_lock *p)
+{
+   return p->owner;
+}
+
+/*
+ * Acquire ticket lock. Take the old queue tail as ticket (incrementing the
+ * tail) and wait until the queue head equals that ticket. The futex waited
+ * on depends on the ticket value so that a release does not have to wake up
+ * all waiting threads ("thundering herd"). The barrier makes sure the futex
+ * word is read before head; otherwise a wakeup could be missed.
+ *
+ * See also Nick Piggin, x86: FIFO ticket spinlocks, Linux kernel mailing list
+ * (http://lkml.org/lkml/2007/11/1/125) for more info.
+ */
+static void acquire_sched_lock(struct sched_lock *p)
+{
+   unsigned ticket, futex_value;
+   volatile unsigned *futex;
+   SysRes sres;
+
+   ticket = __sync_fetch_and_add(&p->tail, 1);
+   futex = &p->futex[ticket & TL_FUTEX_MASK];
+   if (s_debug)
+      VG_(printf)("[%d/%d] acquire: ticket %d\n", VG_(getpid)(),
+                  VG_(gettid)(), ticket);
+   for (;;) {
+      futex_value = *futex;
+      __sync_synchronize();   // order the futex load before the head load
+      if (ticket == p->head)
+         break;
+      if (s_debug)
+         VG_(printf)("[%d/%d] acquire: ticket %d - waiting until"
+                     " futex[%ld] != %d\n", VG_(getpid)(),
+                     VG_(gettid)(), ticket, (long)(futex - p->futex),
+                     futex_value);
+      sres = VG_(do_syscall3)(__NR_futex, (UWord)futex,
+                              VKI_FUTEX_WAIT | VKI_FUTEX_PRIVATE_FLAG,
+                              futex_value);
+      if (sr_isError(sres) && sres._val != VKI_EAGAIN) {
+         VG_(printf)("futex_wait() returned error code %ld\n", sres._val);
+         vg_assert(False);
+      }
+   }
+   vg_assert(p->owner == 0);
+   p->owner = VG_(gettid)();
+}
+
+/*
+ * Release a ticket lock by incrementing the head of the queue. Only generate
+ * a thread wakeup signal if at least one thread is waiting. If the queue tail
+ * matches the wakeup_ticket value, no threads have to be woken up.
+ *
+ * Note: tail will only be read after head has been incremented since both are
+ * declared as volatile and since the __sync...() functions imply a memory
+ * barrier.
+ */
+static void release_sched_lock(struct sched_lock *p)
+{
+   unsigned wakeup_ticket, futex_value;
+   volatile unsigned *futex;
+   SysRes sres;
+
+   vg_assert(p->owner != 0);
+   p->owner = 0;   // clear ownership before the next ticket is let through
+   wakeup_ticket = __sync_fetch_and_add(&p->head, 1) + 1;   // the new head
+   if (p->tail != wakeup_ticket) {
+      futex = &p->futex[wakeup_ticket & TL_FUTEX_MASK];
+      futex_value = __sync_fetch_and_add(futex, 1);   // old value, only printed
+      if (s_debug)
+         VG_(printf)("[%d/%d] release: waking up ticket %d (futex[%ld] = %d)"
+                     "\n", VG_(getpid)(), VG_(gettid)(), wakeup_ticket,
+                     (long)(futex - p->futex), futex_value);
+      sres = VG_(do_syscall3)(__NR_futex, (UWord)futex,
+                              VKI_FUTEX_WAKE | VKI_FUTEX_PRIVATE_FLAG,
+                              0x7fffffff);   // wake every waiter on this word
+      vg_assert(!sr_isError(sres));
+   } else {
+      if (s_debug)
+         VG_(printf)("[%d/%d] release: no thread is waiting for ticket %d\n",
+                     VG_(getpid)(), VG_(gettid)(), wakeup_ticket);
+   }
+}
+
+const struct sched_lock_ops ML_(linux_ticket_lock_ops) = {
+   .get_sched_lock_name  = get_sched_lock_name,
+   .create_sched_lock    = create_sched_lock,
+   .destroy_sched_lock   = destroy_sched_lock,
+   .get_sched_lock_owner = get_sched_lock_owner,
+   .acquire_sched_lock   = acquire_sched_lock,
+   .release_sched_lock   = release_sched_lock,
+};
diff --git a/coregrind/pub_core_options.h b/coregrind/pub_core_options.h
index 970424b..5559984 100644
--- a/coregrind/pub_core_options.h
+++ b/coregrind/pub_core_options.h
@@ -146,6 +146,9 @@
 extern Bool  VG_(clo_debug_dump_frames);
 /* DEBUG: print redirection details?  default: NO */
 extern Bool  VG_(clo_trace_redir);
+/* Enable fair scheduling on multicore systems? default: NO */
+enum FairSchedType { disable_fair_sched, enable_fair_sched, try_fair_sched };
+extern enum FairSchedType VG_(clo_fair_sched);
 /* DEBUG: print thread scheduling events?  default: NO */
 extern Bool  VG_(clo_trace_sched);
 /* DEBUG: do heap profiling?  default: NO */
diff --git a/drd/tests/annotate_hbefore.vgtest b/drd/tests/annotate_hbefore.vgtest
index d898915..e01bcde 100644
--- a/drd/tests/annotate_hbefore.vgtest
+++ b/drd/tests/annotate_hbefore.vgtest
@@ -1,4 +1,4 @@
 prereq: test -e ../../helgrind/tests/annotate_hbefore && ./supported_libpthread
-vgopts: --read-var-info=yes --check-stack-var=yes --show-confl-seg=no
+vgopts: --fair-sched=try --read-var-info=yes --check-stack-var=yes --show-confl-seg=no
 prog: ../../helgrind/tests/annotate_hbefore
 stderr_filter: filter_stderr
diff --git a/helgrind/tests/annotate_hbefore.vgtest b/helgrind/tests/annotate_hbefore.vgtest
index 1e37939..2142f0a 100644
--- a/helgrind/tests/annotate_hbefore.vgtest
+++ b/helgrind/tests/annotate_hbefore.vgtest
@@ -1,2 +1,2 @@
-vgopts: -q
+vgopts: -q --fair-sched=try
 prog: annotate_hbefore
diff --git a/none/tests/cmdline1.stdout.exp b/none/tests/cmdline1.stdout.exp
index 35ca443..369e999 100644
--- a/none/tests/cmdline1.stdout.exp
+++ b/none/tests/cmdline1.stdout.exp
@@ -73,6 +73,7 @@
     --run-libc-freeres=no|yes free up glibc memory at exit on Linux? [yes]
     --sim-hints=hint1,hint2,...  known hints:
                                  lax-ioctls, enable-outer, fuse-compatible [none]
+    --fair-sched=no|yes|try   schedule threads fairly on multicore systems [no]
     --kernel-variant=variant1,variant2,...  known variants: bproc [none]
                               handle non-standard kernel variants
     --show-emwarns=no|yes     show warnings about emulation limits? [no]
diff --git a/none/tests/cmdline2.stdout.exp b/none/tests/cmdline2.stdout.exp
index 01a3801..5cf446c 100644
--- a/none/tests/cmdline2.stdout.exp
+++ b/none/tests/cmdline2.stdout.exp
@@ -73,6 +73,7 @@
     --run-libc-freeres=no|yes free up glibc memory at exit on Linux? [yes]
     --sim-hints=hint1,hint2,...  known hints:
                                  lax-ioctls, enable-outer, fuse-compatible [none]
+    --fair-sched=no|yes|try   schedule threads fairly on multicore systems [no]
     --kernel-variant=variant1,variant2,...  known variants: bproc [none]
                               handle non-standard kernel variants
     --show-emwarns=no|yes     show warnings about emulation limits? [no]