perf: Optimize the perf_output() path by removing IRQ-disables

Since we can now assume there is only a single writer
to each buffer, we can remove per-cpu lock thingy and
use a simply nest-count to the same effect.

This removes the need to disable IRQs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0b521fc..f1f853a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -597,12 +597,12 @@
 	atomic_t			events;		/* event_id limit       */
 
 	atomic_long_t			head;		/* write position    */
-	atomic_long_t			done_head;	/* completed head    */
 
-	atomic_t			lock;		/* concurrent writes */
 	atomic_t			wakeup;		/* needs a wakeup    */
 	atomic_t			lost;		/* nr records lost   */
 
+	atomic_t			nest;		/* nested writers    */
+
 	long				watermark;	/* wakeup watermark  */
 
 	struct perf_event_mmap_page	*user_page;
@@ -807,7 +807,6 @@
 	unsigned long			offset;
 	int				nmi;
 	int				sample;
-	int				locked;
 };
 
 #ifdef CONFIG_PERF_EVENTS