merge from open-source master

Change-Id: Iff982a4bec118c01718056fff079460d365968c3
diff --git a/Android.mk b/Android.mk
index 582ddc9..5053e7d 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,3 +1 @@
-ifeq ($(TARGET_ARCH),arm)
 include $(call all-subdir-makefiles)
-endif
diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
index 35080ad..b1e3b4c 100644
--- a/libop/op_hw_specific.h
+++ b/libop/op_hw_specific.h
@@ -20,7 +20,8 @@
 		char v[12];
 	} v;
 	unsigned eax;
-	asm("cpuid" : "=a" (eax), "=b" (v.b), "=c" (v.c), "=d" (v.d) : "0" (0));
+	asm volatile(	"pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx"
+			: "=a" (eax), "=S" (v.b), "=c" (v.c), "=d" (v.d) : "0" (0));
 	return !strncmp(v.v, vnd, 12);
 }
 
@@ -46,7 +47,8 @@
 
 	if (!cpuid_vendor("GenuineIntel"))
 		return;
-	asm("cpuid" : "=a" (v.eax) : "0" (1) : "ecx","ebx","edx");
+	asm volatile(	"pushl %%ebx; cpuid; popl %%ebx" /* cpuid overwrites ebx: save/restore it; result stays in eax */
+			: "=a" (v.eax) : "0" (1) : "ecx","edx");
 	model = (v.ext_model << 4) + v.model;
 	if (v.family != 6 || model != 26 || v.stepping > 4)
 		return;
@@ -57,7 +59,8 @@
 {
 	if (cpu_type == CPU_ARCH_PERFMON) { 
 		unsigned ebx, eax;
-		asm("cpuid" : "=a" (eax), "=b" (ebx) : "0" (0xa) : "ecx","edx");
+		asm volatile(	"pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx"
+				: "=a" (eax), "=S" (ebx) : "0" (0xa) : "ecx","edx");
 		workaround_nehalem_aaj79(&ebx);
 		return ebx & num_to_mask(eax >> 24);
 	}
@@ -68,7 +71,8 @@
 {
 	if (cpu_type == CPU_ARCH_PERFMON) {
 		unsigned v;
-		asm("cpuid" : "=a" (v) : "0" (0xa) : "ebx","ecx","edx");
+		asm volatile(	"pushl %%ebx; cpuid; movl %%eax, %1; popl %%ebx"
+				: "=a" (v) : "0" (0xa) : "ecx","edx");
 		return (v >> 8) & 0xff;
 	} 
 	return -1;
@@ -77,7 +81,8 @@
 static inline unsigned arch_get_counter_mask(void)
 {
 	unsigned v;
-	asm("cpuid" : "=a" (v) : "0" (0xa) : "ebx","ecx","edx");
+	asm volatile(	"pushl %%ebx; cpuid; popl %%ebx" /* cpuid overwrites ebx: save/restore it; result stays in eax */
+			: "=a" (v) : "0" (0xa) : "ecx","edx");
 	return num_to_mask((v >> 8) & 0xff);	
 }
 
diff --git a/opcontrol/opcontrol.cpp b/opcontrol/opcontrol.cpp
index 5dffe24..0552c8c 100644
--- a/opcontrol/opcontrol.cpp
+++ b/opcontrol/opcontrol.cpp
@@ -23,6 +23,7 @@
 #include <getopt.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
@@ -30,17 +31,16 @@
 
 #include "op_config.h"
 
-#if 0
-#define verbose(fmt...) printf(fmt)
-#else
-#define verbose(fmt...)
-#endif
+#define verbose(fmt...) do { if (verbose_print) printf(fmt); } while (0) /* single statement: safe in unbraced if/else */
 
 /* Experiments found that using a small interval may hang the device, and the
  * more events tracked simultaneously, the longer the interval has to be.
  */
 
-#if !defined(WITH_ARM_V7_A)
+#if defined(__i386__) || defined(__x86_64__)
+#define MAX_EVENTS 2
+int min_count[MAX_EVENTS] = {60000, 100000};
+#elif !defined(WITH_ARM_V7_A)
 #define MAX_EVENTS 3
 int min_count[MAX_EVENTS] = {150000, 200000, 250000};
 #else
@@ -48,6 +48,7 @@
 int min_count[MAX_EVENTS] = {150000, 20000, 25000, 30000, 35000};
 #endif
 
+int verbose_print;
 int list_events; 
 int show_usage;
 int setup;
@@ -60,6 +61,7 @@
 int selected_events[MAX_EVENTS];
 int selected_counts[MAX_EVENTS];
 
+char callgraph[8];
 char kernel_range[512];
 char vmlinux[512];
 
@@ -69,166 +71,215 @@
     {"reset", 0, &reset, 1},
     {"setup", 0, &setup, 1},
     {"quick", 0, &quick, 1},
+    {"callgraph", 1, 0, 'c'},
     {"event", 1, 0, 'e'},
     {"vmlinux", 1, 0, 'v'},
     {"kernel-range", 1, 0, 'r'},
     {"start", 0, &start, 1},
     {"stop", 0, &stop, 1},
+    {"dump", 0, 0, 'd'},
     {"shutdown", 0, 0, 'h'},
     {"status", 0, 0, 't'},
+    {"verbose", 0, 0, 'V'},
     {0, 0, 0, 0},
 };
 
 struct event_info {
     int id;
+    int um;
     const char *name;
     const char *explanation;
 } event_info[] = {
-#if !defined(WITH_ARM_V7_A)
+#if defined(__i386__) || defined(__x86_64__)
+    /* INTEL_ARCH_PERFMON events */
+
+    /* 0x3c counters:cpuid um:zero minimum:6000 filter:0 name:CPU_CLK_UNHALTED :
+     * Clock cycles when not halted
+     */
+    {0x3c, 0, "CPU_CLK_UNHALTED",
+     "Clock cycles when not halted" },
+
+    /* event:0x3c counters:cpuid um:one minimum:6000 filter:2 name:UNHALTED_REFERENCE_CYCLES :
+     * Unhalted reference cycles
+     */
+    {0x3c, 1, "UNHALTED_REFERENCE_CYCLES",
+      "Unhalted reference cycles" },
+
+    /* event:0xc0 counters:cpuid um:zero minimum:6000 filter:1 name:INST_RETIRED :
+     * number of instructions retired
+     */
+     {0xc0, 0, "INST_RETIRED",
+       "number of instructions retired"},
+
+    /* event:0x2e counters:cpuid um:x41 minimum:6000 filter:5 name:LLC_MISSES :
+     * Last level cache demand requests from this core that missed the LLC
+     */
+     {0x2e, 0x41, "LLC_MISSES",
+       "Last level cache demand requests from this core that missed the LLC"},
+
+    /* event:0x2e counters:cpuid um:x4f minimum:6000 filter:4 name:LLC_REFS :
+     * Last level cache demand requests from this core
+     */
+     {0x2e, 0x4f, "LLC_REFS",
+      "Last level cache demand requests from this core"},
+
+    /* event:0xc4 counters:cpuid um:zero minimum:500 filter:6 name:BR_INST_RETIRED :
+     * number of branch instructions retired
+     */
+     {0xc4, 0, "BR_INST_RETIRED",
+       "number of branch instructions retired"},
+
+    /* event:0xc5 counters:cpuid um:zero minimum:500 filter:7 name:BR_MISS_PRED_RETIRED :
+     * number of mispredicted branches retired (precise)
+     */
+     {0xc5, 0, "BR_MISS_PRED_RETIRED",
+       "number of mispredicted branches retired (precise)"},
+
+#elif !defined(WITH_ARM_V7_A)
     /* ARM V6 events */
-    {0x00, "IFU_IFETCH_MISS", 
+    {0x00, 0, "IFU_IFETCH_MISS", 
      "number of instruction fetch misses"},
-    {0x01, "CYCLES_IFU_MEM_STALL", 
+    {0x01, 0, "CYCLES_IFU_MEM_STALL", 
      "cycles instruction fetch pipe is stalled"},
-    {0x02, "CYCLES_DATA_STALL", 
+    {0x02, 0, "CYCLES_DATA_STALL", 
      "cycles stall occurs for due to data dependency"},
-    {0x03, "ITLB_MISS", 
+    {0x03, 0, "ITLB_MISS", 
      "number of Instruction MicroTLB misses"},
-    {0x04, "DTLB_MISS", 
+    {0x04, 0, "DTLB_MISS", 
      "number of Data MicroTLB misses"},
-    {0x05, "BR_INST_EXECUTED", 
+    {0x05, 0, "BR_INST_EXECUTED", 
      "branch instruction executed w/ or w/o program flow change"},
-    {0x06, "BR_INST_MISS_PRED", 
+    {0x06, 0, "BR_INST_MISS_PRED", 
      "branch mispredicted"},
-    {0x07, "INSN_EXECUTED", 
+    {0x07, 0, "INSN_EXECUTED", 
      "instructions executed"},
-    {0x09, "DCACHE_ACCESS", 
+    {0x09, 0, "DCACHE_ACCESS", 
      "data cache access, cacheable locations"},
-    {0x0a, "DCACHE_ACCESS_ALL", 
+    {0x0a, 0, "DCACHE_ACCESS_ALL", 
      "data cache access, all locations"},
-    {0x0b, "DCACHE_MISS", 
+    {0x0b, 0, "DCACHE_MISS", 
      "data cache miss"},
-    {0x0c, "DCACHE_WB", 
+    {0x0c, 0, "DCACHE_WB", 
      "data cache writeback, 1 event for every half cacheline"},
-    {0x0d, "PC_CHANGE", 
+    {0x0d, 0, "PC_CHANGE", 
      "number of times the program counter was changed without a mode switch"},
-    {0x0f, "TLB_MISS", 
+    {0x0f, 0, "TLB_MISS", 
      "Main TLB miss"},
-    {0x10, "EXP_EXTERNAL", 
+    {0x10, 0, "EXP_EXTERNAL", 
      "Explicit external data access"},
-    {0x11, "LSU_STALL", 
+    {0x11, 0, "LSU_STALL", 
      "cycles stalled because Load Store request queue is full"},
-    {0x12, "WRITE_DRAIN", 
+    {0x12, 0, "WRITE_DRAIN", 
      "Times write buffer was drained"},
-    {0xff, "CPU_CYCLES", 
+    {0xff, 0, "CPU_CYCLES", 
      "clock cycles counter"}, 
 #else
     /* ARM V7 events */
-    {0x00, "PMNC_SW_INCR",
+    {0x00, 0, "PMNC_SW_INCR",
      "Software increment of PMNC registers"},
-    {0x01, "IFETCH_MISS",
+    {0x01, 0, "IFETCH_MISS",
      "Instruction fetch misses from cache or normal cacheable memory"},
-    {0x02, "ITLB_MISS",
+    {0x02, 0, "ITLB_MISS",
      "Instruction fetch misses from TLB"},
-    {0x03, "DCACHE_REFILL",
+    {0x03, 0, "DCACHE_REFILL",
      "Data R/W operation that causes a refill from cache or normal cacheable"
      "memory"},
-    {0x04, "DCACHE_ACCESS",
+    {0x04, 0, "DCACHE_ACCESS",
      "Data R/W from cache"},
-    {0x05, "DTLB_REFILL",
+    {0x05, 0, "DTLB_REFILL",
      "Data R/W that causes a TLB refill"},
-    {0x06, "DREAD",
+    {0x06, 0, "DREAD",
      "Data read architecturally executed (note: architecturally executed = for"
      "instructions that are unconditional or that pass the condition code)"},
-    {0x07, "DWRITE",
+    {0x07, 0, "DWRITE",
      "Data write architecturally executed"},
-    {0x08, "INSTR_EXECUTED",
+    {0x08, 0, "INSTR_EXECUTED",
      "All executed instructions"},
-    {0x09, "EXC_TAKEN",
+    {0x09, 0, "EXC_TAKEN",
      "Exception taken"},
-    {0x0A, "EXC_EXECUTED",
+    {0x0A, 0, "EXC_EXECUTED",
      "Exception return architecturally executed"},
-    {0x0B, "CID_WRITE",
+    {0x0B, 0, "CID_WRITE",
      "Instruction that writes to the Context ID Register architecturally"
      "executed"},
-    {0x0C, "PC_WRITE",
+    {0x0C, 0, "PC_WRITE",
      "SW change of PC, architecturally executed (not by exceptions)"},
-    {0x0D, "PC_IMM_BRANCH",
+    {0x0D, 0, "PC_IMM_BRANCH",
      "Immediate branch instruction executed (taken or not)"},
-    {0x0E, "PC_PROC_RETURN",
+    {0x0E, 0, "PC_PROC_RETURN",
      "Procedure return architecturally executed (not by exceptions)"},
-    {0x0F, "UNALIGNED_ACCESS",
+    {0x0F, 0, "UNALIGNED_ACCESS",
      "Unaligned access architecturally executed"},
-    {0x10, "PC_BRANCH_MIS_PRED",
+    {0x10, 0, "PC_BRANCH_MIS_PRED",
      "Branch mispredicted or not predicted. Counts pipeline flushes because of"
      "misprediction"},
-    {0x12, "PC_BRANCH_MIS_USED",
+    {0x12, 0, "PC_BRANCH_MIS_USED",
     "Branch or change in program flow that could have been predicted"},
-    {0x40, "WRITE_BUFFER_FULL",
+    {0x40, 0, "WRITE_BUFFER_FULL",
      "Any write buffer full cycle"},
-    {0x41, "L2_STORE_MERGED",
+    {0x41, 0, "L2_STORE_MERGED",
      "Any store that is merged in L2 cache"},
-    {0x42, "L2_STORE_BUFF",
+    {0x42, 0, "L2_STORE_BUFF",
      "Any bufferable store from load/store to L2 cache"},
-    {0x43, "L2_ACCESS",
+    {0x43, 0, "L2_ACCESS",
      "Any access to L2 cache"},
-    {0x44, "L2_CACH_MISS",
+    {0x44, 0, "L2_CACH_MISS",
      "Any cacheable miss in L2 cache"},
-    {0x45, "AXI_READ_CYCLES",
+    {0x45, 0, "AXI_READ_CYCLES",
      "Number of cycles for an active AXI read"},
-    {0x46, "AXI_WRITE_CYCLES",
+    {0x46, 0, "AXI_WRITE_CYCLES",
      "Number of cycles for an active AXI write"},
-    {0x47, "MEMORY_REPLAY",
+    {0x47, 0, "MEMORY_REPLAY",
      "Any replay event in the memory subsystem"},
-    {0x48, "UNALIGNED_ACCESS_REPLAY",
+    {0x48, 0, "UNALIGNED_ACCESS_REPLAY",
      "Unaligned access that causes a replay"},
-    {0x49, "L1_DATA_MISS",
+    {0x49, 0, "L1_DATA_MISS",
      "L1 data cache miss as a result of the hashing algorithm"},
-    {0x4A, "L1_INST_MISS",
+    {0x4A, 0, "L1_INST_MISS",
      "L1 instruction cache miss as a result of the hashing algorithm"},
-    {0x4B, "L1_DATA_COLORING",
+    {0x4B, 0, "L1_DATA_COLORING",
      "L1 data access in which a page coloring alias occurs"},
-    {0x4C, "L1_NEON_DATA",
+    {0x4C, 0, "L1_NEON_DATA",
      "NEON data access that hits L1 cache"},
-    {0x4D, "L1_NEON_CACH_DATA",
+    {0x4D, 0, "L1_NEON_CACH_DATA",
      "NEON cacheable data access that hits L1 cache"},
-    {0x4E, "L2_NEON",
+    {0x4E, 0, "L2_NEON",
      "L2 access as a result of NEON memory access"},
-    {0x4F, "L2_NEON_HIT",
+    {0x4F, 0, "L2_NEON_HIT",
      "Any NEON hit in L2 cache"},
-    {0x50, "L1_INST",
+    {0x50, 0, "L1_INST",
      "Any L1 instruction cache access, excluding CP15 cache accesses"},
-    {0x51, "PC_RETURN_MIS_PRED",
+    {0x51, 0, "PC_RETURN_MIS_PRED",
      "Return stack misprediction at return stack pop"
      "(incorrect target address)"},
-    {0x52, "PC_BRANCH_FAILED",
+    {0x52, 0, "PC_BRANCH_FAILED",
      "Branch prediction misprediction"},
-    {0x53, "PC_BRANCH_TAKEN",
+    {0x53, 0, "PC_BRANCH_TAKEN",
      "Any predicted branch that is taken"},
-    {0x54, "PC_BRANCH_EXECUTED",
+    {0x54, 0, "PC_BRANCH_EXECUTED",
      "Any taken branch that is executed"},
-    {0x55, "OP_EXECUTED",
+    {0x55, 0, "OP_EXECUTED",
      "Number of operations executed"
      "(in instruction or mutli-cycle instruction)"},
-    {0x56, "CYCLES_INST_STALL",
+    {0x56, 0, "CYCLES_INST_STALL",
      "Cycles where no instruction available"},
-    {0x57, "CYCLES_INST",
+    {0x57, 0, "CYCLES_INST",
      "Number of instructions issued in a cycle"},
-    {0x58, "CYCLES_NEON_DATA_STALL",
+    {0x58, 0, "CYCLES_NEON_DATA_STALL",
      "Number of cycles the processor waits on MRC data from NEON"},
-    {0x59, "CYCLES_NEON_INST_STALL",
+    {0x59, 0, "CYCLES_NEON_INST_STALL",
      "Number of cycles the processor waits on NEON instruction queue or"
      "NEON load queue"},
-    {0x5A, "NEON_CYCLES",
+    {0x5A, 0, "NEON_CYCLES",
      "Number of cycles NEON and integer processors are not idle"},
-    {0x70, "PMU0_EVENTS",
+    {0x70, 0, "PMU0_EVENTS",
      "Number of events from external input source PMUEXTIN[0]"},
-    {0x71, "PMU1_EVENTS",
+    {0x71, 0, "PMU1_EVENTS",
      "Number of events from external input source PMUEXTIN[1]"},
-    {0x72, "PMU_EVENTS",
+    {0x72, 0, "PMU_EVENTS",
      "Number of events from both external input sources PMUEXTIN[0]"
      "and PMUEXTIN[1]"},
-    {0xFF, "CPU_CYCLES",
+    {0xFF, 0, "CPU_CYCLES",
      "Number of CPU cycles"},
 #endif
 };
@@ -238,13 +289,19 @@
     printf("\nopcontrol: usage:\n"
            "   --list-events    list event types\n"
            "   --help           this message\n"
+           "   --verbose        show extra status\n"
            "   --setup          setup directories\n"
+#if defined(__i386__) || defined(__x86_64__)
+           "   --quick          setup and select CPU_CLK_UNHALTED:60000\n"
+#else
            "   --quick          setup and select CPU_CYCLES:150000\n"
+#endif
            "   --status         show configuration\n"
            "   --start          start data collection\n"
            "   --stop           stop data collection\n"
            "   --reset          clears out data from current session\n"
            "   --shutdown       kill the oprofile daeman\n"
+           "   --callgraph=depth callgraph depth\n"
            "   --event=eventspec\n"
            "      Choose an event. May be specified multiple times.\n"
            "      eventspec is in the form of name[:count], where :\n"
@@ -459,8 +516,9 @@
             printf("  %9u samples received\n", num);
             num = read_num(OP_DRIVER_BASE"/stats/cpu0/sample_lost_overflow");
             printf("  %9u samples lost overflow\n", num);
-#if 0
-            /* FIXME - backtrace seems broken */
+
+#if defined(__i386__) || defined(__x86_64__)
+            /* FIXME on ARM - backtrace seems broken there */
             num = read_num(OP_DRIVER_BASE"/stats/cpu0/backtrace_aborted");
             printf("  %9u backtrace aborted\n", num);
             num = read_num(OP_DRIVER_BASE"/backtrace_depth");
@@ -495,13 +553,17 @@
     strcpy(kernel_range, "");
 
     while (1) {
-        int c = getopt_long(argc, argv, "", long_options, &option_index);
+        int c = getopt_long(argc, argv, "c:e:v:r:dhVt", long_options, &option_index);
         if (c == -1) {
             break;
         }
         switch (c) {
             case 0:
                 break;
+            /* --callgraph: store the depth string, truncated to fit and always NUL-terminated */
+            case 'c':
+                snprintf(callgraph, sizeof(callgraph), "%s", optarg);
+                break;
             /* --event */
             case 'e':   
                 if (num_events == MAX_EVENTS) {
@@ -521,15 +583,33 @@
             case 'r':
                 sprintf(kernel_range, "-r %s", optarg);
                 break;
+            /* --dump: write "1" to the driver's dump file, then SIGHUP the pid read from the lock file */
+            case 'd': {
+                int pid = read_num(OP_DATA_DIR"/lock");
+                echo_dev("1", 0, "dump", -1);
+                if (pid > 0) { /* pid 0 would signal our own process group */
+                    sleep(1);
+                    kill(pid, SIGHUP);
+                }
+                break;
+            }
             /* --shutdown */
             case 'h': {
                 int pid = read_num(OP_DATA_DIR"/lock");
                 if (pid >= 0) {
+                    kill(pid, SIGHUP); /* Politely ask the daemon to close files */
+                    sleep(1);
+                    kill(pid, SIGTERM);/* Politely ask the daemon to die */
+                    sleep(1);
                     kill(pid, SIGKILL);
                 }   
                 setup_session_dir();
                 break;
             }
+            /* --verbose */
+            case 'V':
+                verbose_print++;
+                break;
             /* --status */
             case 't':
                 do_status();
@@ -547,7 +627,11 @@
     }
 
     if (quick) {
+#if defined(__i386__) || defined(__x86_64__)
+        process_event("CPU_CLK_UNHALTED");
+#else
         process_event("CPU_CYCLES");
+#endif
         setup = 1;
     }
 
@@ -566,12 +650,18 @@
         }
     }
 
+    if (strlen(callgraph)) {
+        echo_dev(callgraph, 0, "backtrace_depth", -1);
+    }
+
     if (num_events != 0) {
         int i;
 
         strcpy(command, "oprofiled --session-dir="OP_DATA_DIR);
 
-#if !defined(WITH_ARM_V7_A)
+#if defined(__i386__) || defined(__x86_64__)
+        /* Nothing */
+#elif !defined(WITH_ARM_V7_A)
         /* Since counter #3 can only handle CPU_CYCLES, check and shuffle the 
          * order a bit so that the maximal number of events can be profiled
          * simultaneously
@@ -624,15 +714,16 @@
              * --events=CYCLES_DATA_STALL:2:0:200000:0:1:1,....
              */
             snprintf(command+strlen(command), 1024 - strlen(command), 
-                     "%s:%d:%d:%d:0:1:1",
+                     "%s:%d:%d:%d:%d:1:1",
                      event_info[event_idx].name,
                      event_info[event_idx].id,
                      i,
-                     selected_counts[i]);
+                     selected_counts[i],
+                     event_info[event_idx].um);
 
             setup_result |= echo_dev("1", 0, "user", i);
             setup_result |= echo_dev("1", 0, "kernel", i);
-            setup_result |= echo_dev("0", 0, "unit_mask", i);
+            setup_result |= echo_dev(NULL, event_info[event_idx].um, "unit_mask", i);
             setup_result |= echo_dev("1", 0, "enabled", i);
             setup_result |= echo_dev(NULL, selected_counts[i], "count", i);
             setup_result |= echo_dev(NULL, event_info[event_idx].id,