introduce {attach|detach}_raw_tracepoint API

The motivation comes from pull request #1689.
It attached a kprobe bpf program to kernel function
ttwu_do_wakeup for more accurate tracing.
Unfortunately, it broke runqlat.py in my
4.17 environment since ttwu_do_wakeup function
is inlined in my kernel with gcc 7.3.1.

4.17 introduced raw_tracepoint and this patch
added the relevant API to bcc. With this,
we can use tracepoints
sched:{sched_wakeup, sched_wakeup_new, sched_switch}
to measure runq latency more reliably.

Signed-off-by: Yonghong Song <yhs@fb.com>
diff --git a/docs/reference_guide.md b/docs/reference_guide.md
index e24ac8a..63fb73c 100644
--- a/docs/reference_guide.md
+++ b/docs/reference_guide.md
@@ -14,6 +14,7 @@
         - [4. uprobes](#4-uprobes)
         - [5. uretprobes](#5-uretprobes)
         - [6. USDT probes](#6-usdt-probes)
+        - [7. Raw Tracepoints](#7-raw-tracepoints)
     - [Data](#data)
         - [1. bpf_probe_read()](#1-bpf_probe_read)
         - [2. bpf_probe_read_str()](#2-bpf_probe_read_str)
@@ -61,6 +62,7 @@
         - [4. attach_uprobe()](#4-attach_uprobe)
         - [5. attach_uretprobe()](#5-attach_uretprobe)
         - [6. USDT.enable_probe()](#6-usdtenable_probe)
+        - [7. attach_raw_tracepoint()](#7-attach_raw_tracepoint)
     - [Debug Output](#debug-output)
         - [1. trace_print()](#1-trace_print)
         - [2. trace_fields()](#2-trace_fields)
@@ -237,6 +239,35 @@
 [search /examples](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code),
 [search /tools](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code)
 
+### 7. Raw Tracepoints
+
+Syntax: RAW_TRACEPOINT_PROBE(*event*)
+
+This is a macro that instruments the raw tracepoint defined by *event*.
+
+The argument is a pointer to struct ```bpf_raw_tracepoint_args```, which is defined in [bpf.h](https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/bpf.h).  The struct field ```args``` contains all parameters of the raw tracepoint, which you can find in the Linux tree [include/trace/events](https://github.com/torvalds/linux/tree/master/include/trace/events)
+directory.
+
+For example:
+```C
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+    struct task_struct *prev = (struct task_struct *)ctx->args[1];
+    struct task_struct *next= (struct task_struct *)ctx->args[2];
+    s32 prev_tgid, next_tgid;
+
+    bpf_probe_read(&prev_tgid, sizeof(prev->tgid), &prev->tgid);
+    bpf_probe_read(&next_tgid, sizeof(next->tgid), &next->tgid);
+    bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid);
+}
+```
+
+This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid.
+
+Examples in situ:
+[search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code)
+
 ## Data
 
 ### 1. bpf_probe_read()
@@ -993,6 +1024,23 @@
 [search /examples](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code),
 [search /tools](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code)
 
+### 7. attach_raw_tracepoint()
+
+Syntax: ```BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")```
+
+Instruments the kernel raw tracepoint described by ```tracepoint``` (```event``` only, no ```category```), and when hit, runs the BPF function ```name()```.
+
+This is an explicit way to instrument tracepoints. The ```RAW_TRACEPOINT_PROBE``` syntax, covered in the earlier raw tracepoints section, is an alternate method.
+
+For example:
+
+```Python
+b.attach_raw_tracepoint("sched_switch", "do_trace")
+```
+
+Examples in situ:
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code)
+
 ## Debug Output
 
 ### 1. trace_print()
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index 2ed5ae1..11044cf 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -683,6 +683,9 @@
 #define TRACEPOINT_PROBE(category, event) \
 int tracepoint__##category##__##event(struct tracepoint__##category##__##event *args)
 
+#define RAW_TRACEPOINT_PROBE(event) \
+int raw_tracepoint__##event(struct bpf_raw_tracepoint_args *ctx)
+
 #define TP_DATA_LOC_READ_CONST(dst, field, length)                        \
         do {                                                              \
             unsigned short __offset = args->data_loc_##field & 0xFFFF;    \
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index dc50270..722350e 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -1061,6 +1061,21 @@
   return 0;
 }
 
+int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
+{
+  union bpf_attr attr;
+  int ret;
+
+  bzero(&attr, sizeof(attr));
+  attr.raw_tracepoint.name = ptr_to_u64(tp_name);
+  attr.raw_tracepoint.prog_fd = progfd;
+
+  ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
+  if (ret < 0)
+    fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
+  return ret;
+}
+
 void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                             perf_reader_lost_cb lost_cb, void *cb_cookie,
                             int pid, int cpu, int page_cnt) {
diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h
index e59d48a..589006f 100644
--- a/src/cc/libbpf.h
+++ b/src/cc/libbpf.h
@@ -81,6 +81,8 @@
                           const char *tp_name);
 int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
 
+int bpf_attach_raw_tracepoint(int progfd, char *tp_name);
+
 void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                             perf_reader_lost_cb lost_cb, void *cb_cookie,
                             int pid, int cpu, int page_cnt);
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index a1552e7..5dccd65 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -135,6 +135,17 @@
     TRACEPOINT = 5
     XDP = 6
     PERF_EVENT = 7
+    CGROUP_SKB = 8
+    CGROUP_SOCK = 9
+    LWT_IN = 10
+    LWT_OUT = 11
+    LWT_XMIT = 12
+    SOCK_OPS = 13
+    SK_SKB = 14
+    CGROUP_DEVICE = 15
+    SK_MSG = 16
+    RAW_TRACEPOINT = 17
+    CGROUP_SOCK_ADDR = 18
 
     # from xdp_action uapi/linux/bpf.h
     XDP_ABORTED = 0
@@ -267,6 +278,7 @@
         self.kprobe_fds = {}
         self.uprobe_fds = {}
         self.tracepoint_fds = {}
+        self.raw_tracepoint_fds = {}
         self.perf_buffers = {}
         self.open_perf_events = {}
         self.tracefile = None
@@ -310,7 +322,8 @@
         for usdt_context in usdt_contexts:
             usdt_context.attach_uprobes(self)
 
-        # If any "kprobe__" or "tracepoint__" prefixed functions were defined,
+        # If any "kprobe__" or "tracepoint__" or "raw_tracepoint__"
+        # prefixed functions were defined,
         # they will be loaded and attached here.
         self._trace_autoload()
 
@@ -725,6 +738,52 @@
         self.tracepoint_fds[tp] = fd
         return self
 
+    def attach_raw_tracepoint(self, tp=b"", fn_name=b""):
+        """attach_raw_tracepoint(self, tp=b"", fn_name=b"")
+
+        Run the bpf function denoted by fn_name every time the kernel tracepoint
+        specified by 'tp' is hit. The bpf function should be loaded as a
+        RAW_TRACEPOINT type. The 'tp' is the kernel tracepoint name,
+        e.g., sched_switch, sys_enter_bind, etc.
+
+        Examples:
+            BPF(text).attach_raw_tracepoint(tp="sched_switch", fn_name="on_switch")
+        """
+
+        tp = _assert_is_bytes(tp)
+        if tp in self.raw_tracepoint_fds:
+            raise Exception("Raw tracepoint %s has been attached" % tp)
+
+        fn_name = _assert_is_bytes(fn_name)
+        fn = self.load_func(fn_name, BPF.RAW_TRACEPOINT)
+        fd = lib.bpf_attach_raw_tracepoint(fn.fd, tp)
+        if fd < 0:
+            raise Exception("Failed to attach BPF to raw tracepoint")
+        self.raw_tracepoint_fds[tp] = fd;
+        return self
+
+    def detach_raw_tracepoint(self, tp=b""):
+        """detach_raw_tracepoint(tp="")
+
+        Stop running the bpf function that is attached to the kernel tracepoint
+        specified by 'tp'.
+
+        Example: bpf.detach_raw_tracepoint("sched_switch")
+        """
+
+        tp = _assert_is_bytes(tp)
+        if tp not in self.raw_tracepoint_fds:
+            raise Exception("Raw tracepoint %s is not attached" % tp)
+        os.close(self.raw_tracepoint_fds[tp])
+        del self.raw_tracepoint_fds[tp]
+
+    @staticmethod
+    def support_raw_tracepoint():
+        # kernel symbol "bpf_find_raw_tracepoint" indicates raw_tracepoint support
+        if BPF.ksymname("bpf_find_raw_tracepoint") != -1:
+            return True
+        return False
+
     def detach_tracepoint(self, tp=b""):
         """detach_tracepoint(tp="")
 
@@ -954,6 +1013,10 @@
                 fn = self.load_func(func_name, BPF.TRACEPOINT)
                 tp = fn.name[len(b"tracepoint__"):].replace(b"__", b":")
                 self.attach_tracepoint(tp=tp, fn_name=fn.name)
+            elif func_name.startswith(b"raw_tracepoint__"):
+                fn = self.load_func(func_name, BPF.RAW_TRACEPOINT)
+                tp = fn.name[len(b"raw_tracepoint__"):]
+                self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)
 
     def trace_open(self, nonblocking=False):
         """trace_open(nonblocking=False)
@@ -1154,6 +1217,8 @@
             self.detach_uprobe_event(k)
         for k, v in list(self.tracepoint_fds.items()):
             self.detach_tracepoint(k)
+        for k, v in list(self.raw_tracepoint_fds.items()):
+            self.detach_raw_tracepoint(k)
 
         # Clean up opened perf ring buffer and perf events
         table_keys = list(self.tables.keys())
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
index fd9f72b..e61227e 100644
--- a/src/python/bcc/libbcc.py
+++ b/src/python/bcc/libbcc.py
@@ -100,6 +100,8 @@
 lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p]
 lib.bpf_detach_tracepoint.restype = ct.c_int
 lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
+lib.bpf_attach_raw_tracepoint.restype = ct.c_int
+lib.bpf_attach_raw_tracepoint.argtypes = [ct.c_int, ct.c_char_p]
 lib.bpf_open_perf_buffer.restype = ct.c_void_p
 lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, _LOST_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
 lib.bpf_open_perf_event.restype = ct.c_int
diff --git a/tools/runqlat.py b/tools/runqlat.py
index 95657cd..ebda11d 100755
--- a/tools/runqlat.py
+++ b/tools/runqlat.py
@@ -95,7 +95,9 @@
     start.update(&pid, &ts);
     return 0;
 }
+"""
 
+bpf_text_kprobe = """
 int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
 {
     return trace_enqueue(p->tgid, p->pid);
@@ -144,6 +146,76 @@
 }
 """
 
+bpf_text_raw_tp = """
+RAW_TRACEPOINT_PROBE(sched_wakeup)
+{
+    // TP_PROTO(struct task_struct *p)
+    struct task_struct *p = (struct task_struct *)ctx->args[0];
+    u32 tgid, pid;
+
+    bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
+    bpf_probe_read(&pid, sizeof(pid), &p->pid);
+    return trace_enqueue(tgid, pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
+{
+    // TP_PROTO(struct task_struct *p)
+    struct task_struct *p = (struct task_struct *)ctx->args[0];
+    u32 tgid, pid;
+
+    bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
+    bpf_probe_read(&pid, sizeof(pid), &p->pid);
+    return trace_enqueue(tgid, pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+    struct task_struct *prev = (struct task_struct *)ctx->args[1];
+    struct task_struct *next= (struct task_struct *)ctx->args[2];
+    u32 pid, tgid;
+    long state;
+
+    // ivcsw: treat like an enqueue event and store timestamp
+    bpf_probe_read(&state, sizeof(long), &prev->state);
+    if (state == TASK_RUNNING) {
+        bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid);
+        bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid);
+        if (!(FILTER)) {
+            u64 ts = bpf_ktime_get_ns();
+            start.update(&pid, &ts);
+        }
+    }
+
+    bpf_probe_read(&tgid, sizeof(next->tgid), &next->tgid);
+    bpf_probe_read(&pid, sizeof(next->pid), &next->pid);
+    if (FILTER)
+        return 0;
+    u64 *tsp, delta;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed enqueue
+    }
+    delta = bpf_ktime_get_ns() - *tsp;
+    FACTOR
+
+    // store as histogram
+    STORE
+
+    start.delete(&pid);
+    return 0;
+}
+"""
+
+is_support_raw_tp = BPF.support_raw_tracepoint()
+if is_support_raw_tp:
+    bpf_text += bpf_text_raw_tp
+else:
+    bpf_text += bpf_text_kprobe
+
 # code substitutions
 if args.pid:
     # pid from userspace point of view is thread group from kernel pov
@@ -186,9 +258,10 @@
 
 # load BPF program
 b = BPF(text=bpf_text)
-b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
-b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
-b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
+if not is_support_raw_tp:
+    b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
+    b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
+    b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
 
 print("Tracing run queue latency... Hit Ctrl-C to end.")