fix iteration over CPUs

Since kernel version 4.9.0, BPF no longer works in a KVM guest.
The problem is that perf_event_open is called with CPU identifiers that
do not exist (ENODEV). The root cause is that the current code assumes
consecutively numbered CPUs. However, this is not always the case
(e.g. with CPU hotplugging).

This patch introduces the get_online_cpus() and get_possible_cpus()
helper functions and uses the appropriate function for iterations over
CPUs. The BPF_MAP_TYPE_PERF_EVENT_ARRAY map now contains an entry for
each possible CPU instead of one for each online CPU.

Fixes: #893
Signed-off-by: Andreas Gerstmayr <andreas.gerstmayr@catalysts.cc>
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index fb0eedb..1da2a67 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -56,6 +56,8 @@
   COMMAND ${TEST_WRAPPER} py_test_tracepoint sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tracepoint.py)
 add_test(NAME py_test_perf_event WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   COMMAND ${TEST_WRAPPER} py_test_perf_event sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_perf_event.py)
+add_test(NAME py_test_utils WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_utils sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.py)
 
 add_test(NAME py_test_dump_func WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   COMMAND ${TEST_WRAPPER} py_dump_func simple ${CMAKE_CURRENT_SOURCE_DIR}/test_dump_func.py)
diff --git a/tests/python/test_array.py b/tests/python/test_array.py
index 3fe1dce..a7c82a4 100755
--- a/tests/python/test_array.py
+++ b/tests/python/test_array.py
@@ -6,6 +6,8 @@
 import ctypes as ct
 import random
 import time
+import subprocess
+from bcc.utils import get_online_cpus
 from unittest import main, TestCase
 
 class TestArray(TestCase):
@@ -62,6 +64,37 @@
         time.sleep(0.1)
         b.kprobe_poll()
         self.assertGreater(self.counter, 0)
+        b.cleanup()
+
+    def test_perf_buffer_for_each_cpu(self):
+        self.events = []
+
+        class Data(ct.Structure):
+            _fields_ = [("cpu", ct.c_ulonglong)]
+
+        def cb(cpu, data, size):
+            self.assertGreater(size, ct.sizeof(Data))
+            event = ct.cast(data, ct.POINTER(Data)).contents
+            self.events.append(event)
+
+        text = """
+BPF_PERF_OUTPUT(events);
+int kprobe__sys_nanosleep(void *ctx) {
+    struct {
+        u64 cpu;
+    } data = {bpf_get_smp_processor_id()};
+    events.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+"""
+        b = BPF(text=text)
+        b["events"].open_perf_buffer(cb)
+        online_cpus = get_online_cpus()
+        for cpu in online_cpus:
+            subprocess.call(['taskset', '-c', str(cpu), 'sleep', '0.1'])
+        b.kprobe_poll()
+        b.cleanup()
+        self.assertGreaterEqual(len(self.events), len(online_cpus), 'Received only {}/{} events'.format(len(self.events), len(online_cpus)))
 
 if __name__ == "__main__":
     main()
diff --git a/tests/python/test_utils.py b/tests/python/test_utils.py
new file mode 100755
index 0000000..08862e7
--- /dev/null
+++ b/tests/python/test_utils.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+# Copyright (c) Catalysts GmbH
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc.utils import get_online_cpus
+import multiprocessing
+import unittest
+
+class TestUtils(unittest.TestCase):
+    def test_get_online_cpus(self):
+        online_cpus = get_online_cpus()
+        num_cores = multiprocessing.cpu_count()
+
+        self.assertEqual(len(online_cpus), num_cores)
+
+
+if __name__ == "__main__":
+    unittest.main()