Support killing sockets using SOCK_DESTROY.

This gives netd the ability to close sockets on a particular
source IP address using SOCK_DESTROY. It does not yet enable
this behaviour.

The microbenchmark is able to close 500 IPv6 sockets in ~30ms on
my angler. Specifically:

- Scanning 500 socketpairs: ~5ms
- Scanning 500 socketpairs and killing one half of each: ~30ms
- Scanning 500 socketpairs and killing both halves of each: ~40ms

This is about ~2.5x-3.5x slower than SIOCKILLADDR:
 - For 500 sockets, it's 9.5ms vs. 22.9ms.
 - For 4000 sockets, it's ~40ms vs ~135ms.

A large part of that is due to sending RST packets, which
SIOCKILLADDR does not do. If the kernel is modified so that
SOCK_DESTROY does not send RSTs, the time taken to kill 4000
sockets goes down to ~70ms

Batching the destroy operations does not help much. It saves
5-10%, but it complicates error handling.

Bug: 26976388
Change-Id: I2e1ac30af5dbcdb98dbb7c6e4d4d67c55b9fd00f
diff --git a/server/Android.mk b/server/Android.mk
index d0cd49e..b713b4f 100644
--- a/server/Android.mk
+++ b/server/Android.mk
@@ -68,6 +68,7 @@
         PppController.cpp \
         ResolverController.cpp \
         RouteController.cpp \
+        SockDiag.cpp \
         SoftapController.cpp \
         StrictController.cpp \
         TetherController.cpp \
diff --git a/server/SockDiag.cpp b/server/SockDiag.cpp
new file mode 100644
index 0000000..2f1437c
--- /dev/null
+++ b/server/SockDiag.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+#include <netdb.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <linux/netlink.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+
+#define LOG_TAG "Netd"
+
+#include <cutils/log.h>
+
+#include "NetdConstants.h"
+#include "SockDiag.h"
+
+#ifndef SOCK_DESTROY
+#define SOCK_DESTROY 21
+#endif
+
+namespace {
+
+struct AddrinfoDeleter {
+  void operator()(addrinfo *a) { if (a) freeaddrinfo(a); }
+};
+
+typedef std::unique_ptr<addrinfo, AddrinfoDeleter> ScopedAddrinfo;
+
+int checkError(int fd) {
+    struct {
+        nlmsghdr h;
+        nlmsgerr err;
+    } __attribute__((__packed__)) ack;
+    ssize_t bytesread = recv(fd, &ack, sizeof(ack), MSG_DONTWAIT | MSG_PEEK);
+    if (bytesread == -1) {
+       // Read failed (error), or nothing to read (good).
+       return (errno == EAGAIN) ? 0 : -errno;
+    } else if (bytesread == (ssize_t) sizeof(ack) && ack.h.nlmsg_type == NLMSG_ERROR) {
+        // We got an error. Consume it.
+        recv(fd, &ack, sizeof(ack), 0);
+        return ack.err.error;
+    } else {
+        // The kernel replied with something. Leave it to the caller.
+        return 0;
+    }
+}
+
+}  // namespace
+
+bool SockDiag::open() {
+    if (hasSocks()) {
+        return false;
+    }
+
+    mSock = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_INET_DIAG);
+    mWriteSock = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_INET_DIAG);
+    if (!hasSocks()) {
+        closeSocks();
+        return false;
+    }
+
+    sockaddr_nl nl = { .nl_family = AF_NETLINK };
+    if ((connect(mSock, reinterpret_cast<sockaddr *>(&nl), sizeof(nl)) == -1) ||
+        (connect(mWriteSock, reinterpret_cast<sockaddr *>(&nl), sizeof(nl)) == -1)) {
+        closeSocks();
+        return false;
+    }
+
+    return true;
+}
+
+int SockDiag::sendDumpRequest(uint8_t proto, uint8_t family, const char *addrstr) {
+    addrinfo hints = { .ai_flags = AI_NUMERICHOST };
+    addrinfo *res;
+    in6_addr mapped = { .s6_addr32 = { 0, 0, htonl(0xffff), 0 } };
+    int ret;
+
+    // TODO: refactor the netlink parsing code out of system/core, bring it into netd, and stop
+    // doing string conversions when they're not necessary.
+    if ((ret = getaddrinfo(addrstr, nullptr, &hints, &res)) != 0) {
+        return -EINVAL;
+    }
+
+    // So we don't have to call freeaddrinfo on every failure path.
+    ScopedAddrinfo resP(res);
+
+    void *addr;
+    uint8_t addrlen;
+    if (res->ai_family == AF_INET && family == AF_INET) {
+        in_addr& ina = reinterpret_cast<sockaddr_in*>(res->ai_addr)->sin_addr;
+        addr = &ina;
+        addrlen = sizeof(ina);
+    } else if (res->ai_family == AF_INET && family == AF_INET6) {
+        in_addr& ina = reinterpret_cast<sockaddr_in*>(res->ai_addr)->sin_addr;
+        mapped.s6_addr32[3] = ina.s_addr;
+        addr = &mapped;
+        addrlen = sizeof(mapped);
+    } else if (res->ai_family == AF_INET6 && family == AF_INET6) {
+        in6_addr& in6a = reinterpret_cast<sockaddr_in6*>(res->ai_addr)->sin6_addr;
+        addr = &in6a;
+        addrlen = sizeof(in6a);
+    } else {
+        return -EAFNOSUPPORT;
+    }
+
+    uint8_t prefixlen = addrlen * 8;
+    uint8_t yesjump = sizeof(inet_diag_bc_op) + sizeof(inet_diag_hostcond) + addrlen;
+    uint8_t nojump = yesjump + 4;
+    uint32_t states = ~(1 << TCP_TIME_WAIT);
+
+    struct {
+        nlmsghdr nlh;
+        inet_diag_req_v2 req;
+        nlattr nla;
+        inet_diag_bc_op op;
+        inet_diag_hostcond cond;
+    } __attribute__((__packed__)) request = {
+        .nlh = {
+            .nlmsg_type = SOCK_DIAG_BY_FAMILY,
+            .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+        },
+        .req = {
+            .sdiag_family = family,
+            .sdiag_protocol = proto,
+            .idiag_states = states,
+        },
+        .nla = {
+            .nla_type = INET_DIAG_REQ_BYTECODE,
+        },
+        .op = {
+            INET_DIAG_BC_S_COND,
+            yesjump,
+            nojump,
+        },
+        .cond = {
+            family,
+            prefixlen,
+            -1,
+            {}
+        },
+    };
+
+    request.nlh.nlmsg_len = sizeof(request) + addrlen;
+    request.nla.nla_len = sizeof(request.nla) + sizeof(request.op) + sizeof(request.cond) + addrlen;
+
+    struct iovec iov[] = {
+        { &request, sizeof(request) },
+        { addr, addrlen },
+    };
+
+    if (writev(mSock, iov, ARRAY_SIZE(iov)) != (int) request.nlh.nlmsg_len) {
+        return -errno;
+    }
+
+    return checkError(mSock);
+}
+
+int SockDiag::readDiagMsg(uint8_t proto, SockDiag::DumpCallback callback) {
+    char buf[kBufferSize];
+
+    ssize_t bytesread;
+    do {
+        bytesread = read(mSock, buf, sizeof(buf));
+
+        if (bytesread < 0) {
+            return -errno;
+        }
+
+        uint32_t len = bytesread;
+        for (nlmsghdr *nlh = reinterpret_cast<nlmsghdr *>(buf);
+             NLMSG_OK(nlh, len);
+             nlh = NLMSG_NEXT(nlh, len)) {
+            switch (nlh->nlmsg_type) {
+              case NLMSG_DONE:
+                callback(proto, NULL);
+                return 0;
+              case NLMSG_ERROR: {
+                nlmsgerr *err = reinterpret_cast<nlmsgerr *>(NLMSG_DATA(nlh));
+                return err->error;
+              }
+              default:
+                inet_diag_msg *msg = reinterpret_cast<inet_diag_msg *>(NLMSG_DATA(nlh));
+                callback(proto, msg);
+            }
+        }
+    } while (bytesread > 0);
+
+    return 0;
+}
+
+int SockDiag::sockDestroy(uint8_t proto, const inet_diag_msg *msg) {
+    DestroyRequest request = {
+        .nlh = {
+            .nlmsg_type = SOCK_DESTROY,
+            .nlmsg_flags = NLM_F_REQUEST,
+        },
+        .req = {
+            .sdiag_family = msg->idiag_family,
+            .sdiag_protocol = proto,
+            .idiag_states = (uint32_t) (1 << msg->idiag_state),
+            .id = msg->id,
+        },
+    };
+    request.nlh.nlmsg_len = sizeof(request);
+
+    if (write(mWriteSock, &request, sizeof(request)) < (ssize_t) sizeof(request)) {
+        return -errno;
+    }
+
+    return checkError(mWriteSock);
+}
diff --git a/server/SockDiag.h b/server/SockDiag.h
new file mode 100644
index 0000000..3b6ca8b
--- /dev/null
+++ b/server/SockDiag.h
@@ -0,0 +1,33 @@
+#include <functional>
+
+#include <linux/netlink.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+
+struct inet_diag_msg;
+
+class SockDiag {
+
+  public:
+    static const int kBufferSize = 4096;
+    typedef std::function<int(uint8_t proto, const inet_diag_msg *)> DumpCallback;
+
+    struct DestroyRequest {
+        nlmsghdr nlh;
+        inet_diag_req_v2 req;
+    } __attribute__((__packed__));
+
+    SockDiag() : mSock(-1), mWriteSock(-1) {}
+    bool open();
+    virtual ~SockDiag() { closeSocks(); }
+
+    int sendDumpRequest(uint8_t proto, uint8_t family, const char *addrstr);
+    int readDiagMsg(uint8_t proto, DumpCallback callback);
+    int sockDestroy(uint8_t proto, const inet_diag_msg *);
+
+  private:
+    int mSock;
+    int mWriteSock;
+    bool hasSocks() { return mSock != -1 && mWriteSock != -1; }
+    void closeSocks() { close(mSock); close(mWriteSock); mSock = mWriteSock = -1; }
+};
diff --git a/tests/Android.mk b/tests/Android.mk
new file mode 100644
index 0000000..d9659f7
--- /dev/null
+++ b/tests/Android.mk
@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := sock_diag_test
+LOCAL_CFLAGS := -Wall -Werror -Wunused-parameter
+LOCAL_C_INCLUDES := system/netd/server
+LOCAL_SRC_FILES := sock_diag_test.cpp ../server/SockDiag.cpp
+LOCAL_MODULE_TAGS := tests
+LOCAL_SHARED_LIBRARIES := liblog
+
+include $(BUILD_NATIVE_TEST)
+
diff --git a/tests/sock_diag_test.cpp b/tests/sock_diag_test.cpp
new file mode 100644
index 0000000..67af978
--- /dev/null
+++ b/tests/sock_diag_test.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * sock_diag_test.cpp - unit tests for SockDiag.cpp
+ */
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <linux/inet_diag.h>
+
+#include <gtest/gtest.h>
+
+#include "NetdConstants.h"
+#include "SockDiag.h"
+
+
+#define NUM_SOCKETS 500
+
+
+class SockDiagTest : public ::testing::Test {
+};
+
+uint16_t bindAndListen(int s) {
+    for (int i = 0; i < 10; i++) {
+        uint16_t port = 1024 + arc4random_uniform(0xffff - 1024);
+        sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_port = htons(port) };
+        if (bind(s, (sockaddr *) &sin6, sizeof(sin6)) == 0) {
+            listen(s, 1);
+            return port;
+        }
+    }
+    close(s);
+    return 0;
+}
+
+const char *tcpStateName(uint8_t state) {
+    static const char *states[] = {
+        "???",
+        "TCP_ESTABLISHED",
+        "TCP_SYN_SENT",
+        "TCP_SYN_RECV",
+        "TCP_FIN_WAIT1",
+        "TCP_FIN_WAIT2",
+        "TCP_TIME_WAIT",
+        "TCP_CLOSE",
+        "TCP_CLOSE_WAIT",
+        "TCP_LAST_ACK",
+        "TCP_LISTEN",
+        "TCP_CLOSING",
+        "TCP_NEW_SYN_RECV",
+    };
+    return states[(state < ARRAY_SIZE(states)) ? state : 0];
+}
+
+TEST_F(SockDiagTest, TestDump) {
+    int v4socket = socket(AF_INET, SOCK_STREAM, 0);
+    int v6socket = socket(AF_INET6, SOCK_STREAM, 0);
+    int listensocket = socket(AF_INET6, SOCK_STREAM, 0);
+    ASSERT_NE(-1, v4socket) << "Failed to open IPv4 socket";
+    ASSERT_NE(-1, v6socket) << "Failed to open IPv6 socket";
+    ASSERT_NE(-1, listensocket) << "Failed to open listen socket";
+
+    uint16_t port = bindAndListen(listensocket);
+    ASSERT_NE(0, port) << "Can't bind to server port";
+
+    // Connect to loopback.
+    sockaddr_in server4 = { .sin_family = AF_INET, .sin_port = htons(port) };
+    sockaddr_in6 server6 = { .sin6_family = AF_INET6, .sin6_port = htons(port) };
+    ASSERT_EQ(0, connect(v4socket, (sockaddr *) &server4, sizeof(server4)))
+        << "IPv4 connect failed: " << strerror(errno);
+    ASSERT_EQ(0, connect(v6socket, (sockaddr *) &server6, sizeof(server6)))
+        << "IPv6 connect failed: " << strerror(errno);
+
+    sockaddr_in6 client46, client6;
+    socklen_t clientlen = std::max(sizeof(client46), sizeof(client6));
+    int accepted4 = accept(listensocket, (sockaddr *) &client46, &clientlen);
+    int accepted6 = accept(listensocket, (sockaddr *) &client6, &clientlen);
+    ASSERT_NE(-1, accepted4);
+    ASSERT_NE(-1, accepted6);
+
+    int v4SocketsSeen = 0;
+    bool seenclient46 = false;
+    bool seenNull = false;
+    char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+
+    fprintf(stderr, "Ports:\n  server=%d. client46=%d, client6=%d\n",
+            port, ntohs(client46.sin6_port), ntohs(client6.sin6_port));
+
+    auto checkIPv4Dump = [&] (uint8_t /* proto */, const inet_diag_msg *msg) {
+        if (msg == nullptr) {
+            EXPECT_FALSE(seenNull);
+            seenNull = true;
+            return 0;
+        }
+        EXPECT_EQ(htonl(INADDR_LOOPBACK), msg->id.idiag_src[0]);
+        v4SocketsSeen++;
+        seenclient46 |= (msg->id.idiag_sport == client46.sin6_port);
+        inet_ntop(AF_INET, msg->id.idiag_src, src, sizeof(src));
+        inet_ntop(AF_INET, msg->id.idiag_src, dst, sizeof(dst));
+        fprintf(stderr, "  v4 %s:%d -> %s:%d %s\n",
+                src, htons(msg->id.idiag_sport),
+                dst, htons(msg->id.idiag_dport),
+                tcpStateName(msg->idiag_state));
+        return 0;
+    };
+
+    int v6SocketsSeen = 0;
+    bool seenClient6 = false, seenServer46 = false, seenServer6 = false;
+
+    auto checkIPv6Dump = [&] (uint8_t /* proto */, const inet_diag_msg *msg) {
+        if (msg == nullptr) {
+            EXPECT_FALSE(seenNull);
+            seenNull = true;
+            return 0;
+        }
+        struct in6_addr *saddr = (struct in6_addr *) msg->id.idiag_src;
+        EXPECT_TRUE(
+            IN6_IS_ADDR_LOOPBACK(saddr) ||
+            (IN6_IS_ADDR_V4MAPPED(saddr) && saddr->s6_addr32[3] == htonl(INADDR_LOOPBACK)));
+        v6SocketsSeen++;
+        seenClient6 |= (msg->id.idiag_sport == client6.sin6_port);
+        seenServer46 |= (msg->id.idiag_sport == htons(port));
+        seenServer6 |= (msg->id.idiag_sport == htons(port));
+        inet_ntop(AF_INET6, msg->id.idiag_src, src, sizeof(src));
+        inet_ntop(AF_INET6, msg->id.idiag_src, dst, sizeof(dst));
+        fprintf(stderr, "  v6 [%s]:%d -> [%s]:%d %s\n",
+                src, htons(msg->id.idiag_sport),
+                dst, htons(msg->id.idiag_dport),
+                tcpStateName(msg->idiag_state));
+        return 0;
+    };
+
+    SockDiag sd;
+    ASSERT_TRUE(sd.open()) << "Failed to open SOCK_DIAG socket";
+
+    seenNull = false;
+    int ret = sd.sendDumpRequest(IPPROTO_TCP, AF_INET, "127.0.0.1");
+    ASSERT_EQ(0, ret) << "Failed to send IPv4 dump request: " << strerror(-ret);
+    fprintf(stderr, "Sent IPv4 dump\n");
+    sd.readDiagMsg(IPPROTO_TCP, checkIPv4Dump);
+    EXPECT_GE(v4SocketsSeen, 1);
+    EXPECT_TRUE(seenclient46);
+    EXPECT_FALSE(seenServer46);
+
+    seenNull = false;
+    ret = sd.sendDumpRequest(IPPROTO_TCP, AF_INET6, "127.0.0.1");
+    ASSERT_EQ(0, ret) << "Failed to send mapped dump request: " << strerror(-ret);
+    fprintf(stderr, "Sent mapped dump\n");
+    sd.readDiagMsg(IPPROTO_TCP, checkIPv6Dump);
+    EXPECT_TRUE(seenServer46);
+
+    seenNull = false;
+    ret = sd.sendDumpRequest(IPPROTO_TCP, AF_INET6, "::1");
+    ASSERT_EQ(0, ret) << "Failed to send IPv6 dump request: " << strerror(-ret);
+    fprintf(stderr, "Sent IPv6 dump\n");
+
+    sd.readDiagMsg(IPPROTO_TCP, checkIPv6Dump);
+    EXPECT_GE(v6SocketsSeen, 1);
+    EXPECT_TRUE(seenClient6);
+    EXPECT_TRUE(seenServer6);
+
+    close(v4socket);
+    close(v6socket);
+    close(listensocket);
+    close(accepted4);
+    close(accepted6);
+}
+
+TEST_F(SockDiagTest, TestMicroBenchmark) {
+    fprintf(stderr, "Benchmarking closing %d sockets\n", NUM_SOCKETS);
+
+    int listensocket = socket(AF_INET6, SOCK_STREAM, 0);
+    ASSERT_NE(-1, listensocket) << "Failed to open listen socket";
+
+    uint16_t port = bindAndListen(listensocket);
+    ASSERT_NE(0, port) << "Can't bind to server port";
+    sockaddr_in6 server = { .sin6_family = AF_INET6, .sin6_port = htons(port) };
+
+    using ms = std::chrono::duration<float, std::ratio<1, 1000>>;
+
+    int clientsockets[NUM_SOCKETS], serversockets[NUM_SOCKETS];
+    uint16_t clientports[NUM_SOCKETS];
+    sockaddr_in6 client;
+    socklen_t clientlen;
+
+    auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i < NUM_SOCKETS; i++) {
+        int s = socket(AF_INET6, SOCK_STREAM, 0);
+        clientlen = sizeof(client);
+        ASSERT_EQ(0, connect(s, (sockaddr *) &server, sizeof(server)))
+            << "Connecting socket " << i << " failed " << strerror(errno);
+        serversockets[i] = accept(listensocket, (sockaddr *) &client, &clientlen);
+        ASSERT_NE(-1, serversockets[i])
+            << "Accepting socket " << i << " failed " << strerror(errno);
+        clientports[i] = client.sin6_port;
+        clientsockets[i] = s;
+    }
+    fprintf(stderr, "  Connecting: %6.1f ms\n",
+            std::chrono::duration_cast<ms>(std::chrono::steady_clock::now() - start).count());
+
+    SockDiag sd;
+    ASSERT_TRUE(sd.open()) << "Failed to open SOCK_DIAG socket";
+
+    int ret = sd.sendDumpRequest(IPPROTO_TCP, AF_INET6, "::1");
+    ASSERT_EQ(0, ret) << "Failed to send IPv6 dump request: " << strerror(-ret);
+
+    auto closeMySockets = [&] (uint8_t proto, const inet_diag_msg *msg) {
+        if (msg && msg->id.idiag_dport == htons(port)) {
+            return sd.sockDestroy(proto, msg);
+        }
+        return 0;
+    };
+
+    start = std::chrono::steady_clock::now();
+    sd.readDiagMsg(IPPROTO_TCP, closeMySockets);
+    fprintf(stderr, "  Destroying: %6.1f ms\n",
+            std::chrono::duration_cast<ms>(std::chrono::steady_clock::now() - start).count());
+
+    int err;
+    start = std::chrono::steady_clock::now();
+    for (int i = 0; i < NUM_SOCKETS; i++) {
+        ret = send(clientsockets[i], "foo", sizeof("foo"), 0);
+        err = errno;
+        EXPECT_EQ(-1, ret) << "Client socket " << i << " not closed";
+        if (ret == -1) {
+            EXPECT_EQ(ECONNABORTED, errno)
+                << "Client socket: unexpected error: " << strerror(errno);
+        }
+
+        // Check that the server sockets have been closed too (because closing the client sockets
+        // sends RSTs).
+        ret = send(serversockets[i], "foo", sizeof("foo"), 0);
+        err = errno;
+        EXPECT_EQ(-1, ret) << "Server socket " << i << " not closed";
+        if (ret == -1) {
+            EXPECT_EQ(ECONNRESET, errno)
+                << "Server socket: unexpected error: " << strerror(errno);
+        }
+    }
+    fprintf(stderr, "   Verifying: %6.1f ms\n",
+            std::chrono::duration_cast<ms>(std::chrono::steady_clock::now() - start).count());
+
+
+
+    start = std::chrono::steady_clock::now();
+    for (int i = 0; i < NUM_SOCKETS; i++) {
+        close(clientsockets[i]);
+        close(serversockets[i]);
+    }
+    fprintf(stderr, "     Closing: %6.1f ms\n",
+            std::chrono::duration_cast<ms>(std::chrono::steady_clock::now() - start).count());
+
+    close(listensocket);
+}