Support killing sockets using SOCK_DESTROY.

This gives netd the ability to close sockets on a particular
source IP address using SOCK_DESTROY. It does not yet enable
this behaviour.

The microbenchmark is able to close 500 IPv6 sockets in ~30ms on
my angler. Specifically:

- Scanning 500 socketpairs: ~5ms
- Scanning 500 socketpairs and killing one half of each: ~30ms
- Scanning 500 socketpairs and killing both halves of each: ~40ms

This is ~2.5x-3.5x slower than SIOCKILLADDR:
- For 500 sockets, it's 9.5ms (SIOCKILLADDR) vs. 22.9ms (SOCK_DESTROY).
- For 4000 sockets, it's ~40ms vs. ~135ms.

A large part of that is due to sending RST packets, which
SIOCKILLADDR does not do. If the kernel is modified so that
SOCK_DESTROY does not send RSTs, the time taken to kill 4000
sockets goes down to ~70ms.

Batching the destroy operations does not help much. It saves
5-10%, but it complicates error handling.

Bug: 26976388
Change-Id: I2e1ac30af5dbcdb98dbb7c6e4d4d67c55b9fd00f
diff --git a/server/SockDiag.cpp b/server/SockDiag.cpp
new file mode 100644
index 0000000..2f1437c
--- /dev/null
+++ b/server/SockDiag.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+#include <netdb.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <linux/netlink.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+
+#define LOG_TAG "Netd"
+
+#include <cutils/log.h>
+
+#include "NetdConstants.h"
+#include "SockDiag.h"
+
+#ifndef SOCK_DESTROY  // Fallback for builds whose included headers do not define it.
+#define SOCK_DESTROY 21
+#endif
+
+namespace {
+
+struct AddrinfoDeleter {  // Frees a getaddrinfo() result; a null pointer is ignored.
+  void operator()(addrinfo *info) { if (info != nullptr) freeaddrinfo(info); }
+};
+
+using ScopedAddrinfo = std::unique_ptr<addrinfo, AddrinfoDeleter>;  // Owning addrinfo handle.
+
+int checkError(int fd) {
+    struct {
+        nlmsghdr h;
+        nlmsgerr err;
+    } __attribute__((__packed__)) ack;
+    ssize_t bytesread = recv(fd, &ack, sizeof(ack), MSG_DONTWAIT | MSG_PEEK);
+    if (bytesread == -1) {
+       // Read failed (error), or nothing to read (good).
+       return (errno == EAGAIN) ? 0 : -errno;
+    } else if (bytesread == (ssize_t) sizeof(ack) && ack.h.nlmsg_type == NLMSG_ERROR) {
+        // We got an error. Consume it.
+        recv(fd, &ack, sizeof(ack), 0);
+        return ack.err.error;
+    } else {
+        // The kernel replied with something. Leave it to the caller.
+        return 0;
+    }
+}
+
+}  // namespace
+
+// Creates and connects the two NETLINK_INET_DIAG sockets, mSock and mWriteSock. Returns false if
+// hasSocks() is already true, is still false after socket(), or if either connect() fails.
+bool SockDiag::open() {
+    if (hasSocks()) {
+        return false;
+    }
+    // SOCK_CLOEXEC: do not leak the netlink descriptors into processes this one execs.
+    mSock = socket(PF_NETLINK, SOCK_DGRAM | SOCK_CLOEXEC, NETLINK_INET_DIAG);
+    mWriteSock = socket(PF_NETLINK, SOCK_DGRAM | SOCK_CLOEXEC, NETLINK_INET_DIAG);
+    if (!hasSocks()) {
+        closeSocks();
+        return false;
+    }
+    sockaddr_nl nl = { .nl_family = AF_NETLINK };
+    if ((connect(mSock, reinterpret_cast<sockaddr *>(&nl), sizeof(nl)) == -1) ||
+        (connect(mWriteSock, reinterpret_cast<sockaddr *>(&nl), sizeof(nl)) == -1)) {
+        closeSocks();
+        return false;
+    }
+    return true;
+}
+
+// Sends an inet_diag dump request on mSock for sockets of protocol |proto| and family |family|
+// whose source address matches |addrstr|, requesting every state except TCP_TIME_WAIT. An IPv4
+// |addrstr| combined with AF_INET6 is matched as an IPv4-mapped IPv6 address.
+// Returns -EINVAL if getaddrinfo() rejects |addrstr|, -EAFNOSUPPORT for any other family
+// combination, -errno if the request is not fully written, else the result of checkError().
+int SockDiag::sendDumpRequest(uint8_t proto, uint8_t family, const char *addrstr) {
+    // TODO: refactor the netlink parsing code out of system/core, bring it into netd, and stop
+    // doing string conversions when they're not necessary.
+    addrinfo hints = {};
+    hints.ai_flags = AI_NUMERICHOST;
+    addrinfo *parsed;
+    if (getaddrinfo(addrstr, nullptr, &hints, &parsed) != 0) {
+        return -EINVAL;
+    }
+    ScopedAddrinfo parsedOwner(parsed);  // Frees the result on every return path below.
+
+    // Pick the raw address bytes that are appended to the request after the host condition.
+    in6_addr v4mapped = {};
+    v4mapped.s6_addr32[2] = htonl(0xffff);  // ::ffff:0.0.0.0; the last word is filled in below.
+    void *addr;
+    uint8_t addrlen;
+    const bool wantV4 = (family == AF_INET);
+    const bool wantV6 = (family == AF_INET6);
+    if (parsed->ai_family == AF_INET && (wantV4 || wantV6)) {
+        in_addr& v4 = reinterpret_cast<sockaddr_in*>(parsed->ai_addr)->sin_addr;
+        if (wantV4) {
+            addr = &v4;
+            addrlen = sizeof(v4);
+        } else {
+            v4mapped.s6_addr32[3] = v4.s_addr;
+            addr = &v4mapped;
+            addrlen = sizeof(v4mapped);
+        }
+    } else if (parsed->ai_family == AF_INET6 && wantV6) {
+        in6_addr& v6 = reinterpret_cast<sockaddr_in6*>(parsed->ai_addr)->sin6_addr;
+        addr = &v6;
+        addrlen = sizeof(v6);
+    } else {
+        return -EAFNOSUPPORT;
+    }
+
+    // Bytecode jump offsets: "yes" spans this op, its host condition and the address bytes;
+    // "no" is 4 bytes more than that.
+    const uint8_t prefixlen = addrlen * 8;
+    const uint8_t yesjump = sizeof(inet_diag_bc_op) + sizeof(inet_diag_hostcond) + addrlen;
+    const uint8_t nojump = yesjump + 4;
+    const uint32_t states = ~(1 << TCP_TIME_WAIT);
+
+    struct {
+        nlmsghdr nlh;
+        inet_diag_req_v2 req;
+        nlattr nla;
+        inet_diag_bc_op op;
+        inet_diag_hostcond cond;
+    } __attribute__((__packed__)) request = {
+        .nlh = {
+            .nlmsg_type = SOCK_DIAG_BY_FAMILY,
+            .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+        },
+        .req = {
+            .sdiag_family = family,
+            .sdiag_protocol = proto,
+            .idiag_states = states,
+        },
+        .nla = { .nla_type = INET_DIAG_REQ_BYTECODE },
+        .op = { .code = INET_DIAG_BC_S_COND, .yes = yesjump, .no = nojump },
+        .cond = { .family = family, .prefix_len = prefixlen, .port = -1 },
+    };
+
+    // The address travels in a second iovec, so both length fields must account for it.
+    request.nlh.nlmsg_len = sizeof(request) + addrlen;
+    request.nla.nla_len = sizeof(nlattr) + sizeof(inet_diag_bc_op) + sizeof(inet_diag_hostcond) +
+                          addrlen;
+
+    struct iovec iov[] = {
+        { &request, sizeof(request) },
+        { addr, addrlen },
+    };
+    const ssize_t written = writev(mSock, iov, ARRAY_SIZE(iov));
+    if (written != (int) request.nlh.nlmsg_len) {
+        return -errno;
+    }
+
+    return checkError(mSock);
+}
+
+// Reads inet_diag replies from mSock and invokes |callback| once per message, then once more
+// with a null message when NLMSG_DONE arrives. Returns 0 on NLMSG_DONE or when read() returns
+// 0, -errno if read() fails, or the error field of an NLMSG_ERROR reply.
+int SockDiag::readDiagMsg(uint8_t proto, SockDiag::DumpCallback callback) {
+    char buf[kBufferSize];
+
+    for (;;) {
+        const ssize_t nread = read(mSock, buf, sizeof(buf));
+        if (nread < 0) {
+            return -errno;
+        }
+        if (nread == 0) {
+            return 0;
+        }
+        // One read may return several netlink messages; walk them all.
+        uint32_t remaining = nread;
+        nlmsghdr *nlh = reinterpret_cast<nlmsghdr *>(buf);
+        while (NLMSG_OK(nlh, remaining)) {
+            if (nlh->nlmsg_type == NLMSG_DONE) {
+                callback(proto, nullptr);
+                return 0;
+            }
+            if (nlh->nlmsg_type == NLMSG_ERROR) {
+                return reinterpret_cast<nlmsgerr *>(NLMSG_DATA(nlh))->error;
+            }
+            // Any other message type is handed to the callback as an inet_diag_msg.
+            inet_diag_msg *msg = reinterpret_cast<inet_diag_msg *>(NLMSG_DATA(nlh));
+            callback(proto, msg);
+            nlh = NLMSG_NEXT(nlh, remaining);
+        }
+    }
+}
+
+// Writes a SOCK_DESTROY request for the socket described by |msg| to mWriteSock. Returns 0 if
+// |msg| is null, -errno if the write is short or fails, else the result of checkError().
+int SockDiag::sockDestroy(uint8_t proto, const inet_diag_msg *msg) {
+    // readDiagMsg() hands its callback a null message at the end of a dump: nothing to destroy.
+    if (msg == nullptr) return 0;
+    DestroyRequest request = {
+        .nlh = { .nlmsg_type = SOCK_DESTROY, .nlmsg_flags = NLM_F_REQUEST },
+        .req = {
+            .sdiag_family = msg->idiag_family,
+            .sdiag_protocol = proto,
+            .idiag_states = static_cast<uint32_t>(1 << msg->idiag_state),
+            .id = msg->id,
+        },
+    };
+    request.nlh.nlmsg_len = sizeof(request);
+
+    if (write(mWriteSock, &request, sizeof(request)) < (ssize_t) sizeof(request)) {
+        return -errno;
+    }
+    return checkError(mWriteSock);
+}