client/server: track and handle command timeouts

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/client.c b/client.c
index 358903b..130aeaf 100644
--- a/client.c
+++ b/client.c
@@ -45,6 +45,8 @@
 	struct flist_head eta_list;
 	struct client_eta *eta_in_flight;
 
+	struct flist_head cmd_list;
+
 	uint16_t argc;
 	char **argv;
 };
@@ -188,6 +190,7 @@
 	INIT_FLIST_HEAD(&client->hash_list);
 	INIT_FLIST_HEAD(&client->arg_list);
 	INIT_FLIST_HEAD(&client->eta_list);
+	INIT_FLIST_HEAD(&client->cmd_list);
 
 	if (fio_server_parse_string(hostname, &client->hostname,
 					&client->is_sock, &client->port,
@@ -266,6 +269,8 @@
 	else
 		fd = fio_client_connect_ip(client);
 
+	dprint(FD_NET, "client: %s connected %d\n", client->hostname, fd);
+
 	if (fd < 0)
 		return 1;
 
@@ -285,7 +290,7 @@
 	flist_for_each(entry, &client_list) {
 		client = flist_entry(entry, struct fio_client, list);
 
-		fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_QUIT, 0);
+		fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_QUIT, 0, NULL);
 	}
 }
 
@@ -314,8 +319,7 @@
 {
 	dprint(FD_NET, "client: send probe\n");
 
-	fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_PROBE, 0);
-	handle_client(client);
+	fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_PROBE, 0, &client->cmd_list);
 }
 
 static int send_client_cmd_line(struct fio_client *client)
@@ -631,6 +635,30 @@
 	}
 }
 
+/*
+ * Match a reply against client->cmd_list. A pending entry matches when
+ * cmd->tag equals the address of that entry; on a hit the entry is unlinked
+ * and freed and its saved_tag is copied back into cmd->tag. On a miss an
+ * error is logged and cmd is left untouched.
+ */
+static void remove_reply_cmd(struct fio_client *client, struct fio_net_cmd *cmd)
+{
+	struct fio_net_int_cmd *icmd;
+	struct flist_head *entry;
+
+	flist_for_each(entry, &client->cmd_list) {
+		icmd = flist_entry(entry, struct fio_net_int_cmd, list);
+		/* uintptr_t matches pointer width; uint64_t does not on 32-bit */
+		if (cmd->tag != (uintptr_t) icmd)
+			continue;
+		flist_del(&icmd->list);
+		cmd->tag = icmd->saved_tag;
+		free(icmd);
+		return;
+	}
+	log_err("fio: client: unable to find matching tag\n");
+}
+
 static void handle_eta(struct fio_client *client, struct fio_net_cmd *cmd)
 {
 	struct jobs_eta *je = (struct jobs_eta *) cmd->payload;
@@ -679,8 +707,8 @@
 	if (!cmd)
 		return 0;
 
-	dprint(FD_NET, "client: got cmd op %d from %s\n",
-					cmd->opcode, client->hostname);
+	dprint(FD_NET, "client: got cmd op %s from %s\n",
+				fio_server_op(cmd->opcode), client->hostname);
 
 	switch (cmd->opcode) {
 	case FIO_NET_CMD_QUIT:
@@ -711,10 +739,12 @@
 		free(cmd);
 		break;
 	case FIO_NET_CMD_ETA:
+		remove_reply_cmd(client, cmd);
 		handle_eta(client, cmd);
 		free(cmd);
 		break;
 	case FIO_NET_CMD_PROBE:
+		remove_reply_cmd(client, cmd);
 		handle_probe(client, cmd);
 		free(cmd);
 		break;
@@ -727,7 +757,7 @@
 		free(cmd);
 		break;
 	default:
-		log_err("fio: unknown client op: %d\n", cmd->opcode);
+		log_err("fio: unknown client op: %s\n", fio_server_op(cmd->opcode));
 		free(cmd);
 		break;
 	}
@@ -760,7 +790,7 @@
 		flist_add_tail(&client->eta_list, &eta_list);
 		client->eta_in_flight = eta;
 		fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_SEND_ETA,
-						(uint64_t) eta);
+					(uint64_t) eta, &client->cmd_list);
 	}
 
 	while (skipped--)
@@ -769,6 +799,55 @@
 	dprint(FD_NET, "client: requested eta tag %p\n", eta);
 }
 
+/*
+ * Expire commands pending for at least FIO_NET_CLIENT_TIMEOUT as of 'now'.
+ * Returns non-zero only if some expired and none are left pending.
+ */
+static int client_check_cmd_timeout(struct fio_client *client, struct timeval *now)
+{
+	struct fio_net_int_cmd *icmd;
+	struct flist_head *pos, *next;
+	int expired = 0;
+
+	flist_for_each_safe(pos, next, &client->cmd_list) {
+		icmd = flist_entry(pos, struct fio_net_int_cmd, list);
+		if (mtime_since(&icmd->tv, now) >= FIO_NET_CLIENT_TIMEOUT) {
+			log_err("fio: client %s, timeout on cmd %s\n",
+				client->hostname, fio_server_op(icmd->cmd.opcode));
+			flist_del(&icmd->list);
+			free(icmd);
+			expired = 1;
+		}
+	}
+	return expired && flist_empty(&client->cmd_list);
+}
+
+/*
+ * Run the command timeout check on each client that has commands pending;
+ * log and remove those that trip it. Returns non-zero if any were removed.
+ */
+static int fio_client_timed_out(void)
+{
+	struct flist_head *pos, *next;
+	struct timeval now;
+	int dropped = 0;
+
+	gettimeofday(&now, NULL);
+
+	flist_for_each_safe(pos, next, &client_list) {
+		struct fio_client *c = flist_entry(pos, struct fio_client, list);
+
+		/* nothing pending, or the timeout check did not fire */
+		if (flist_empty(&c->cmd_list) || !client_check_cmd_timeout(c, &now))
+			continue;
+
+		log_err("fio: client %s timed out\n", c->hostname);
+		remove_client(c);
+		dropped = 1;
+	}
+	return dropped;
+}
+
 int fio_handle_clients(void)
 {
 	struct fio_client *client;
@@ -799,6 +878,9 @@
 			if (mtime_since(&eta_tv, &tv) >= 900) {
 				request_client_etas();
 				memcpy(&eta_tv, &tv, sizeof(tv));
+
+				if (fio_client_timed_out())
+					break;
 			}
 
 			ret = poll(pfds, nr_clients, 100);