Merge pull request #8666 from dgquintas/rr_fixall
Fixed wrong connectivity status updates for RR
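
In short: the round_robin policy now tracks, per subchannel, the previous and
current connectivity states plus policy-wide counts of subchannels in
TRANSIENT_FAILURE and IDLE, and derives the channel-level state from them in a
fixed priority order (READY > CONNECTING > SHUTDOWN > TRANSIENT_FAILURE > IDLE)
inside update_lb_connectivity_status(). The stand-alone sketch below only
illustrates that decision logic; the enum and struct are simplified stand-ins,
not the real grpc_connectivity_state or round_robin_lb_policy types, and the
real function also reports the result via grpc_connectivity_state_set().

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Simplified stand-in for grpc_connectivity_state (STATE_INIT mirrors the
     * new GRPC_CHANNEL_INIT sentinel). */
    typedef enum {
      STATE_INIT = -1,
      STATE_IDLE,
      STATE_CONNECTING,
      STATE_READY,
      STATE_TRANSIENT_FAILURE,
      STATE_SHUTDOWN
    } conn_state;

    /* Simplified stand-in for the relevant round_robin_lb_policy fields. */
    typedef struct {
      bool ready_list_empty;         /* no subchannel is currently connected */
      size_t num_subchannels;        /* subchannels still owned by the policy */
      size_t num_transient_failures; /* subchannels in TRANSIENT_FAILURE */
      size_t num_idle;               /* subchannels in IDLE */
    } rr_counters;

    /* Priority-ordered aggregation: the first matching rule decides the
     * channel-level state; otherwise the state is left unchanged. */
    static conn_state aggregate_state(const rr_counters *c, conn_state sd_state) {
      if (!c->ready_list_empty) return STATE_READY;              /* 1) any READY */
      if (sd_state == STATE_CONNECTING) return STATE_CONNECTING; /* 2) any CONNECTING */
      if (c->num_subchannels == 0) return STATE_SHUTDOWN;        /* 3) all SHUTDOWN */
      if (c->num_transient_failures == c->num_subchannels)
        return STATE_TRANSIENT_FAILURE;                          /* 4) all TRANSIENT_FAILURE */
      if (c->num_idle == c->num_subchannels) return STATE_IDLE;  /* 5) all IDLE */
      return sd_state;                                           /* no change */
    }

    int main(void) {
      /* 3 subchannels, none connected, all failing => rule 4 applies. */
      rr_counters c = {true, 3, 3, 0};
      printf("%d\n", aggregate_state(&c, STATE_TRANSIENT_FAILURE)); /* prints 3 */
      return 0;
    }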
diff --git a/include/grpc/impl/codegen/connectivity_state.h b/include/grpc/impl/codegen/connectivity_state.h
index 71865d8..c6d1a33 100644
--- a/include/grpc/impl/codegen/connectivity_state.h
+++ b/include/grpc/impl/codegen/connectivity_state.h
@@ -40,6 +40,8 @@
/** Connectivity state of a channel. */
typedef enum {
+ /** channel has just been initialized */
+ GRPC_CHANNEL_INIT = -1,
/** channel is idle */
GRPC_CHANNEL_IDLE,
/** channel is connecting */
diff --git a/src/core/ext/lb_policy/pick_first/pick_first.c b/src/core/ext/lb_policy/pick_first/pick_first.c
index ac3c6a3..39f3b3d 100644
--- a/src/core/ext/lb_policy/pick_first/pick_first.c
+++ b/src/core/ext/lb_policy/pick_first/pick_first.c
@@ -292,6 +292,8 @@
} else {
loop:
switch (p->checking_connectivity) {
+ case GRPC_CHANNEL_INIT:
+ GPR_UNREACHABLE_CODE();
case GRPC_CHANNEL_READY:
grpc_connectivity_state_set(exec_ctx, &p->state_tracker,
GRPC_CHANNEL_READY, GRPC_ERROR_NONE,
diff --git a/src/core/ext/lb_policy/round_robin/round_robin.c b/src/core/ext/lb_policy/round_robin/round_robin.c
index 0fd3abe..acb2f77 100644
--- a/src/core/ext/lb_policy/round_robin/round_robin.c
+++ b/src/core/ext/lb_policy/round_robin/round_robin.c
@@ -116,8 +116,13 @@
grpc_closure connectivity_changed_closure;
/** this subchannels current position in subchannel->ready_list */
ready_list *ready_list_node;
- /** last observed connectivity */
- grpc_connectivity_state connectivity_state;
+ /** last observed connectivity. Not updated by
+ * \a grpc_subchannel_notify_on_state_change. Used to determine the previous
+ * state while processing the new state in \a rr_connectivity_changed */
+ grpc_connectivity_state prev_connectivity_state;
+ /** current connectivity state. Updated by \a
+ * grpc_subchannel_notify_on_state_change */
+ grpc_connectivity_state curr_connectivity_state;
/** the subchannel's target user data */
void *user_data;
/** vtable to operate over \a user_data */
@@ -127,6 +132,7 @@
struct round_robin_lb_policy {
/** base policy: must be first */
grpc_lb_policy base;
+ gpr_mu mu;
/** total number of addresses received at creation time */
size_t num_addresses;
@@ -135,8 +141,11 @@
size_t num_subchannels;
subchannel_data **subchannels;
- /** mutex protecting remaining members */
- gpr_mu mu;
+ /** how many subchannels are in TRANSIENT_FAILURE */
+ size_t num_transient_failures;
+ /** how many subchannels are IDLE */
+ size_t num_idle;
+
/** have we started picking? */
int started_picking;
/** are we shutting down? */
@@ -258,6 +267,10 @@
gpr_free(node);
}
+static bool is_ready_list_empty(round_robin_lb_policy *p) {
+ return p->ready_list.prev == NULL;
+}
+
static void rr_destroy(grpc_exec_ctx *exec_ctx, grpc_lb_policy *pol) {
round_robin_lb_policy *p = (round_robin_lb_policy *)pol;
ready_list *elem;
@@ -268,7 +281,7 @@
for (size_t i = 0; i < p->num_subchannels; i++) {
subchannel_data *sd = p->subchannels[i];
- GRPC_SUBCHANNEL_UNREF(exec_ctx, sd->subchannel, "round_robin_destroy");
+ GRPC_SUBCHANNEL_UNREF(exec_ctx, sd->subchannel, "rr_destroy");
if (sd->user_data != NULL) {
GPR_ASSERT(sd->user_data_vtable != NULL);
sd->user_data_vtable->destroy(sd->user_data);
@@ -381,18 +394,18 @@
size_t i;
p->started_picking = 1;
- if (grpc_lb_round_robin_trace) {
- gpr_log(GPR_DEBUG, "LB_POLICY: p=%p num_subchannels=%" PRIuPTR, (void *)p,
- p->num_subchannels);
- }
-
for (i = 0; i < p->num_subchannels; i++) {
subchannel_data *sd = p->subchannels[i];
- sd->connectivity_state = GRPC_CHANNEL_IDLE;
+ /* use some sentinel value outside of the range of grpc_connectivity_state
+ * to signal an undefined previous state. We won't be referring to this
+ * value again and it'll be overwritten after the first call to
+ * rr_connectivity_changed */
+ sd->prev_connectivity_state = GRPC_CHANNEL_INIT;
+ sd->curr_connectivity_state = GRPC_CHANNEL_IDLE;
+ GRPC_LB_POLICY_WEAK_REF(&p->base, "rr_connectivity");
grpc_subchannel_notify_on_state_change(
exec_ctx, sd->subchannel, p->base.interested_parties,
- &sd->connectivity_state, &sd->connectivity_changed_closure);
- GRPC_LB_POLICY_WEAK_REF(&p->base, "round_robin_connectivity");
+ &sd->curr_connectivity_state, &sd->connectivity_changed_closure);
}
}
@@ -422,7 +435,7 @@
/* readily available, report right away */
*target = GRPC_CONNECTED_SUBCHANNEL_REF(
grpc_subchannel_get_connected_subchannel(selected->subchannel),
- "picked");
+ "rr_picked");
if (user_data != NULL) {
*user_data = selected->user_data;
@@ -453,125 +466,184 @@
}
}
+static void update_state_counters(subchannel_data *sd) {
+ round_robin_lb_policy *p = sd->policy;
+
+ /* update p->num_transient_failures (resp. p->num_idle): if the previous
+ * state was TRANSIENT_FAILURE (resp. IDLE), decrement
+ * p->num_transient_failures (resp. p->num_idle). */
+ if (sd->prev_connectivity_state == GRPC_CHANNEL_TRANSIENT_FAILURE) {
+ GPR_ASSERT(p->num_transient_failures > 0);
+ --p->num_transient_failures;
+ } else if (sd->prev_connectivity_state == GRPC_CHANNEL_IDLE) {
+ GPR_ASSERT(p->num_idle > 0);
+ --p->num_idle;
+ }
+}
+
+/* sd is the subchannel_data associated with the updated subchannel.
+ * shutdown_error will only be used upon policy transition to TRANSIENT_FAILURE
+ * or SHUTDOWN */
+static grpc_connectivity_state update_lb_connectivity_status(
+ grpc_exec_ctx *exec_ctx, subchannel_data *sd, grpc_error *error) {
+ /* In priority order. The first rule to match terminates the search (i.e., if we
+ * are on rule n, all previous rules were unfulfilled).
+ *
+ * 1) RULE: ANY subchannel is READY => policy is READY.
+ * CHECK: At least one subchannel is ready iff p->ready_list is NOT empty.
+ *
+ * 2) RULE: ANY subchannel is CONNECTING => policy is CONNECTING.
+ * CHECK: sd->curr_connectivity_state == CONNECTING.
+ *
+ * 3) RULE: ALL subchannels are SHUTDOWN => policy is SHUTDOWN.
+ * CHECK: p->num_subchannels == 0.
+ *
+ * 4) RULE: ALL subchannels are TRANSIENT_FAILURE => policy is
+ * TRANSIENT_FAILURE.
+ * CHECK: p->num_transient_failures == p->num_subchannels.
+ *
+ * 5) RULE: ALL subchannels are IDLE => policy is IDLE.
+ * CHECK: p->num_idle == p->num_subchannels.
+ */
+ round_robin_lb_policy *p = sd->policy;
+ if (!is_ready_list_empty(p)) { /* 1) READY */
+ grpc_connectivity_state_set(exec_ctx, &p->state_tracker, GRPC_CHANNEL_READY,
+ GRPC_ERROR_NONE, "rr_ready");
+ return GRPC_CHANNEL_READY;
+ } else if (sd->curr_connectivity_state ==
+ GRPC_CHANNEL_CONNECTING) { /* 2) CONNECTING */
+ grpc_connectivity_state_set(exec_ctx, &p->state_tracker,
+ GRPC_CHANNEL_CONNECTING, GRPC_ERROR_NONE,
+ "rr_connecting");
+ return GRPC_CHANNEL_CONNECTING;
+ } else if (p->num_subchannels == 0) { /* 3) SHUTDOWN */
+ grpc_connectivity_state_set(exec_ctx, &p->state_tracker,
+ GRPC_CHANNEL_SHUTDOWN, GRPC_ERROR_REF(error),
+ "rr_shutdown");
+ return GRPC_CHANNEL_SHUTDOWN;
+ } else if (p->num_transient_failures ==
+ p->num_subchannels) { /* 4) TRANSIENT_FAILURE */
+ grpc_connectivity_state_set(exec_ctx, &p->state_tracker,
+ GRPC_CHANNEL_TRANSIENT_FAILURE,
+ GRPC_ERROR_REF(error), "rr_transient_failure");
+ return GRPC_CHANNEL_TRANSIENT_FAILURE;
+ } else if (p->num_idle == p->num_subchannels) { /* 5) IDLE */
+ grpc_connectivity_state_set(exec_ctx, &p->state_tracker, GRPC_CHANNEL_IDLE,
+ GRPC_ERROR_NONE, "rr_idle");
+ return GRPC_CHANNEL_IDLE;
+ }
+ /* no change */
+ return sd->curr_connectivity_state;
+}
+
static void rr_connectivity_changed(grpc_exec_ctx *exec_ctx, void *arg,
grpc_error *error) {
subchannel_data *sd = arg;
round_robin_lb_policy *p = sd->policy;
pending_pick *pp;
- int unref = 0;
-
GRPC_ERROR_REF(error);
gpr_mu_lock(&p->mu);
if (p->shutdown) {
- unref = 1;
- } else {
- switch (sd->connectivity_state) {
- case GRPC_CHANNEL_READY:
- grpc_connectivity_state_set(exec_ctx, &p->state_tracker,
- GRPC_CHANNEL_READY, GRPC_ERROR_REF(error),
- "connecting_ready");
- /* add the newly connected subchannel to the list of connected ones.
- * Note that it goes to the "end of the line". */
- sd->ready_list_node = add_connected_sc_locked(p, sd);
- /* at this point we know there's at least one suitable subchannel. Go
- * ahead and pick one and notify the pending suitors in
- * p->pending_picks. This preemtively replicates rr_pick()'s actions. */
- ready_list *selected = peek_next_connected_locked(p);
- GPR_ASSERT(selected != NULL);
- if (p->pending_picks != NULL) {
- /* if the selected subchannel is going to be used for the pending
- * picks, update the last picked pointer */
- advance_last_picked_locked(p);
+ gpr_mu_unlock(&p->mu);
+ GRPC_LB_POLICY_WEAK_UNREF(exec_ctx, &p->base, "rr_connectivity");
+ GRPC_ERROR_UNREF(error);
+ return;
+ }
+ switch (sd->curr_connectivity_state) {
+ case GRPC_CHANNEL_INIT:
+ GPR_UNREACHABLE_CODE();
+ case GRPC_CHANNEL_READY:
+ /* add the newly connected subchannel to the list of connected ones.
+ * Note that it goes to the "end of the line". */
+ sd->ready_list_node = add_connected_sc_locked(p, sd);
+ /* at this point we know there's at least one suitable subchannel. Go
+ * ahead and pick one and notify the pending suitors in
+ * p->pending_picks. This preemptively replicates rr_pick()'s actions. */
+ ready_list *selected = peek_next_connected_locked(p);
+ GPR_ASSERT(selected != NULL);
+ if (p->pending_picks != NULL) {
+ /* if the selected subchannel is going to be used for the pending
+ * picks, update the last picked pointer */
+ advance_last_picked_locked(p);
+ }
+ while ((pp = p->pending_picks)) {
+ p->pending_picks = pp->next;
+ *pp->target = GRPC_CONNECTED_SUBCHANNEL_REF(
+ grpc_subchannel_get_connected_subchannel(selected->subchannel),
+ "rr_picked");
+ if (pp->user_data != NULL) {
+ *pp->user_data = selected->user_data;
}
-
+ if (grpc_lb_round_robin_trace) {
+ gpr_log(GPR_DEBUG,
+ "[RR CONN CHANGED] TARGET <-- SUBCHANNEL %p (NODE %p)",
+ (void *)selected->subchannel, (void *)selected);
+ }
+ grpc_exec_ctx_sched(exec_ctx, pp->on_complete, GRPC_ERROR_NONE, NULL);
+ gpr_free(pp);
+ }
+ update_lb_connectivity_status(exec_ctx, sd, error);
+ sd->prev_connectivity_state = sd->curr_connectivity_state;
+ /* renew notification: reuses the "rr_connectivity" weak ref */
+ grpc_subchannel_notify_on_state_change(
+ exec_ctx, sd->subchannel, p->base.interested_parties,
+ &sd->curr_connectivity_state, &sd->connectivity_changed_closure);
+ break;
+ case GRPC_CHANNEL_IDLE:
+ ++p->num_idle;
+ /* fallthrough */
+ case GRPC_CHANNEL_CONNECTING:
+ update_state_counters(sd);
+ update_lb_connectivity_status(exec_ctx, sd, error);
+ sd->prev_connectivity_state = sd->curr_connectivity_state;
+ /* renew notification: reuses the "rr_connectivity" weak ref */
+ grpc_subchannel_notify_on_state_change(
+ exec_ctx, sd->subchannel, p->base.interested_parties,
+ &sd->curr_connectivity_state, &sd->connectivity_changed_closure);
+ break;
+ case GRPC_CHANNEL_TRANSIENT_FAILURE:
+ ++p->num_transient_failures;
+ /* remove from ready list if still present */
+ if (sd->ready_list_node != NULL) {
+ remove_disconnected_sc_locked(p, sd->ready_list_node);
+ sd->ready_list_node = NULL;
+ }
+ update_lb_connectivity_status(exec_ctx, sd, error);
+ sd->prev_connectivity_state = sd->curr_connectivity_state;
+ /* renew notification: reuses the "rr_connectivity" weak ref */
+ grpc_subchannel_notify_on_state_change(
+ exec_ctx, sd->subchannel, p->base.interested_parties,
+ &sd->curr_connectivity_state, &sd->connectivity_changed_closure);
+ break;
+ case GRPC_CHANNEL_SHUTDOWN:
+ update_state_counters(sd);
+ if (sd->ready_list_node != NULL) {
+ remove_disconnected_sc_locked(p, sd->ready_list_node);
+ sd->ready_list_node = NULL;
+ }
+ --p->num_subchannels;
+ GPR_SWAP(subchannel_data *, p->subchannels[sd->index],
+ p->subchannels[p->num_subchannels]);
+ GRPC_SUBCHANNEL_UNREF(exec_ctx, sd->subchannel, "rr_subchannel_shutdown");
+ p->subchannels[sd->index]->index = sd->index;
+ if (update_lb_connectivity_status(exec_ctx, sd, error) ==
+ GRPC_CHANNEL_SHUTDOWN) {
+ /* the policy is shutting down. Flush all the pending picks... */
while ((pp = p->pending_picks)) {
p->pending_picks = pp->next;
-
- *pp->target = GRPC_CONNECTED_SUBCHANNEL_REF(
- grpc_subchannel_get_connected_subchannel(selected->subchannel),
- "picked");
- if (pp->user_data != NULL) {
- *pp->user_data = selected->user_data;
- }
- if (grpc_lb_round_robin_trace) {
- gpr_log(GPR_DEBUG,
- "[RR CONN CHANGED] TARGET <-- SUBCHANNEL %p (NODE %p)",
- (void *)selected->subchannel, (void *)selected);
- }
+ *pp->target = NULL;
grpc_exec_ctx_sched(exec_ctx, pp->on_complete, GRPC_ERROR_NONE, NULL);
gpr_free(pp);
}
- grpc_subchannel_notify_on_state_change(
- exec_ctx, sd->subchannel, p->base.interested_parties,
- &sd->connectivity_state, &sd->connectivity_changed_closure);
- break;
- case GRPC_CHANNEL_CONNECTING:
- case GRPC_CHANNEL_IDLE:
- grpc_connectivity_state_set(
- exec_ctx, &p->state_tracker, sd->connectivity_state,
- GRPC_ERROR_REF(error), "connecting_changed");
- grpc_subchannel_notify_on_state_change(
- exec_ctx, sd->subchannel, p->base.interested_parties,
- &sd->connectivity_state, &sd->connectivity_changed_closure);
- break;
- case GRPC_CHANNEL_TRANSIENT_FAILURE:
- /* renew state notification */
- grpc_subchannel_notify_on_state_change(
- exec_ctx, sd->subchannel, p->base.interested_parties,
- &sd->connectivity_state, &sd->connectivity_changed_closure);
-
- /* remove from ready list if still present */
- if (sd->ready_list_node != NULL) {
- remove_disconnected_sc_locked(p, sd->ready_list_node);
- sd->ready_list_node = NULL;
- }
- grpc_connectivity_state_set(
- exec_ctx, &p->state_tracker, GRPC_CHANNEL_TRANSIENT_FAILURE,
- GRPC_ERROR_REF(error), "connecting_transient_failure");
- break;
- case GRPC_CHANNEL_SHUTDOWN:
- if (sd->ready_list_node != NULL) {
- remove_disconnected_sc_locked(p, sd->ready_list_node);
- sd->ready_list_node = NULL;
- }
-
- p->num_subchannels--;
- GPR_SWAP(subchannel_data *, p->subchannels[sd->index],
- p->subchannels[p->num_subchannels]);
- GRPC_SUBCHANNEL_UNREF(exec_ctx, sd->subchannel, "round_robin");
- p->subchannels[sd->index]->index = sd->index;
- gpr_free(sd);
-
- unref = 1;
- if (p->num_subchannels == 0) {
- grpc_connectivity_state_set(
- exec_ctx, &p->state_tracker, GRPC_CHANNEL_SHUTDOWN,
- GRPC_ERROR_CREATE_REFERENCING("Round Robin Channels Exhausted",
- &error, 1),
- "no_more_channels");
- while ((pp = p->pending_picks)) {
- p->pending_picks = pp->next;
- *pp->target = NULL;
- grpc_exec_ctx_sched(exec_ctx, pp->on_complete, GRPC_ERROR_NONE,
- NULL);
- gpr_free(pp);
- }
- } else {
- grpc_connectivity_state_set(
- exec_ctx, &p->state_tracker, GRPC_CHANNEL_TRANSIENT_FAILURE,
- GRPC_ERROR_REF(error), "subchannel_failed");
- }
- } /* switch */
- } /* !unref */
-
- gpr_mu_unlock(&p->mu);
-
- if (unref) {
- GRPC_LB_POLICY_WEAK_UNREF(exec_ctx, &p->base, "round_robin_connectivity");
+ }
+ gpr_free(sd);
+ /* unref the "rr_connectivity" weak ref from start_picking */
+ GRPC_LB_POLICY_WEAK_UNREF(exec_ctx, &p->base, "rr_connectivity");
+ break;
}
-
+ gpr_mu_unlock(&p->mu);
GRPC_ERROR_UNREF(error);
}
@@ -607,9 +679,9 @@
gpr_mu_unlock(&p->mu);
target = GRPC_CONNECTED_SUBCHANNEL_REF(
grpc_subchannel_get_connected_subchannel(selected->subchannel),
- "picked");
+ "rr_picked");
grpc_connected_subchannel_ping(exec_ctx, target, closure);
- GRPC_CONNECTED_SUBCHANNEL_UNREF(exec_ctx, target, "picked");
+ GRPC_CONNECTED_SUBCHANNEL_UNREF(exec_ctx, target, "rr_picked");
} else {
gpr_mu_unlock(&p->mu);
grpc_exec_ctx_sched(exec_ctx, closure,
@@ -705,6 +777,11 @@
grpc_lb_policy_init(&p->base, &round_robin_lb_policy_vtable);
grpc_connectivity_state_init(&p->state_tracker, GRPC_CHANNEL_IDLE,
"round_robin");
+
+ if (grpc_lb_round_robin_trace) {
+ gpr_log(GPR_DEBUG, "Created RR policy at %p with %lu subchannels",
+ (void *)p, (unsigned long)p->num_subchannels);
+ }
gpr_mu_init(&p->mu);
return &p->base;
}
diff --git a/src/core/lib/transport/connectivity_state.c b/src/core/lib/transport/connectivity_state.c
index fdb5307..8907287 100644
--- a/src/core/lib/transport/connectivity_state.c
+++ b/src/core/lib/transport/connectivity_state.c
@@ -43,6 +43,8 @@
const char *grpc_connectivity_state_name(grpc_connectivity_state state) {
switch (state) {
+ case GRPC_CHANNEL_INIT:
+ return "INIT";
case GRPC_CHANNEL_IDLE:
return "IDLE";
case GRPC_CHANNEL_CONNECTING:
@@ -159,6 +161,7 @@
grpc_error_free_string(error_string);
}
switch (state) {
+ case GRPC_CHANNEL_INIT:
case GRPC_CHANNEL_CONNECTING:
case GRPC_CHANNEL_IDLE:
case GRPC_CHANNEL_READY:
diff --git a/test/core/client_channel/lb_policies_test.c b/test/core/client_channel/lb_policies_test.c
index 7b1c47e..c373505 100644
--- a/test/core/client_channel/lb_policies_test.c
+++ b/test/core/client_channel/lb_policies_test.c
@@ -62,8 +62,14 @@
grpc_metadata_array *request_metadata_recv;
} servers_fixture;
+typedef struct request_sequences {
+ size_t n;
+ int *connections;
+ int *connectivity_states;
+} request_sequences;
+
typedef void (*verifier_fn)(const servers_fixture *, grpc_channel *,
- const int *, const size_t);
+ const request_sequences *, const size_t);
typedef struct test_spec {
size_t num_iters;
@@ -227,9 +233,24 @@
gpr_free(f);
}
+static request_sequences request_sequences_create(size_t n) {
+ request_sequences res;
+ res.n = n;
+ res.connections = gpr_malloc(sizeof(*res.connections) * n);
+ res.connectivity_states = gpr_malloc(sizeof(*res.connectivity_states) * n);
+ return res;
+}
+
+static void request_sequences_destroy(const request_sequences *rseqs) {
+ gpr_free(rseqs->connections);
+ gpr_free(rseqs->connectivity_states);
+}
+
/** Returns connection sequence (server indices), which must be freed */
-static int *perform_request(servers_fixture *f, grpc_channel *client,
- request_data *rdata, const test_spec *spec) {
+static request_sequences perform_request(servers_fixture *f,
+ grpc_channel *client,
+ request_data *rdata,
+ const test_spec *spec) {
grpc_call *c;
int s_idx;
int *s_valid;
@@ -239,11 +260,10 @@
size_t i, iter_num;
grpc_event ev;
int read_tag;
- int *connection_sequence;
int completed_client;
+ const request_sequences sequences = request_sequences_create(spec->num_iters);
s_valid = gpr_malloc(sizeof(int) * f->num_servers);
- connection_sequence = gpr_malloc(sizeof(int) * spec->num_iters);
for (iter_num = 0; iter_num < spec->num_iters; iter_num++) {
cq_verifier *cqv = cq_verifier_create(f->cq);
@@ -260,7 +280,7 @@
}
}
- connection_sequence[iter_num] = -1;
+ sequences.connections[iter_num] = -1;
grpc_metadata_array_init(&rdata->initial_metadata_recv);
grpc_metadata_array_init(&rdata->trailing_metadata_recv);
@@ -305,12 +325,14 @@
grpc_call_start_batch(c, ops, (size_t)(op - ops), tag(1), NULL));
s_idx = -1;
- while (
- (ev = grpc_completion_queue_next(
- f->cq, GRPC_TIMEOUT_MILLIS_TO_DEADLINE(10 * RETRY_TIMEOUT), NULL))
- .type != GRPC_QUEUE_TIMEOUT) {
+ while ((ev = grpc_completion_queue_next(
+ f->cq, GRPC_TIMEOUT_MILLIS_TO_DEADLINE(RETRY_TIMEOUT), NULL))
+ .type != GRPC_QUEUE_TIMEOUT) {
GPR_ASSERT(ev.type == GRPC_OP_COMPLETE);
read_tag = ((int)(intptr_t)ev.tag);
+ const grpc_connectivity_state conn_state =
+ grpc_channel_check_connectivity_state(client, 0);
+ sequences.connectivity_states[iter_num] = conn_state;
gpr_log(GPR_DEBUG, "EVENT: success:%d, type:%d, tag:%d iter:%" PRIuPTR,
ev.success, ev.type, read_tag, iter_num);
if (ev.success && read_tag >= 1000) {
@@ -318,7 +340,7 @@
/* only server notifications for non-shutdown events */
s_idx = read_tag - 1000;
s_valid[s_idx] = 1;
- connection_sequence[iter_num] = s_idx;
+ sequences.connections[iter_num] = s_idx;
break;
} else if (read_tag == 1) {
gpr_log(GPR_DEBUG, "client timed out");
@@ -381,10 +403,9 @@
}
}
- GPR_ASSERT(
- grpc_completion_queue_next(
- f->cq, GRPC_TIMEOUT_MILLIS_TO_DEADLINE(2 * RETRY_TIMEOUT), NULL)
- .type == GRPC_QUEUE_TIMEOUT);
+ GPR_ASSERT(grpc_completion_queue_next(
+ f->cq, GRPC_TIMEOUT_MILLIS_TO_DEADLINE(RETRY_TIMEOUT), NULL)
+ .type == GRPC_QUEUE_TIMEOUT);
grpc_metadata_array_destroy(&rdata->initial_metadata_recv);
grpc_metadata_array_destroy(&rdata->trailing_metadata_recv);
@@ -401,7 +422,7 @@
gpr_free(s_valid);
- return connection_sequence;
+ return sequences;
}
static grpc_call **perform_multirequest(servers_fixture *f,
@@ -441,62 +462,10 @@
return calls;
}
-static void assert_channel_connectivity(grpc_channel *ch,
- size_t num_accepted_conn_states,
- int accepted_conn_state, ...) {
- size_t i;
- grpc_channel_stack *client_stack;
- grpc_channel_element *client_channel_filter;
- grpc_connectivity_state actual_conn_state;
- grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
- va_list ap;
-
- client_stack = grpc_channel_get_channel_stack(ch);
- client_channel_filter = grpc_channel_stack_last_element(client_stack);
-
- actual_conn_state = grpc_client_channel_check_connectivity_state(
- &exec_ctx, client_channel_filter, 0 /* don't try to connect */);
- grpc_exec_ctx_finish(&exec_ctx);
- va_start(ap, accepted_conn_state);
- for (i = 0; i < num_accepted_conn_states; i++) {
- if ((int)actual_conn_state == accepted_conn_state) {
- break;
- }
- accepted_conn_state = va_arg(ap, grpc_connectivity_state);
- }
- va_end(ap);
- if (i == num_accepted_conn_states) {
- char **accepted_strs =
- gpr_malloc(sizeof(char *) * num_accepted_conn_states);
- char *accepted_str_joined;
- va_start(ap, accepted_conn_state);
- for (i = 0; i < num_accepted_conn_states; i++) {
- GPR_ASSERT(gpr_asprintf(&accepted_strs[i], "%d", accepted_conn_state) >
- 0);
- accepted_conn_state = va_arg(ap, grpc_connectivity_state);
- }
- va_end(ap);
- accepted_str_joined = gpr_strjoin_sep((const char **)accepted_strs,
- num_accepted_conn_states, ", ", NULL);
- gpr_log(
- GPR_ERROR,
- "Channel connectivity assertion failed: expected <one of [%s]>, got %d",
- accepted_str_joined, actual_conn_state);
-
- for (i = 0; i < num_accepted_conn_states; i++) {
- gpr_free(accepted_strs[i]);
- }
- gpr_free(accepted_strs);
- gpr_free(accepted_str_joined);
- abort();
- }
-}
-
void run_spec(const test_spec *spec) {
grpc_channel *client;
char *client_hostport;
char *servers_hostports_str;
- int *actual_connection_sequence;
request_data rdata;
servers_fixture *f;
grpc_channel_args args;
@@ -524,14 +493,14 @@
gpr_log(GPR_INFO, "Testing '%s' with servers=%s client=%s", spec->description,
servers_hostports_str, client_hostport);
- actual_connection_sequence = perform_request(f, client, &rdata, spec);
+ const request_sequences sequences = perform_request(f, client, &rdata, spec);
- spec->verifier(f, client, actual_connection_sequence, spec->num_iters);
+ spec->verifier(f, client, &sequences, spec->num_iters);
gpr_free(client_hostport);
gpr_free(servers_hostports_str);
- gpr_free(actual_connection_sequence);
gpr_free(rdata.call_details);
+ request_sequences_destroy(&sequences);
grpc_channel_destroy(client); /* calls the LB's shutdown func */
teardown_servers(f);
@@ -684,29 +653,43 @@
static void verify_vanilla_round_robin(const servers_fixture *f,
grpc_channel *client,
- const int *actual_connection_sequence,
+ const request_sequences *sequences,
const size_t num_iters) {
- int *expected_connection_sequence;
- size_t i;
const size_t expected_seq_length = f->num_servers;
/* verify conn. seq. expectation */
/* get the first sequence of "num_servers" elements */
- expected_connection_sequence = gpr_malloc(sizeof(int) * expected_seq_length);
- memcpy(expected_connection_sequence, actual_connection_sequence,
+ int *expected_connection_sequence =
+ gpr_malloc(sizeof(int) * expected_seq_length);
+ memcpy(expected_connection_sequence, sequences->connections,
sizeof(int) * expected_seq_length);
- for (i = 0; i < num_iters; i++) {
- const int actual = actual_connection_sequence[i];
+ for (size_t i = 0; i < num_iters; i++) {
+ const int actual = sequences->connections[i];
const int expected = expected_connection_sequence[i % expected_seq_length];
if (actual != expected) {
- print_failed_expectations(expected_connection_sequence,
- actual_connection_sequence, expected_seq_length,
- num_iters);
+ gpr_log(
+ GPR_ERROR,
+ "CONNECTION SEQUENCE FAILURE: expected %d, got %d at iteration #%d",
+ expected, actual, (int)i);
abort();
}
}
- assert_channel_connectivity(client, 1, GRPC_CHANNEL_READY);
+
+ /* All servers are available, therefore all client subchannels are READY, even
+ * though only one is needed for the client channel state to be READY */
+ for (size_t i = 0; i < sequences->n; i++) {
+ const grpc_connectivity_state actual = sequences->connectivity_states[i];
+ const grpc_connectivity_state expected = GRPC_CHANNEL_READY;
+ if (actual != expected) {
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), (int)i);
+ abort();
+ }
+ }
gpr_free(expected_connection_sequence);
}
@@ -715,7 +698,7 @@
* given in "f") are killed */
static void verify_vanishing_floor_round_robin(
const servers_fixture *f, grpc_channel *client,
- const int *actual_connection_sequence, const size_t num_iters) {
+ const request_sequences *sequences, const size_t num_iters) {
int *expected_connection_sequence;
const size_t expected_seq_length = 2;
size_t i;
@@ -723,57 +706,83 @@
/* verify conn. seq. expectation */
/* copy the first full sequence (without -1s) */
expected_connection_sequence = gpr_malloc(sizeof(int) * expected_seq_length);
- memcpy(expected_connection_sequence, actual_connection_sequence + 2,
+ memcpy(expected_connection_sequence, sequences->connections + 2,
expected_seq_length * sizeof(int));
/* first two elements of the sequence should be [0 (1st server), -1 (failure)]
*/
- GPR_ASSERT(actual_connection_sequence[0] == 0);
- GPR_ASSERT(actual_connection_sequence[1] == -1);
+ GPR_ASSERT(sequences->connections[0] == 0);
+ GPR_ASSERT(sequences->connections[1] == -1);
/* the next two element must be [3, 0], repeating from that point: the 3 is
* brought forth by servers 1 and 2 disappearing after the initial pick of 0 */
- GPR_ASSERT(actual_connection_sequence[2] == 3);
- GPR_ASSERT(actual_connection_sequence[3] == 0);
+ GPR_ASSERT(sequences->connections[2] == 3);
+ GPR_ASSERT(sequences->connections[3] == 0);
/* make sure that the expectation obliges */
for (i = 2; i < num_iters; i++) {
- const int actual = actual_connection_sequence[i];
+ const int actual = sequences->connections[i];
const int expected = expected_connection_sequence[i % expected_seq_length];
if (actual != expected) {
print_failed_expectations(expected_connection_sequence,
- actual_connection_sequence, expected_seq_length,
+ sequences->connections, expected_seq_length,
num_iters);
abort();
}
}
- gpr_free(expected_connection_sequence);
-}
-static void verify_total_carnage_round_robin(
- const servers_fixture *f, grpc_channel *client,
- const int *actual_connection_sequence, const size_t num_iters) {
- size_t i;
-
- for (i = 0; i < num_iters; i++) {
- const int actual = actual_connection_sequence[i];
- const int expected = -1;
+ /* There's always at least one subchannel READY (connected), therefore the
+ * overall state of the client channel is READY at all times. */
+ for (i = 0; i < sequences->n; i++) {
+ const grpc_connectivity_state actual = sequences->connectivity_states[i];
+ const grpc_connectivity_state expected = GRPC_CHANNEL_READY;
if (actual != expected) {
- gpr_log(GPR_ERROR, "FAILURE: expected %d, actual %d at iter %" PRIuPTR,
- expected, actual, i);
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), (int)i);
abort();
}
}
- /* even though we know all the servers are dead, the client is still trying
- * retrying, believing it's in a transient failure situation */
- assert_channel_connectivity(client, 2, GRPC_CHANNEL_TRANSIENT_FAILURE,
- GRPC_CHANNEL_CONNECTING);
+ gpr_free(expected_connection_sequence);
+}
+
+static void verify_total_carnage_round_robin(const servers_fixture *f,
+ grpc_channel *client,
+ const request_sequences *sequences,
+ const size_t num_iters) {
+ for (size_t i = 0; i < num_iters; i++) {
+ const int actual = sequences->connections[i];
+ const int expected = -1;
+ if (actual != expected) {
+ gpr_log(
+ GPR_ERROR,
+ "CONNECTION SEQUENCE FAILURE: expected %d, got %d at iteration #%d",
+ expected, actual, (int)i);
+ abort();
+ }
+ }
+
+ /* no server is ever available. The persistent state is TRANSIENT_FAILURE */
+ for (size_t i = 0; i < sequences->n; i++) {
+ const grpc_connectivity_state actual = sequences->connectivity_states[i];
+ const grpc_connectivity_state expected = GRPC_CHANNEL_TRANSIENT_FAILURE;
+ if (actual != expected) {
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), (int)i);
+ abort();
+ }
+ }
}
static void verify_partial_carnage_round_robin(
const servers_fixture *f, grpc_channel *client,
- const int *actual_connection_sequence, const size_t num_iters) {
+ const request_sequences *sequences, const size_t num_iters) {
int *expected_connection_sequence;
size_t i;
const size_t expected_seq_length = f->num_servers;
@@ -781,15 +790,15 @@
/* verify conn. seq. expectation */
/* get the first sequence of "num_servers" elements */
expected_connection_sequence = gpr_malloc(sizeof(int) * expected_seq_length);
- memcpy(expected_connection_sequence, actual_connection_sequence,
+ memcpy(expected_connection_sequence, sequences->connections,
sizeof(int) * expected_seq_length);
for (i = 0; i < num_iters / 2; i++) {
- const int actual = actual_connection_sequence[i];
+ const int actual = sequences->connections[i];
const int expected = expected_connection_sequence[i % expected_seq_length];
if (actual != expected) {
print_failed_expectations(expected_connection_sequence,
- actual_connection_sequence, expected_seq_length,
+ sequences->connections, expected_seq_length,
num_iters);
abort();
}
@@ -797,13 +806,34 @@
/* second half of the iterations go without response */
for (; i < num_iters; i++) {
- GPR_ASSERT(actual_connection_sequence[i] == -1);
+ GPR_ASSERT(sequences->connections[i] == -1);
}
- /* even though we know all the servers are dead, the client is still trying
- * retrying, believing it's in a transient failure situation */
- assert_channel_connectivity(client, 2, GRPC_CHANNEL_TRANSIENT_FAILURE,
- GRPC_CHANNEL_CONNECTING);
+ /* We can assert that the first client channel state should be READY, when all
+ * servers were available; and that the last one should be TRANSIENT_FAILURE,
+ * after all servers are gone. */
+ grpc_connectivity_state actual = sequences->connectivity_states[0];
+ grpc_connectivity_state expected = GRPC_CHANNEL_READY;
+ if (actual != expected) {
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), 0);
+ abort();
+ }
+
+ actual = sequences->connectivity_states[num_iters - 1];
+ expected = GRPC_CHANNEL_TRANSIENT_FAILURE;
+ if (actual != expected) {
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), (int)num_iters - 1);
+ abort();
+ }
+
gpr_free(expected_connection_sequence);
}
@@ -826,15 +856,14 @@
static void verify_rebirth_round_robin(const servers_fixture *f,
grpc_channel *client,
- const int *actual_connection_sequence,
+ const request_sequences *sequences,
const size_t num_iters) {
int *expected_connection_sequence;
size_t i, j, unique_seq_last_idx, unique_seq_first_idx;
const size_t expected_seq_length = f->num_servers;
int *seen_elements;
- dump_array("actual_connection_sequence", actual_connection_sequence,
- num_iters);
+ dump_array("actual_connection_sequence", sequences->connections, num_iters);
/* verify conn. seq. expectation */
/* get the first unique run of length "num_servers". */
@@ -845,13 +874,13 @@
memset(seen_elements, 0, sizeof(int) * expected_seq_length);
for (i = 0; i < num_iters; i++) {
- if (actual_connection_sequence[i] < 0 ||
- seen_elements[actual_connection_sequence[i]] != 0) {
+ if (sequences->connections[i] < 0 ||
+ seen_elements[sequences->connections[i]] != 0) {
/* if anything breaks the uniqueness of the run, back to square zero */
memset(seen_elements, 0, sizeof(int) * expected_seq_length);
continue;
}
- seen_elements[actual_connection_sequence[i]] = 1;
+ seen_elements[sequences->connections[i]] = 1;
for (j = 0; j < expected_seq_length; j++) {
if (seen_elements[j] == 0) break;
}
@@ -870,30 +899,72 @@
unique_seq_first_idx = (unique_seq_last_idx - expected_seq_length + 1);
memcpy(expected_connection_sequence,
- actual_connection_sequence + unique_seq_first_idx,
+ sequences->connections + unique_seq_first_idx,
sizeof(int) * expected_seq_length);
/* first iteration succeeds */
- GPR_ASSERT(actual_connection_sequence[0] != -1);
+ GPR_ASSERT(sequences->connections[0] != -1);
/* then we fail for a while... */
- GPR_ASSERT(actual_connection_sequence[1] == -1);
+ GPR_ASSERT(sequences->connections[1] == -1);
/* ... but should be up at "unique_seq_first_idx" */
- GPR_ASSERT(actual_connection_sequence[unique_seq_first_idx] != -1);
+ GPR_ASSERT(sequences->connections[unique_seq_first_idx] != -1);
for (j = 0, i = unique_seq_first_idx; i < num_iters; i++) {
- const int actual = actual_connection_sequence[i];
+ const int actual = sequences->connections[i];
const int expected =
expected_connection_sequence[j++ % expected_seq_length];
if (actual != expected) {
print_failed_expectations(expected_connection_sequence,
- actual_connection_sequence, expected_seq_length,
+ sequences->connections, expected_seq_length,
num_iters);
abort();
}
}
- /* things are fine once the servers are brought back up */
- assert_channel_connectivity(client, 1, GRPC_CHANNEL_READY);
+ /* We can assert that the first client channel state should be READY, when all
+ * servers were available; same thing for the last one. In the middle
+ * somewhere there must exist at least one TRANSIENT_FAILURE */
+ grpc_connectivity_state actual = sequences->connectivity_states[0];
+ grpc_connectivity_state expected = GRPC_CHANNEL_READY;
+ if (actual != expected) {
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), 0);
+ abort();
+ }
+
+ actual = sequences->connectivity_states[num_iters - 1];
+ expected = GRPC_CHANNEL_READY;
+ if (actual != expected) {
+ gpr_log(GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: expected '%s', got '%s' "
+ "at iteration #%d",
+ grpc_connectivity_state_name(expected),
+ grpc_connectivity_state_name(actual), (int)num_iters - 1);
+ abort();
+ }
+
+ bool found_failure_status = false;
+ for (i = 1; i < sequences->n - 1; i++) {
+ if (sequences->connectivity_states[i] == GRPC_CHANNEL_TRANSIENT_FAILURE) {
+ found_failure_status = true;
+ break;
+ }
+ }
+ if (!found_failure_status) {
+ gpr_log(
+ GPR_ERROR,
+ "CONNECTIVITY STATUS SEQUENCE FAILURE: "
+ "GRPC_CHANNEL_TRANSIENT_FAILURE status not found. Got the following "
+ "instead:");
+ for (i = 0; i < num_iters; i++) {
+ gpr_log(GPR_ERROR, "[%d]: %s", (int)i,
+ grpc_connectivity_state_name(sequences->connectivity_states[i]));
+ }
+ }
+
gpr_free(expected_connection_sequence);
gpr_free(seen_elements);
}
@@ -934,7 +1005,7 @@
* This should knock down the server bound to be selected next */
test_spec_reset(spec);
spec->verifier = verify_vanishing_floor_round_robin;
- spec->description = "test_kill_all_server_at_2nd_iteration";
+ spec->description = "test_kill_middle_servers_at_2nd_iteration";
for (i = 1; i < NUM_SERVERS - 1; i++) {
spec->kill_at[1][i] = 1;
}