ceph: reset osd after relevant messages timed out
This simplifies the process of timing out messages. We
keep an LRU list of the messages currently in flight. If the
timeout has passed, we reset the OSD connection so that the
messages will be retransmitted. This is a failsafe in case
we hit some sort of problem sending out a message to the OSD.
Normally, we'll get notification via an updated osdmap if
there are problems.
If a request is older than the keepalive timeout, send a
keepalive to ensure we detect any breaks in the TCP connection.
Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index f256eba..1b1a3ca 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -36,12 +36,15 @@
void *o_authorizer_buf, *o_authorizer_reply_buf;
size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
unsigned long lru_ttl;
+ int o_marked_for_keepalive;
+ struct list_head o_keepalive_item;
};
/* an in-flight request */
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node;
+ struct list_head r_req_lru_item;
struct list_head r_osd_item;
struct ceph_osd *r_osd;
struct ceph_pg r_pgid;
@@ -67,7 +70,7 @@
char r_oid[40]; /* object name */
int r_oid_len;
- unsigned long r_timeout_stamp;
+ unsigned long r_sent_stamp;
bool r_resend; /* msg send failed, needs retry */
struct ceph_file_layout r_file_layout;
@@ -92,6 +95,7 @@
u64 timeout_tid; /* tid of timeout triggering rq */
u64 last_tid; /* tid of last request */
struct rb_root requests; /* pending requests */
+ struct list_head req_lru; /* pending requests lru */
int num_requests;
struct delayed_work timeout_work;
struct delayed_work osds_timeout_work;