libceph: force resend of osd requests if we skip an osdmap
If we skip over one or more map epochs, we need to resend all osd requests
because it is possible they remapped to other servers and then back.
Signed-off-by: Sage Weil <sage@newdream.net>
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 01ceb59..733e4600 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -943,7 +943,7 @@
* Caller should hold map_sem for read and request_mutex.
*/
static int __map_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+ struct ceph_osd_request *req, int force_resend)
{
struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
struct ceph_pg pgid;
@@ -967,7 +967,8 @@
num = err;
}
- if ((req->r_osd && req->r_osd->o_osd == o &&
+ if ((!force_resend &&
+ req->r_osd && req->r_osd->o_osd == o &&
req->r_sent >= req->r_osd->o_incarnation &&
req->r_num_pg_osds == num &&
memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
@@ -1289,18 +1290,18 @@
*
* Caller should hold map_sem for read and request_mutex.
*/
-static void kick_requests(struct ceph_osd_client *osdc)
+static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
{
struct ceph_osd_request *req, *nreq;
struct rb_node *p;
int needmap = 0;
int err;
- dout("kick_requests\n");
+ dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
req = rb_entry(p, struct ceph_osd_request, r_node);
- err = __map_request(osdc, req);
+ err = __map_request(osdc, req, force_resend);
if (err < 0)
continue; /* error */
if (req->r_osd == NULL) {
@@ -1318,7 +1319,7 @@
r_linger_item) {
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
- err = __map_request(osdc, req);
+ err = __map_request(osdc, req, force_resend);
if (err == 0)
continue; /* no change and no osd was specified */
if (err < 0)
@@ -1395,7 +1396,7 @@
ceph_osdmap_destroy(osdc->osdmap);
osdc->osdmap = newmap;
}
- kick_requests(osdc);
+ kick_requests(osdc, 0);
reset_changed_osds(osdc);
} else {
dout("ignoring incremental map %u len %d\n",
@@ -1423,6 +1424,8 @@
"older than our %u\n", epoch, maplen,
osdc->osdmap->epoch);
} else {
+ int skipped_map = 0;
+
dout("taking full map %u len %d\n", epoch, maplen);
newmap = osdmap_decode(&p, p+maplen);
if (IS_ERR(newmap)) {
@@ -1432,9 +1435,12 @@
BUG_ON(!newmap);
oldmap = osdc->osdmap;
osdc->osdmap = newmap;
- if (oldmap)
+ if (oldmap) {
+ if (oldmap->epoch + 1 < newmap->epoch)
+ skipped_map = 1;
ceph_osdmap_destroy(oldmap);
- kick_requests(osdc);
+ }
+ kick_requests(osdc, skipped_map);
}
p += maplen;
nr_maps--;
@@ -1707,7 +1713,7 @@
* the request still han't been touched yet.
*/
if (req->r_sent == 0) {
- rc = __map_request(osdc, req);
+ rc = __map_request(osdc, req, 0);
if (rc < 0) {
if (nofail) {
dout("osdc_start_request failed map, "