Orangefs: Clean up pvfs2_devreq_read.

* Kick invalid arguments out early, so handling them does not clutter
  the code.
* Avoid possibility of race by not releasing lock until completely done.
* Do not leak ops (memory) in certain error condition.
* Check for more error conditions.
* Put module name in all error and debug logs.
* Document behavior.

Signed-off-by: Martin Brandenburg <martin@omnibond.com>
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
diff --git a/fs/orangefs/devpvfs2-req.c b/fs/orangefs/devpvfs2-req.c
index 34e2240..e37b647 100644
--- a/fs/orangefs/devpvfs2-req.c
+++ b/fs/orangefs/devpvfs2-req.c
@@ -104,110 +104,137 @@
 				 char __user *buf,
 				 size_t count, loff_t *offset)
 {
-	int ret = 0;
-	ssize_t len = 0;
-	struct pvfs2_kernel_op_s *cur_op = NULL;
-	static __s32 magic = PVFS2_DEVREQ_MAGIC;
+	struct pvfs2_kernel_op_s *op, *temp;
 	__s32 proto_ver = PVFS_KERNEL_PROTO_VERSION;
+	static __s32 magic = PVFS2_DEVREQ_MAGIC;
+	struct pvfs2_kernel_op_s *cur_op = NULL;
+	unsigned long ret;
 
+	/* We do not support blocking IO. */
 	if (!(file->f_flags & O_NONBLOCK)) {
-		/* We do not support blocking reads/opens any more */
-		gossip_err("pvfs2: blocking reads are not supported! (pvfs2-client-core bug)\n");
+		gossip_err("orangefs: blocking reads are not supported! (pvfs2-client-core bug)\n");
 		return -EINVAL;
-	} else {
-		struct pvfs2_kernel_op_s *op = NULL, *temp = NULL;
-		/* get next op (if any) from top of list */
-		spin_lock(&pvfs2_request_list_lock);
-		list_for_each_entry_safe(op, temp, &pvfs2_request_list, list) {
-			__s32 fsid = fsid_of_op(op);
-			/*
-			 * Check if this op's fsid is known and needs
-			 * remounting
-			 */
-			if (fsid != PVFS_FS_ID_NULL &&
-			    fs_mount_pending(fsid) == 1) {
+	}
+
+	/*
+	 * The client will do an ioctl to find MAX_ALIGNED_DEV_REQ_UPSIZE, then
+	 * always read with that size buffer.
+	 */
+	if (count != MAX_ALIGNED_DEV_REQ_UPSIZE) {
+		gossip_err("orangefs: client-core tried to read wrong size\n");
+		return -EINVAL;
+	}
+
+	/* Get next op (if any) from top of list. */
+	spin_lock(&pvfs2_request_list_lock);
+	list_for_each_entry_safe(op, temp, &pvfs2_request_list, list) {
+		__s32 fsid;
+		/* This lock is held past the end of the loop when we break. */
+		spin_lock(&op->lock);
+
+		fsid = fsid_of_op(op);
+		if (fsid != PVFS_FS_ID_NULL) {
+			int ret;
+			/* Skip ops whose filesystem needs to be mounted. */
+			ret = fs_mount_pending(fsid);
+			if (ret == 1) {
 				gossip_debug(GOSSIP_DEV_DEBUG,
-					     "Skipping op tag %llu %s\n",
-					     llu(op->tag),
-					     get_opname_string(op));
+				    "orangefs: skipping op tag %llu %s\n",
+				    llu(op->tag), get_opname_string(op));
+				spin_unlock(&op->lock);
 				continue;
-			} else {
-				/*
-				 * op does not belong to any particular fsid
-				 * or already mounted.. let it through
-				 */
-				cur_op = op;
-				spin_lock(&cur_op->lock);
-				list_del(&cur_op->list);
-				spin_unlock(&cur_op->lock);
-				break;
+			/* Skip ops whose filesystem we don't know about unless
+			 * it is being mounted. */
+			/* XXX: is there a better way to detect this? */
+			} else if (ret == -1 &&
+				   !(op->upcall.type == PVFS2_VFS_OP_FS_MOUNT ||
+				     op->upcall.type == PVFS2_VFS_OP_GETATTR)) {
+				gossip_debug(GOSSIP_DEV_DEBUG,
+				    "orangefs: skipping op tag %llu %s\n",
+				    llu(op->tag), get_opname_string(op));
+				gossip_err(
+				    "orangefs: ERROR: fs_mount_pending %d\n",
+				    fsid);
+				spin_unlock(&op->lock);
+				continue;
 			}
 		}
-		spin_unlock(&pvfs2_request_list_lock);
-	}
-
-	if (cur_op) {
-		spin_lock(&cur_op->lock);
-
-		gossip_debug(GOSSIP_DEV_DEBUG,
-			     "client-core: reading op tag %llu %s\n",
-			     llu(cur_op->tag), get_opname_string(cur_op));
-		if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
-			gossip_err("WARNING: Current op already queued...skipping\n");
-		} else  {
-			/*
-			 * atomically move the operation to the
-			 * htable_ops_in_progress
-			 */
-			set_op_state_inprogress(cur_op);
-			pvfs2_devreq_add_op(cur_op);
-		}
-
-		spin_unlock(&cur_op->lock);
-
-		/* Push the upcall out */
-		len = MAX_ALIGNED_DEV_REQ_UPSIZE;
-		if ((size_t) len <= count) {
-		    ret = copy_to_user(buf,
-				       &proto_ver,
-			       sizeof(__s32));
-		    if (ret == 0) {
-			ret = copy_to_user(buf + sizeof(__s32),
-					   &magic,
-					   sizeof(__s32));
-			if (ret == 0) {
-			    ret = copy_to_user(buf+2 * sizeof(__s32),
-					       &cur_op->tag,
-					       sizeof(__u64));
-			    if (ret == 0) {
-				ret = copy_to_user(
-					buf +
-					  2 *
-					  sizeof(__s32) +
-					  sizeof(__u64),
-					&cur_op->upcall,
-					sizeof(struct pvfs2_upcall_s));
-			    }
-			}
-		    }
-
-		    if (ret) {
-			gossip_err("Failed to copy data to user space\n");
-			len = -EFAULT;
-		    }
-		} else {
-			gossip_err
-			    ("Failed to copy data to user space\n");
-			len = -EIO;
-		}
-	} else if (file->f_flags & O_NONBLOCK) {
 		/*
-		 * if in non-blocking mode, return EAGAIN since no requests are
-		 * ready yet
+		 * Either this op does not pertain to a filesystem, is mounting
+		 * a filesystem, or pertains to a mounted filesystem. Let it
+		 * through.
 		 */
-		len = -EAGAIN;
+		cur_op = op;
+		break;
 	}
-	return len;
+
+	/*
+	 * At this point we either have a valid op and can continue or have not
+	 * found an op and must ask the client to try again later.
+	 */
+	if (!cur_op) {
+		spin_unlock(&pvfs2_request_list_lock);
+		return -EAGAIN;
+	}
+
+	gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: reading op tag %llu %s\n",
+		     llu(cur_op->tag), get_opname_string(cur_op));
+
+	/*
+	 * Such an op should never be on the list in the first place. If so, we
+	 * will abort.
+	 */
+	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
+		gossip_err("orangefs: ERROR: Current op already queued.\n");
+		list_del(&cur_op->list);
+		spin_unlock(&cur_op->lock);
+		spin_unlock(&pvfs2_request_list_lock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Set the operation to be in progress and move it between lists since
+	 * it has been sent to the client.
+	 */
+	set_op_state_inprogress(cur_op);
+
+	list_del(&cur_op->list);
+	spin_unlock(&pvfs2_request_list_lock);
+	pvfs2_devreq_add_op(cur_op);
+	spin_unlock(&cur_op->lock);
+
+	/* Push the upcall out. */
+	ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
+			   sizeof(struct pvfs2_upcall_s));
+	if (ret != 0)
+		goto error;
+
+	/* The client only asks to read one size buffer. */
+	return MAX_ALIGNED_DEV_REQ_UPSIZE;
+error:
+	/*
+	 * We were unable to copy the op data to the client. Put the op back in
+	 * list. If client has crashed, the op will be purged later when the
+	 * device is released.
+	 */
+	gossip_err("orangefs: Failed to copy data to user space\n");
+	spin_lock(&pvfs2_request_list_lock);
+	spin_lock(&cur_op->lock);
+	set_op_state_waiting(cur_op);
+	pvfs2_devreq_remove_op(cur_op->tag);
+	list_add(&cur_op->list, &pvfs2_request_list);
+	spin_unlock(&cur_op->lock);
+	spin_unlock(&pvfs2_request_list_lock);
+	return -EFAULT;
 }
 
 /* Function for writev() callers into the device */