Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma

Pull more rdma updates from Doug Ledford:
 "This is the second group of code for the 4.7 merge window.  It looks
  large, but only in one sense.  I'll get to that in a minute.  The list
  of changes here breaks down as follows:

   - Dynamic counter infrastructure in the IB drivers

     This is sysfs-based code that allows free-form access to the
     hardware counters RDMA devices might support, so drivers don't need
     to code this up repeatedly themselves

   - SendOnlyFullMember multicast support

   - IB router support

   - A couple misc fixes

   - The big item on the list: hfi1 driver updates, plus moving the hfi1
     driver out of staging

  There was a group of 15 patches in the hfi1 list that I thought I had
  included in the first pull request, but I had not.  That added to the
  length of the hfi1 section here.

  As far as these go, everything but the hfi1 is pretty
  straightforward.

  The hfi1 is, if you recall, the driver Al complained about because of
  how it used the write/writev interfaces in an overloaded fashion.  The
  write portion of their interface behaved like the write handler in the
  IB stack proper and did bi-directional communications.  The writev
  interface, on the other hand, only accepts SDMA request structures.
  The completions for those structures are sent back via an entirely
  different event mechanism.

  With the security patch, we put security checks on the write
  interface, however, we also knew they would be going away soon.  Now,
  we've converted the write handler in the hfi1 driver to use ioctls
  from the IB reserved magic area for its bidirectional communications.
  With that change, Intel has addressed all of the items originally on
  their TODO when they went into staging (as well as many items added to
  the list later).

  As such, I moved them out, and since they were the last item in the
  staging/rdma directory, and I don't have immediate plans to use the
  staging area again, I removed the staging/rdma area.

  Because of the move out of staging, as well as a series of 5 patches
  in the hfi1 driver that removed code people thought should be done in
  a different way and was optional to begin with (a snoop debug
  interface, an eeprom driver for an eeprom connected directly to their
  hfi1 chip and not via an i2c bus, and a few other things like that),
  the line count, especially the removal count, is high"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (56 commits)
  staging/rdma: Remove the entire rdma subdirectory of staging
  IB/core: Make device counter infrastructure dynamic
  IB/hfi1: Fix pio map initialization
  IB/hfi1: Correct 8051 link parameter settings
  IB/hfi1: Update pkey table properly after link down or FM start
  IB/rdamvt: Fix rdmavt s_ack_queue sizing
  IB/rdmavt: Max atomic value should be a u8
  IB/hfi1: Fix hard lockup due to not using save/restore spin lock
  IB/hfi1: Add tracing support for send with invalidate opcode
  IB/hfi1, qib: Add ieth to the packet header definitions
  IB/hfi1: Move driver out of staging
  IB/hfi1: Do not free hfi1 cdev parent structure early
  IB/hfi1: Add trace message in user IOCTL handling
  IB/hfi1: Remove write(), use ioctl() for user cmds
  IB/hfi1: Add ioctl() interface for user commands
  IB/hfi1: Remove unused user command
  IB/hfi1: Remove snoop/diag interface
  IB/hfi1: Remove EPROM functionality from data device
  IB/hfi1: Remove UI char device
  IB/hfi1: Remove multiple device cdev
  ...
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
index 3ecf0c3..45bcafe 100644
--- a/Documentation/infiniband/sysfs.txt
+++ b/Documentation/infiniband/sysfs.txt
@@ -56,6 +56,18 @@
   ports/1/pkeys/10 contains the value at index 10 in port 1's P_Key
   table.
 
+  There is an optional "hw_counters" subdirectory that may be under either
+  the parent device or the port subdirectories or both.  If present,
+  there is a list of counters provided by the hardware.  They may match
+  some of the counters in the counters directory, but they often include
+  many other counters.  In addition to the various counters, there will
+  be a file named "lifespan" that configures how frequently the core
+  should update the counters when they are being accessed (counters are
+  not updated if they are not being accessed).  The lifespan is in milli-
+  seconds and defaults to 10 unless set to something else by the driver.
+  Users may echo a value between 0 and 10000 to the lifespan file to set
+  the length of time between updates in milliseconds.
+
 MTHCA
 
   The Mellanox HCA driver also creates the files:
diff --git a/MAINTAINERS b/MAINTAINERS
index f466673..216165a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5308,6 +5308,13 @@
 F:	include/linux/cciss_ioctl.h
 F:	include/uapi/linux/cciss_ioctl.h
 
+HFI1 DRIVER
+M:	Mike Marciniszyn <mike.marciniszyn@intel.com>
+M:	Dennis Dalessandro <dennis.dalessandro@intel.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/infiniband/hw/hfi1
+
 HFS FILESYSTEM
 L:	linux-fsdevel@vger.kernel.org
 S:	Orphan
@@ -5837,7 +5844,6 @@
 S:	Supported
 F:	Documentation/infiniband/
 F:	drivers/infiniband/
-F:	drivers/staging/rdma/
 F:	include/uapi/linux/if_infiniband.h
 F:	include/uapi/rdma/
 F:	include/rdma/
@@ -10920,12 +10926,6 @@
 S:	Odd Fixes
 F:	drivers/staging/xgifb/
 
-HFI1 DRIVER
-M:	Mike Marciniszyn <infinipath@intel.com>
-L:	linux-rdma@vger.kernel.org
-S:	Supported
-F:	drivers/staging/rdma/hfi1
-
 STARFIRE/DURALAN NETWORK DRIVER
 M:	Ion Badulescu <ionut@badula.org>
 S:	Odd Fixes
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 6425c0e..2137adf 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -85,4 +85,6 @@
 
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 
+source "drivers/infiniband/hw/hfi1/Kconfig"
+
 endif # INFINIBAND
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 26987d9..edaae9f 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -1,8 +1,7 @@
 infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_cm.o
 user_access-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_ucm.o
 
-obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_sa.o \
-					ib_cm.o iw_cm.o ib_addr.o \
+obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_cm.o iw_cm.o \
 					$(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
@@ -10,14 +9,11 @@
 
 ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
-				roce_gid_mgmt.o mr_pool.o
+				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
+				multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
-ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
-
-ib_sa-y :=			sa_query.o multicast.o
-
 ib_cm-y :=			cm.o
 
 iw_cm-y :=			iwcm.o iwpm_util.o iwpm_msg.o
@@ -28,8 +24,6 @@
 
 rdma_ucm-y :=			ucma.o
 
-ib_addr-y :=			addr.o
-
 ib_umad-y :=			user_mad.o
 
 ib_ucm-y :=			ucm.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 337353d..1374541 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -46,10 +46,10 @@
 #include <net/ip6_route.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
 
-MODULE_AUTHOR("Sean Hefty");
-MODULE_DESCRIPTION("IB Address Translation");
-MODULE_LICENSE("Dual BSD/GPL");
+#include "core_priv.h"
 
 struct addr_req {
 	struct list_head list;
@@ -62,8 +62,11 @@
 			 struct rdma_dev_addr *addr, void *context);
 	unsigned long timeout;
 	int status;
+	u32 seq;
 };
 
+static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);
+
 static void process_req(struct work_struct *work);
 
 static DEFINE_MUTEX(lock);
@@ -71,6 +74,126 @@
 static DECLARE_DELAYED_WORK(work, process_req);
 static struct workqueue_struct *addr_wq;
 
+static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
+	[LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+		.len = sizeof(struct rdma_nla_ls_gid)},
+};
+
+static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
+	int ret;
+
+	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+		return false;
+
+	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+			nlmsg_len(nlh), ib_nl_addr_policy);
+	if (ret)
+		return false;
+
+	return true;
+}
+
+static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
+{
+	const struct nlattr *head, *curr;
+	union ib_gid gid;
+	struct addr_req *req;
+	int len, rem;
+	int found = 0;
+
+	head = (const struct nlattr *)nlmsg_data(nlh);
+	len = nlmsg_len(nlh);
+
+	nla_for_each_attr(curr, head, len, rem) {
+		if (curr->nla_type == LS_NLA_TYPE_DGID)
+			memcpy(&gid, nla_data(curr), nla_len(curr));
+	}
+
+	mutex_lock(&lock);
+	list_for_each_entry(req, &req_list, list) {
+		if (nlh->nlmsg_seq != req->seq)
+			continue;
+		/* We set the DGID part, the rest was set earlier */
+		rdma_addr_set_dgid(req->addr, &gid);
+		req->status = 0;
+		found = 1;
+		break;
+	}
+	mutex_unlock(&lock);
+
+	if (!found)
+		pr_info("Couldn't find request waiting for DGID: %pI6\n",
+			&gid);
+}
+
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+			     struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+
+	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+	    !(NETLINK_CB(skb).sk) ||
+	    !netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (ib_nl_is_good_ip_resp(nlh))
+		ib_nl_process_good_ip_rsep(nlh);
+
+	return skb->len;
+}
+
+static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
+			     const void *daddr,
+			     u32 seq, u16 family)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	struct rdma_ls_ip_resolve_header *header;
+	void *data;
+	size_t size;
+	int attrtype;
+	int len;
+
+	if (family == AF_INET) {
+		size = sizeof(struct in_addr);
+		attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
+	} else {
+		size = sizeof(struct in6_addr);
+		attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
+	}
+
+	len = nla_total_size(sizeof(size));
+	len += NLMSG_ALIGN(sizeof(*header));
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
+			    RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
+	if (!data) {
+		nlmsg_free(skb);
+		return -ENODATA;
+	}
+
+	/* Construct the family header first */
+	header = (struct rdma_ls_ip_resolve_header *)
+		skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+	header->ifindex = dev_addr->bound_dev_if;
+	nla_put(skb, attrtype, size, daddr);
+
+	/* Repair the nlmsg header length */
+	nlmsg_end(skb, nlh);
+	ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+
+	/* Make the request retry, so when we get the response from userspace
+	 * we will have something.
+	 */
+	return -ENODATA;
+}
+
 int rdma_addr_size(struct sockaddr *addr)
 {
 	switch (addr->sa_family) {
@@ -199,6 +322,17 @@
 	mutex_unlock(&lock);
 }
 
+static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+			  const void *daddr, u32 seq, u16 family)
+{
+	if (ibnl_chk_listeners(RDMA_NL_GROUP_LS))
+		return -EADDRNOTAVAIL;
+
+	/* We fill in what we can, the response will fill the rest */
+	rdma_copy_addr(dev_addr, dst->dev, NULL);
+	return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
+}
+
 static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 			const void *daddr)
 {
@@ -223,6 +357,39 @@
 	return ret;
 }
 
+static bool has_gateway(struct dst_entry *dst, sa_family_t family)
+{
+	struct rtable *rt;
+	struct rt6_info *rt6;
+
+	if (family == AF_INET) {
+		rt = container_of(dst, struct rtable, dst);
+		return rt->rt_uses_gateway;
+	}
+
+	rt6 = container_of(dst, struct rt6_info, dst);
+	return rt6->rt6i_flags & RTF_GATEWAY;
+}
+
+static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+		    const struct sockaddr *dst_in, u32 seq)
+{
+	const struct sockaddr_in *dst_in4 =
+		(const struct sockaddr_in *)dst_in;
+	const struct sockaddr_in6 *dst_in6 =
+		(const struct sockaddr_in6 *)dst_in;
+	const void *daddr = (dst_in->sa_family == AF_INET) ?
+		(const void *)&dst_in4->sin_addr.s_addr :
+		(const void *)&dst_in6->sin6_addr;
+	sa_family_t family = dst_in->sa_family;
+
+	/* Gateway + ARPHRD_INFINIBAND -> IB router */
+	if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
+		return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+	else
+		return dst_fetch_ha(dst, dev_addr, daddr);
+}
+
 static int addr4_resolve(struct sockaddr_in *src_in,
 			 const struct sockaddr_in *dst_in,
 			 struct rdma_dev_addr *addr,
@@ -246,10 +413,11 @@
 	src_in->sin_family = AF_INET;
 	src_in->sin_addr.s_addr = fl4.saddr;
 
-	/* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
-	 * routable) and we could set the network type accordingly.
+	/* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
+	 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
+	 * type accordingly.
 	 */
-	if (rt->rt_uses_gateway)
+	if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
 		addr->network = RDMA_NETWORK_IPV4;
 
 	addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
@@ -291,10 +459,12 @@
 		src_in->sin6_addr = fl6.saddr;
 	}
 
-	/* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
-	 * routable) and we could set the network type accordingly.
+	/* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
+	 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
+	 * type accordingly.
 	 */
-	if (rt->rt6i_flags & RTF_GATEWAY)
+	if (rt->rt6i_flags & RTF_GATEWAY &&
+	    ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
 		addr->network = RDMA_NETWORK_IPV6;
 
 	addr->hoplimit = ip6_dst_hoplimit(dst);
@@ -317,7 +487,8 @@
 
 static int addr_resolve_neigh(struct dst_entry *dst,
 			      const struct sockaddr *dst_in,
-			      struct rdma_dev_addr *addr)
+			      struct rdma_dev_addr *addr,
+			      u32 seq)
 {
 	if (dst->dev->flags & IFF_LOOPBACK) {
 		int ret;
@@ -331,17 +502,8 @@
 	}
 
 	/* If the device doesn't do ARP internally */
-	if (!(dst->dev->flags & IFF_NOARP)) {
-		const struct sockaddr_in *dst_in4 =
-			(const struct sockaddr_in *)dst_in;
-		const struct sockaddr_in6 *dst_in6 =
-			(const struct sockaddr_in6 *)dst_in;
-
-		return dst_fetch_ha(dst, addr,
-				    dst_in->sa_family == AF_INET ?
-				    (const void *)&dst_in4->sin_addr.s_addr :
-				    (const void *)&dst_in6->sin6_addr);
-	}
+	if (!(dst->dev->flags & IFF_NOARP))
+		return fetch_ha(dst, addr, dst_in, seq);
 
 	return rdma_copy_addr(addr, dst->dev, NULL);
 }
@@ -349,7 +511,8 @@
 static int addr_resolve(struct sockaddr *src_in,
 			const struct sockaddr *dst_in,
 			struct rdma_dev_addr *addr,
-			bool resolve_neigh)
+			bool resolve_neigh,
+			u32 seq)
 {
 	struct net_device *ndev;
 	struct dst_entry *dst;
@@ -366,7 +529,7 @@
 			return ret;
 
 		if (resolve_neigh)
-			ret = addr_resolve_neigh(&rt->dst, dst_in, addr);
+			ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
 
 		ndev = rt->dst.dev;
 		dev_hold(ndev);
@@ -383,7 +546,7 @@
 			return ret;
 
 		if (resolve_neigh)
-			ret = addr_resolve_neigh(dst, dst_in, addr);
+			ret = addr_resolve_neigh(dst, dst_in, addr, seq);
 
 		ndev = dst->dev;
 		dev_hold(ndev);
@@ -412,7 +575,7 @@
 			src_in = (struct sockaddr *) &req->src_addr;
 			dst_in = (struct sockaddr *) &req->dst_addr;
 			req->status = addr_resolve(src_in, dst_in, req->addr,
-						   true);
+						   true, req->seq);
 			if (req->status && time_after_eq(jiffies, req->timeout))
 				req->status = -ETIMEDOUT;
 			else if (req->status == -ENODATA)
@@ -471,8 +634,9 @@
 	req->context = context;
 	req->client = client;
 	atomic_inc(&client->refcount);
+	req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
 
-	req->status = addr_resolve(src_in, dst_in, addr, true);
+	req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
 	switch (req->status) {
 	case 0:
 		req->timeout = jiffies;
@@ -510,7 +674,7 @@
 		src_in->sa_family = dst_addr->sa_family;
 	}
 
-	return addr_resolve(src_in, dst_addr, addr, false);
+	return addr_resolve(src_in, dst_addr, addr, false, 0);
 }
 EXPORT_SYMBOL(rdma_resolve_ip_route);
 
@@ -634,7 +798,7 @@
 	.notifier_call = netevent_callback
 };
 
-static int __init addr_init(void)
+int addr_init(void)
 {
 	addr_wq = create_singlethread_workqueue("ib_addr");
 	if (!addr_wq)
@@ -642,15 +806,13 @@
 
 	register_netevent_notifier(&nb);
 	rdma_addr_register_client(&self);
+
 	return 0;
 }
 
-static void __exit addr_cleanup(void)
+void addr_cleanup(void)
 {
 	rdma_addr_unregister_client(&self);
 	unregister_netevent_notifier(&nb);
 	destroy_workqueue(addr_wq);
 }
-
-module_init(addr_init);
-module_exit(addr_cleanup);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index eab3221..19d499d 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -137,4 +137,20 @@
 	return _upper == upper;
 }
 
+int addr_init(void);
+void addr_cleanup(void);
+
+int ib_mad_init(void);
+void ib_mad_cleanup(void);
+
+int ib_sa_init(void);
+void ib_sa_cleanup(void);
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+			      struct netlink_callback *cb);
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+			     struct netlink_callback *cb);
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+			     struct netlink_callback *cb);
+
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 1097984..5516fb0 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -955,6 +955,29 @@
 }
 EXPORT_SYMBOL(ib_get_net_dev_by_params);
 
+static struct ibnl_client_cbs ibnl_ls_cb_table[] = {
+	[RDMA_NL_LS_OP_RESOLVE] = {
+		.dump = ib_nl_handle_resolve_resp,
+		.module = THIS_MODULE },
+	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
+		.dump = ib_nl_handle_set_timeout,
+		.module = THIS_MODULE },
+	[RDMA_NL_LS_OP_IP_RESOLVE] = {
+		.dump = ib_nl_handle_ip_res_resp,
+		.module = THIS_MODULE },
+};
+
+static int ib_add_ibnl_clients(void)
+{
+	return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table),
+			       ibnl_ls_cb_table);
+}
+
+static void ib_remove_ibnl_clients(void)
+{
+	ibnl_remove_client(RDMA_NL_LS);
+}
+
 static int __init ib_core_init(void)
 {
 	int ret;
@@ -983,10 +1006,41 @@
 		goto err_sysfs;
 	}
 
+	ret = addr_init();
+	if (ret) {
+		pr_warn("Could't init IB address resolution\n");
+		goto err_ibnl;
+	}
+
+	ret = ib_mad_init();
+	if (ret) {
+		pr_warn("Couldn't init IB MAD\n");
+		goto err_addr;
+	}
+
+	ret = ib_sa_init();
+	if (ret) {
+		pr_warn("Couldn't init SA\n");
+		goto err_mad;
+	}
+
+	if (ib_add_ibnl_clients()) {
+		pr_warn("Couldn't register ibnl clients\n");
+		goto err_sa;
+	}
+
 	ib_cache_setup();
 
 	return 0;
 
+err_sa:
+	ib_sa_cleanup();
+err_mad:
+	ib_mad_cleanup();
+err_addr:
+	addr_cleanup();
+err_ibnl:
+	ibnl_cleanup();
 err_sysfs:
 	class_unregister(&ib_class);
 err_comp:
@@ -999,6 +1053,10 @@
 static void __exit ib_core_cleanup(void)
 {
 	ib_cache_cleanup();
+	ib_remove_ibnl_clients();
+	ib_sa_cleanup();
+	ib_mad_cleanup();
+	addr_cleanup();
 	ibnl_cleanup();
 	class_unregister(&ib_class);
 	destroy_workqueue(ib_comp_wq);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 9fa5bf3..82fb511 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -47,11 +47,7 @@
 #include "smi.h"
 #include "opa_smi.h"
 #include "agent.h"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("kernel IB MAD API");
-MODULE_AUTHOR("Hal Rosenstock");
-MODULE_AUTHOR("Sean Hefty");
+#include "core_priv.h"
 
 static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
 static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
@@ -3316,7 +3312,7 @@
 	.remove = ib_mad_remove_device
 };
 
-static int __init ib_mad_init_module(void)
+int ib_mad_init(void)
 {
 	mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
 	mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
@@ -3334,10 +3330,7 @@
 	return 0;
 }
 
-static void __exit ib_mad_cleanup_module(void)
+void ib_mad_cleanup(void)
 {
 	ib_unregister_client(&mad_client);
 }
-
-module_init(ib_mad_init_module);
-module_exit(ib_mad_cleanup_module);
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 250937c..a83ec28 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -93,6 +93,18 @@
 
 struct mcast_member;
 
+/*
+* There are 4 types of join states:
+* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
+*/
+enum {
+	FULLMEMBER_JOIN,
+	NONMEMBER_JOIN,
+	SENDONLY_NONMEBER_JOIN,
+	SENDONLY_FULLMEMBER_JOIN,
+	NUM_JOIN_MEMBERSHIP_TYPES,
+};
+
 struct mcast_group {
 	struct ib_sa_mcmember_rec rec;
 	struct rb_node		node;
@@ -102,7 +114,7 @@
 	struct list_head	pending_list;
 	struct list_head	active_list;
 	struct mcast_member	*last_join;
-	int			members[3];
+	int			members[NUM_JOIN_MEMBERSHIP_TYPES];
 	atomic_t		refcount;
 	enum mcast_group_state	state;
 	struct ib_sa_query	*query;
@@ -220,8 +232,9 @@
 }
 
 /*
- * A multicast group has three types of members: full member, non member, and
- * send only member.  We need to keep track of the number of members of each
+ * A multicast group has four types of members: full member, non member,
+ * sendonly non member and sendonly full member.
+ * We need to keep track of the number of members of each
  * type based on their join state.  Adjust the number of members the belong to
  * the specified join states.
  */
@@ -229,7 +242,7 @@
 {
 	int i;
 
-	for (i = 0; i < 3; i++, join_state >>= 1)
+	for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
 		if (join_state & 0x1)
 			group->members[i] += inc;
 }
@@ -245,7 +258,7 @@
 	u8 leave_state = 0;
 	int i;
 
-	for (i = 0; i < 3; i++)
+	for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
 		if (!group->members[i])
 			leave_state |= (0x1 << i);
 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 3ebd108..e955386 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -53,10 +53,6 @@
 #include "sa.h"
 #include "core_priv.h"
 
-MODULE_AUTHOR("Roland Dreier");
-MODULE_DESCRIPTION("InfiniBand subnet administration query support");
-MODULE_LICENSE("Dual BSD/GPL");
-
 #define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000
@@ -119,6 +115,12 @@
 	struct ib_sa_query sa_query;
 };
 
+struct ib_sa_classport_info_query {
+	void (*callback)(int, struct ib_class_port_info *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
 struct ib_sa_mcmember_query {
 	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
 	void *context;
@@ -392,6 +394,82 @@
 	  .size_bits    = 2*64 },
 };
 
+#define CLASSPORTINFO_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_class_port_info, field),	\
+	.struct_size_bytes   = sizeof((struct ib_class_port_info *)0)->field,	\
+	.field_name          = "ib_class_port_info:" #field
+
+static const struct ib_field classport_info_rec_table[] = {
+	{ CLASSPORTINFO_REC_FIELD(base_version),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ CLASSPORTINFO_REC_FIELD(class_version),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ CLASSPORTINFO_REC_FIELD(capability_mask),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ CLASSPORTINFO_REC_FIELD(redirect_gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ CLASSPORTINFO_REC_FIELD(redirect_tcslfl),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ CLASSPORTINFO_REC_FIELD(redirect_lid),
+	  .offset_words = 7,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ CLASSPORTINFO_REC_FIELD(redirect_pkey),
+	  .offset_words = 7,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+
+	{ CLASSPORTINFO_REC_FIELD(redirect_qp),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ CLASSPORTINFO_REC_FIELD(redirect_qkey),
+	  .offset_words = 9,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+
+	{ CLASSPORTINFO_REC_FIELD(trap_gid),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ CLASSPORTINFO_REC_FIELD(trap_tcslfl),
+	  .offset_words = 14,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+
+	{ CLASSPORTINFO_REC_FIELD(trap_lid),
+	  .offset_words = 15,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ CLASSPORTINFO_REC_FIELD(trap_pkey),
+	  .offset_words = 15,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+
+	{ CLASSPORTINFO_REC_FIELD(trap_hlqp),
+	  .offset_words = 16,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ CLASSPORTINFO_REC_FIELD(trap_qkey),
+	  .offset_words = 17,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+};
+
 #define GUIDINFO_REC_FIELD(field) \
 	.struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field),	\
 	.struct_size_bytes   = sizeof((struct ib_sa_guidinfo_rec *) 0)->field,	\
@@ -705,8 +783,8 @@
 	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
 }
 
-static int ib_nl_handle_set_timeout(struct sk_buff *skb,
-				    struct netlink_callback *cb)
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+			     struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
 	int timeout, delta, abs_delta;
@@ -782,8 +860,8 @@
 	return 1;
 }
 
-static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
-				     struct netlink_callback *cb)
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+			      struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
 	unsigned long flags;
@@ -838,15 +916,6 @@
 	return skb->len;
 }
 
-static struct ibnl_client_cbs ib_sa_cb_table[] = {
-	[RDMA_NL_LS_OP_RESOLVE] = {
-		.dump = ib_nl_handle_resolve_resp,
-		.module = THIS_MODULE },
-	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
-		.dump = ib_nl_handle_set_timeout,
-		.module = THIS_MODULE },
-};
-
 static void free_sm_ah(struct kref *kref)
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -1645,6 +1714,97 @@
 }
 EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
 
+/* Support get SA ClassPortInfo */
+static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
+					      int status,
+					      struct ib_sa_mad *mad)
+{
+	struct ib_sa_classport_info_query *query =
+		container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
+
+	if (mad) {
+		struct ib_class_port_info rec;
+
+		ib_unpack(classport_info_rec_table,
+			  ARRAY_SIZE(classport_info_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else {
+		query->callback(status, NULL, query->context);
+	}
+}
+
+static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_classport_info_query,
+			   sa_query));
+}
+
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+				   struct ib_device *device, u8 port_num,
+				   int timeout_ms, gfp_t gfp_mask,
+				   void (*callback)(int status,
+						    struct ib_class_port_info *resp,
+						    void *context),
+				   void *context,
+				   struct ib_sa_query **sa_query)
+{
+	struct ib_sa_classport_info_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kzalloc(sizeof(*query), gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL;
+
+	query->sa_query.release  = ib_sa_portclass_info_rec_release;
+	/* support GET only */
+	mad->mad_hdr.method	 = IB_MGMT_METHOD_GET;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO);
+	mad->sa_hdr.comp_mask	 = 0;
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_classport_info_rec_query);
+
 static void send_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_send_wc *mad_send_wc)
 {
@@ -1794,7 +1954,7 @@
 	kfree(sa_dev);
 }
 
-static int __init ib_sa_init(void)
+int ib_sa_init(void)
 {
 	int ret;
 
@@ -1820,17 +1980,10 @@
 		goto err3;
 	}
 
-	if (ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ib_sa_cb_table),
-			    ib_sa_cb_table)) {
-		pr_err("Failed to add netlink callback\n");
-		ret = -EINVAL;
-		goto err4;
-	}
 	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
 
 	return 0;
-err4:
-	destroy_workqueue(ib_nl_wq);
+
 err3:
 	mcast_cleanup();
 err2:
@@ -1839,9 +1992,8 @@
 	return ret;
 }
 
-static void __exit ib_sa_cleanup(void)
+void ib_sa_cleanup(void)
 {
-	ibnl_remove_client(RDMA_NL_LS);
 	cancel_delayed_work(&ib_nl_timed_work);
 	flush_workqueue(ib_nl_wq);
 	destroy_workqueue(ib_nl_wq);
@@ -1849,6 +2001,3 @@
 	ib_unregister_client(&sa_client);
 	idr_destroy(&query_idr);
 }
-
-module_init(ib_sa_init);
-module_exit(ib_sa_cleanup);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 14606af..5e573bb 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -56,8 +56,10 @@
 	struct gid_attr_group *gid_attr_group;
 	struct attribute_group gid_group;
 	struct attribute_group pkey_group;
-	u8                     port_num;
 	struct attribute_group *pma_table;
+	struct attribute_group *hw_stats_ag;
+	struct rdma_hw_stats   *hw_stats;
+	u8                     port_num;
 };
 
 struct port_attribute {
@@ -80,6 +82,18 @@
 	__be16			attr_id;
 };
 
+struct hw_stats_attribute {
+	struct attribute	attr;
+	ssize_t			(*show)(struct kobject *kobj,
+					struct attribute *attr, char *buf);
+	ssize_t			(*store)(struct kobject *kobj,
+					 struct attribute *attr,
+					 const char *buf,
+					 size_t count);
+	int			index;
+	u8			port_num;
+};
+
 static ssize_t port_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
@@ -733,6 +747,212 @@
 	return &pma_group;
 }
 
+/*
+ * Ask the driver to refresh @stats unless the cached values are still
+ * inside their lifespan window.
+ *
+ * Returns 0 when the cache was fresh or the driver call succeeded, or the
+ * negative value handed back by the driver's get_hw_stats() callback.
+ */
+static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
+			   u8 port_num, int index)
+{
+	int nr_updated;
+
+	/* timestamp + lifespan has not passed yet: keep the cached values */
+	if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
+		return 0;
+
+	nr_updated = dev->get_hw_stats(dev, stats, port_num, index);
+	if (nr_updated < 0)
+		return nr_updated;
+
+	/* the timestamp only moves when every counter was reported */
+	if (nr_updated == stats->num_counters)
+		stats->timestamp = jiffies;
+
+	return 0;
+}
+
+/*
+ * Write counter @index of @stats into @buf as an unsigned decimal number
+ * followed by a newline; returns what sprintf() returns.
+ */
+static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+{
+	ssize_t len;
+
+	len = sprintf(buf, "%llu\n", stats->value[index]);
+
+	return len;
+}
+
+/*
+ * sysfs show handler for a single hardware counter file.
+ *
+ * The attribute's port_num tells whether @kobj belongs to the ib_device
+ * (port_num == 0) or to an ib_port.  The matching stats block is refreshed
+ * through update_hw_stats() and the selected counter is then printed.
+ * Returns the number of bytes written or the error from the refresh.
+ */
+static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct hw_stats_attribute *hsa =
+		container_of(attr, struct hw_stats_attribute, attr);
+	struct rdma_hw_stats *stats;
+	struct ib_device *dev;
+	int err;
+
+	if (hsa->port_num) {
+		struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+
+		dev = port->ibdev;
+		stats = port->hw_stats;
+	} else {
+		dev = container_of((struct device *)kobj,
+				   struct ib_device, dev);
+		stats = dev->hw_stats;
+	}
+
+	err = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
+	if (err)
+		return err;
+
+	return print_hw_stat(stats, hsa->index, buf);
+}
+
+/*
+ * sysfs show handler for the "lifespan" file: prints the lifespan of the
+ * owning stats block (device level when port_num is 0, otherwise the
+ * port's) converted from jiffies to milliseconds.
+ */
+static ssize_t show_stats_lifespan(struct kobject *kobj,
+				   struct attribute *attr,
+				   char *buf)
+{
+	struct hw_stats_attribute *hsa =
+		container_of(attr, struct hw_stats_attribute, attr);
+	struct rdma_hw_stats *stats;
+	int msecs;
+
+	if (hsa->port_num)
+		stats = container_of(kobj, struct ib_port, kobj)->hw_stats;
+	else
+		stats = container_of((struct device *)kobj,
+				     struct ib_device, dev)->hw_stats;
+
+	msecs = jiffies_to_msecs(stats->lifespan);
+
+	return sprintf(buf, "%d\n", msecs);
+}
+
+/*
+ * sysfs store handler for the "lifespan" file.
+ *
+ * Parses @buf as a decimal number of milliseconds, accepts 0..10000 and
+ * stores the value, converted to jiffies, in the owning stats block.
+ * Returns @count on success, the kstrtoint() error on a parse failure or
+ * -EINVAL when the value is out of range.
+ */
+static ssize_t set_stats_lifespan(struct kobject *kobj,
+				  struct attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct hw_stats_attribute *hsa;
+	struct rdma_hw_stats *stats;
+	int msecs;
+	int ticks;	/* not called "jiffies": that would shadow the global */
+	int err;
+
+	err = kstrtoint(buf, 10, &msecs);
+	if (err)
+		return err;
+	if (msecs < 0 || msecs > 10000)
+		return -EINVAL;
+
+	ticks = msecs_to_jiffies(msecs);
+
+	hsa = container_of(attr, struct hw_stats_attribute, attr);
+	if (hsa->port_num)
+		stats = container_of(kobj, struct ib_port, kobj)->hw_stats;
+	else
+		stats = container_of((struct device *)kobj,
+				     struct ib_device, dev)->hw_stats;
+
+	stats->lifespan = ticks;
+
+	return count;
+}
+
+/*
+ * Remove a hw_counters group from sysfs and release it: every attribute
+ * up to the NULL terminator of attr_group->attrs is freed, then the group
+ * itself.
+ */
+static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group)
+{
+	struct attribute **pos = attr_group->attrs;
+
+	sysfs_remove_group(kobj, attr_group);
+
+	while (*pos) {
+		kfree(*pos);
+		pos++;
+	}
+	kfree(attr_group);
+}
+
+/*
+ * Allocate the read-only attribute for counter @index.  @name is stored
+ * by pointer, not copied.  Returns the embedded struct attribute, or NULL
+ * when the allocation fails.
+ */
+static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
+{
+	struct hw_stats_attribute *hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+
+	if (!hsa)
+		return NULL;
+
+	/* what sysfs sees */
+	hsa->attr.name = (char *)name;
+	hsa->attr.mode = S_IRUGO;
+
+	/* how the file is served */
+	hsa->show = show_hw_stats;
+	hsa->store = NULL;
+
+	/* which counter it serves */
+	hsa->index = index;
+	hsa->port_num = port_num;
+
+	return &hsa->attr;
+}
+
+/*
+ * Allocate the attribute for the lifespan file: readable by everyone,
+ * writable by the owner, served by show_stats_lifespan() and
+ * set_stats_lifespan().  @name is stored by pointer.  Returns the
+ * embedded struct attribute, or NULL when the allocation fails.
+ */
+static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
+{
+	struct hw_stats_attribute *hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+
+	if (!hsa)
+		return NULL;
+
+	hsa->attr.name = name;
+	hsa->attr.mode = S_IWUSR | S_IRUGO;
+
+	hsa->show = show_stats_lifespan;
+	hsa->store = set_stats_lifespan;
+
+	/* no counter slot is associated with this file */
+	hsa->index = 0;
+	hsa->port_num = port_num;
+
+	return &hsa->attr;
+}
+
+/*
+ * Create the "hw_counters" sysfs group for a device (@port == NULL,
+ * @port_num == 0) or for one of its ports.
+ *
+ * The driver's alloc_hw_stats() callback supplies the stats block; one
+ * read-only file is created per counter plus a "lifespan" file.  On
+ * success the group and the stats block are recorded in @port or @device
+ * so they can be torn down later.  Every failure is treated as "no
+ * hw_counters for this object": all memory obtained here is released and
+ * nothing is reported to the caller.
+ */
+static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
+			   u8 port_num)
+{
+	struct attribute_group *hsag;
+	struct rdma_hw_stats *stats;
+	struct kobject *kobj;
+	int i, ret;
+
+	stats = device->alloc_hw_stats(device, port_num);
+
+	if (!stats)
+		return;
+
+	if (!stats->names || stats->num_counters <= 0)
+		goto err_free_stats;
+
+	/*
+	 * Two extra attribute slots: one for the lifespan entry and one
+	 * that stays NULL to terminate the list for the sysfs core and
+	 * for free_hsag().
+	 */
+	hsag = kzalloc(sizeof(*hsag) +
+		       sizeof(void *) * (stats->num_counters + 2),
+		       GFP_KERNEL);
+	if (!hsag)
+		goto err_free_stats;
+
+	ret = device->get_hw_stats(device, stats, port_num,
+				   stats->num_counters);
+	if (ret != stats->num_counters)
+		goto err_free_hsag;
+
+	stats->timestamp = jiffies;
+
+	hsag->name = "hw_counters";
+	/* the pointer array lives directly behind the group structure */
+	hsag->attrs = (void *)hsag + sizeof(*hsag);
+
+	for (i = 0; i < stats->num_counters; i++) {
+		hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]);
+		if (!hsag->attrs[i])
+			goto err_free_attrs;
+		/* dynamically allocated attributes need their lockdep key set */
+		sysfs_attr_init(hsag->attrs[i]);
+	}
+
+	/* treat an error here as non-fatal */
+	hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num);
+	if (hsag->attrs[i])
+		sysfs_attr_init(hsag->attrs[i]);
+
+	kobj = port ? &port->kobj : &device->dev.kobj;
+	ret = sysfs_create_group(kobj, hsag);
+	if (ret)
+		goto err_free_attrs;
+
+	if (port) {
+		port->hw_stats_ag = hsag;
+		port->hw_stats = stats;
+	} else {
+		device->hw_stats_ag = hsag;
+		device->hw_stats = stats;
+	}
+
+	return;
+
+err_free_attrs:
+	/* attrs[i] may be NULL (failed slot); kfree(NULL) is a no-op */
+	for (; i >= 0; i--)
+		kfree(hsag->attrs[i]);
+err_free_hsag:
+	kfree(hsag);
+err_free_stats:
+	kfree(stats);
+}
+
 static int add_port(struct ib_device *device, int port_num,
 		    int (*port_callback)(struct ib_device *,
 					 u8, struct kobject *))
@@ -835,6 +1055,14 @@
 			goto err_remove_pkey;
 	}
 
+	/*
+	 * If port == 0, it means we have only one port and the parent
+	 * device, not this port device, should be the holder of the
+	 * hw_counters
+	 */
+	if (device->alloc_hw_stats && port_num)
+		setup_hw_stats(device, p, port_num);
+
 	list_add_tail(&p->kobj.entry, &device->port_list);
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
@@ -972,120 +1200,6 @@
 	&dev_attr_node_desc
 };
 
-/* Show a given an attribute in the statistics group */
-static ssize_t show_protocol_stat(const struct device *device,
-			    struct device_attribute *attr, char *buf,
-			    unsigned offset)
-{
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-	union rdma_protocol_stats stats;
-	ssize_t ret;
-
-	ret = dev->get_protocol_stats(dev, &stats);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%llu\n",
-		       (unsigned long long) ((u64 *) &stats)[offset]);
-}
-
-/* generate a read-only iwarp statistics attribute */
-#define IW_STATS_ENTRY(name)						\
-static ssize_t show_##name(struct device *device,			\
-			   struct device_attribute *attr, char *buf)	\
-{									\
-	return show_protocol_stat(device, attr, buf,			\
-				  offsetof(struct iw_protocol_stats, name) / \
-				  sizeof (u64));			\
-}									\
-static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
-
-IW_STATS_ENTRY(ipInReceives);
-IW_STATS_ENTRY(ipInHdrErrors);
-IW_STATS_ENTRY(ipInTooBigErrors);
-IW_STATS_ENTRY(ipInNoRoutes);
-IW_STATS_ENTRY(ipInAddrErrors);
-IW_STATS_ENTRY(ipInUnknownProtos);
-IW_STATS_ENTRY(ipInTruncatedPkts);
-IW_STATS_ENTRY(ipInDiscards);
-IW_STATS_ENTRY(ipInDelivers);
-IW_STATS_ENTRY(ipOutForwDatagrams);
-IW_STATS_ENTRY(ipOutRequests);
-IW_STATS_ENTRY(ipOutDiscards);
-IW_STATS_ENTRY(ipOutNoRoutes);
-IW_STATS_ENTRY(ipReasmTimeout);
-IW_STATS_ENTRY(ipReasmReqds);
-IW_STATS_ENTRY(ipReasmOKs);
-IW_STATS_ENTRY(ipReasmFails);
-IW_STATS_ENTRY(ipFragOKs);
-IW_STATS_ENTRY(ipFragFails);
-IW_STATS_ENTRY(ipFragCreates);
-IW_STATS_ENTRY(ipInMcastPkts);
-IW_STATS_ENTRY(ipOutMcastPkts);
-IW_STATS_ENTRY(ipInBcastPkts);
-IW_STATS_ENTRY(ipOutBcastPkts);
-IW_STATS_ENTRY(tcpRtoAlgorithm);
-IW_STATS_ENTRY(tcpRtoMin);
-IW_STATS_ENTRY(tcpRtoMax);
-IW_STATS_ENTRY(tcpMaxConn);
-IW_STATS_ENTRY(tcpActiveOpens);
-IW_STATS_ENTRY(tcpPassiveOpens);
-IW_STATS_ENTRY(tcpAttemptFails);
-IW_STATS_ENTRY(tcpEstabResets);
-IW_STATS_ENTRY(tcpCurrEstab);
-IW_STATS_ENTRY(tcpInSegs);
-IW_STATS_ENTRY(tcpOutSegs);
-IW_STATS_ENTRY(tcpRetransSegs);
-IW_STATS_ENTRY(tcpInErrs);
-IW_STATS_ENTRY(tcpOutRsts);
-
-static struct attribute *iw_proto_stats_attrs[] = {
-	&dev_attr_ipInReceives.attr,
-	&dev_attr_ipInHdrErrors.attr,
-	&dev_attr_ipInTooBigErrors.attr,
-	&dev_attr_ipInNoRoutes.attr,
-	&dev_attr_ipInAddrErrors.attr,
-	&dev_attr_ipInUnknownProtos.attr,
-	&dev_attr_ipInTruncatedPkts.attr,
-	&dev_attr_ipInDiscards.attr,
-	&dev_attr_ipInDelivers.attr,
-	&dev_attr_ipOutForwDatagrams.attr,
-	&dev_attr_ipOutRequests.attr,
-	&dev_attr_ipOutDiscards.attr,
-	&dev_attr_ipOutNoRoutes.attr,
-	&dev_attr_ipReasmTimeout.attr,
-	&dev_attr_ipReasmReqds.attr,
-	&dev_attr_ipReasmOKs.attr,
-	&dev_attr_ipReasmFails.attr,
-	&dev_attr_ipFragOKs.attr,
-	&dev_attr_ipFragFails.attr,
-	&dev_attr_ipFragCreates.attr,
-	&dev_attr_ipInMcastPkts.attr,
-	&dev_attr_ipOutMcastPkts.attr,
-	&dev_attr_ipInBcastPkts.attr,
-	&dev_attr_ipOutBcastPkts.attr,
-	&dev_attr_tcpRtoAlgorithm.attr,
-	&dev_attr_tcpRtoMin.attr,
-	&dev_attr_tcpRtoMax.attr,
-	&dev_attr_tcpMaxConn.attr,
-	&dev_attr_tcpActiveOpens.attr,
-	&dev_attr_tcpPassiveOpens.attr,
-	&dev_attr_tcpAttemptFails.attr,
-	&dev_attr_tcpEstabResets.attr,
-	&dev_attr_tcpCurrEstab.attr,
-	&dev_attr_tcpInSegs.attr,
-	&dev_attr_tcpOutSegs.attr,
-	&dev_attr_tcpRetransSegs.attr,
-	&dev_attr_tcpInErrs.attr,
-	&dev_attr_tcpOutRsts.attr,
-	NULL
-};
-
-static struct attribute_group iw_stats_group = {
-	.name	= "proto_stats",
-	.attrs	= iw_proto_stats_attrs,
-};
-
 static void free_port_list_attributes(struct ib_device *device)
 {
 	struct kobject *p, *t;
@@ -1093,6 +1207,10 @@
 	list_for_each_entry_safe(p, t, &device->port_list, entry) {
 		struct ib_port *port = container_of(p, struct ib_port, kobj);
 		list_del(&p->entry);
+		if (port->hw_stats) {
+			kfree(port->hw_stats);
+			free_hsag(&port->kobj, port->hw_stats_ag);
+		}
 		sysfs_remove_group(p, port->pma_table);
 		sysfs_remove_group(p, &port->pkey_group);
 		sysfs_remove_group(p, &port->gid_group);
@@ -1149,11 +1267,8 @@
 		}
 	}
 
-	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
-		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
-		if (ret)
-			goto err_put;
-	}
+	if (device->alloc_hw_stats)
+		setup_hw_stats(device, NULL, 0);
 
 	return 0;
 
@@ -1169,15 +1284,18 @@
 
 void ib_device_unregister_sysfs(struct ib_device *device)
 {
-	/* Hold kobject until ib_dealloc_device() */
-	struct kobject *kobj_dev = kobject_get(&device->dev.kobj);
 	int i;
 
-	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats)
-		sysfs_remove_group(kobj_dev, &iw_stats_group);
+	/* Hold kobject until ib_dealloc_device() */
+	kobject_get(&device->dev.kobj);
 
 	free_port_list_attributes(device);
 
+	if (device->hw_stats) {
+		kfree(device->hw_stats);
+		free_hsag(&device->dev.kobj, device->hw_stats_ag);
+	}
+
 	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
 		device_remove_file(&device->dev, ib_class_attributes[i]);
 
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index c7ad0a4..c0c7cf8 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -8,3 +8,4 @@
 obj-$(CONFIG_INFINIBAND_NES)		+= nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)		+= ocrdma/
 obj-$(CONFIG_INFINIBAND_USNIC)		+= usnic/
+obj-$(CONFIG_INFINIBAND_HFI1)		+= hfi1/
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index de1c61b4..ada2e50 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -327,7 +327,7 @@
 	kfree(cq->sw_queue);
 	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
 			  (1UL << (cq->size_log2))
-			  * sizeof(struct t3_cqe), cq->queue,
+			  * sizeof(struct t3_cqe) + 1, cq->queue,
 			  dma_unmap_addr(cq, mapping));
 	cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
 	return err;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 47cb927..bb1a839 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1218,59 +1218,119 @@
 		       iwch_dev->rdev.rnic_info.pdev->device);
 }
 
-static int iwch_get_mib(struct ib_device *ibdev,
-			union rdma_protocol_stats *stats)
+/*
+ * Slot numbers of the counters this driver reports.  The same values
+ * index both names[] and rdma_hw_stats::value[]; NR_COUNTERS is the
+ * number of slots.
+ */
+enum counters {
+	IPINRECEIVES = 0,
+	IPINHDRERRORS,
+	IPINADDRERRORS,
+	IPINUNKNOWNPROTOS,
+	IPINDISCARDS,
+	IPINDELIVERS,
+	IPOUTREQUESTS,
+	IPOUTDISCARDS,
+	IPOUTNOROUTES,
+	IPREASMTIMEOUT,
+	IPREASMREQDS,
+	IPREASMOKS,
+	IPREASMFAILS,
+	TCPACTIVEOPENS,
+	TCPPASSIVEOPENS,
+	TCPATTEMPTFAILS,
+	TCPESTABRESETS,
+	TCPCURRESTAB,
+	TCPINSEGS,
+	TCPOUTSEGS,
+	TCPRETRANSSEGS,
+	TCPINERRS,
+	TCPOUTRSTS,
+	TCPRTOMIN,
+	TCPRTOMAX,
+	NR_COUNTERS
+};
+
+/*
+ * Counter names handed to rdma_alloc_hw_stats_struct(), one per
+ * enum counters slot.
+ */
+static const char * const names[] = {
+	[IPINRECEIVES]		= "ipInReceives",
+	[IPINHDRERRORS]		= "ipInHdrErrors",
+	[IPINADDRERRORS]	= "ipInAddrErrors",
+	[IPINUNKNOWNPROTOS]	= "ipInUnknownProtos",
+	[IPINDISCARDS]		= "ipInDiscards",
+	[IPINDELIVERS]		= "ipInDelivers",
+	[IPOUTREQUESTS]		= "ipOutRequests",
+	[IPOUTDISCARDS]		= "ipOutDiscards",
+	[IPOUTNOROUTES]		= "ipOutNoRoutes",
+	[IPREASMTIMEOUT]	= "ipReasmTimeout",
+	[IPREASMREQDS]		= "ipReasmReqds",
+	[IPREASMOKS]		= "ipReasmOKs",
+	[IPREASMFAILS]		= "ipReasmFails",
+	[TCPACTIVEOPENS]	= "tcpActiveOpens",
+	[TCPPASSIVEOPENS]	= "tcpPassiveOpens",
+	[TCPATTEMPTFAILS]	= "tcpAttemptFails",
+	[TCPESTABRESETS]	= "tcpEstabResets",
+	[TCPCURRESTAB]		= "tcpCurrEstab",
+	[TCPINSEGS]		= "tcpInSegs",
+	[TCPOUTSEGS]		= "tcpOutSegs",
+	[TCPRETRANSSEGS]	= "tcpRetransSegs",
+	[TCPINERRS]		= "tcpInErrs",
+	[TCPOUTRSTS]		= "tcpOutRsts",
+	[TCPRTOMIN]		= "tcpRtoMin",
+	[TCPRTOMAX]		= "tcpRtoMax",
+};
+
+/*
+ * alloc_hw_stats callback.  Hands out a stats block sized for
+ * NR_COUNTERS when asked for the device (port_num == 0) and NULL for any
+ * individual port.
+ */
+static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev,
+					      u8 port_num)
+{
+	/* names[] must provide exactly one entry per counter slot */
+	BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
+
+	/* Our driver only supports device level stats */
+	if (port_num == 0)
+		return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
+						  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+
+	return NULL;
+}
+
+/*
+ * get_hw_stats callback: read the MIB block through the RDMA_GET_MIB
+ * control call and copy it into @stats.
+ *
+ * Only device level stats are supported, so a non-zero @port (or a NULL
+ * @stats) yields -ENOSYS, as does a failing control call.  @index is not
+ * consulted: every counter is written on each call and the full counter
+ * count is returned.
+ */
+static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+			u8 port, int index)
 {
 	struct iwch_dev *dev;
 	struct tp_mib_stats m;
 	int ret;
 
+	if (port != 0 || !stats)
+		return -ENOSYS;
+
 	PDBG("%s ibdev %p\n", __func__, ibdev);
 	dev = to_iwch_dev(ibdev);
 	ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
 	if (ret)
 		return -ENOSYS;
 
-	memset(stats, 0, sizeof *stats);
-	stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
-				m.ipInReceive_lo;
-	stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
-				  m.ipInHdrErrors_lo;
-	stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
-				   m.ipInAddrErrors_lo;
-	stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
-				      m.ipInUnknownProtos_lo;
-	stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
-				 m.ipInDiscards_lo;
-	stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
-				 m.ipInDelivers_lo;
-	stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
-				  m.ipOutRequests_lo;
-	stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
-				  m.ipOutDiscards_lo;
-	stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
-				  m.ipOutNoRoutes_lo;
-	stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
-	stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
-	stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
-	stats->iw.ipReasmFails = (u64) m.ipReasmFails;
-	stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
-	stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
-	stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
-	stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
-	stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
-	stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
-	stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
-			      m.tcpInSegs_lo;
-	stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
-			       m.tcpOutSegs_lo;
-	stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
-				  m.tcpRetransSeg_lo;
-	stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
-			      m.tcpInErrs_lo;
-	stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
-	stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
-	return 0;
+	/* counters delivered as hi/lo 32-bit halves are recombined into 64 bits */
+	stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) + m.ipInReceive_lo;
+	stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo;
+	stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo;
+	stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo;
+	stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo;
+	stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo;
+	stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo;
+	stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo;
+	stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo;
+	stats->value[IPREASMTIMEOUT] = m.ipReasmTimeout;
+	stats->value[IPREASMREQDS] = m.ipReasmReqds;
+	stats->value[IPREASMOKS] = m.ipReasmOKs;
+	stats->value[IPREASMFAILS] = m.ipReasmFails;
+	stats->value[TCPACTIVEOPENS] = m.tcpActiveOpens;
+	stats->value[TCPPASSIVEOPENS] = m.tcpPassiveOpens;
+	stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails;
+	stats->value[TCPESTABRESETS] = m.tcpEstabResets;
+	/* each slot below takes the MIB field of the same name */
+	stats->value[TCPCURRESTAB] = m.tcpCurrEstab;
+	stats->value[TCPINSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo;
+	stats->value[TCPOUTSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo;
+	stats->value[TCPRETRANSSEGS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo;
+	stats->value[TCPINERRS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo;
+	stats->value[TCPOUTRSTS] = m.tcpOutRsts;
+	stats->value[TCPRTOMIN] = m.tcpRtoMin;
+	stats->value[TCPRTOMAX] = m.tcpRtoMax;
+
+	return stats->num_counters;
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
@@ -1373,7 +1433,8 @@
 	dev->ibdev.req_notify_cq = iwch_arm_cq;
 	dev->ibdev.post_send = iwch_post_send;
 	dev->ibdev.post_recv = iwch_post_receive;
-	dev->ibdev.get_protocol_stats = iwch_get_mib;
+	dev->ibdev.alloc_hw_stats = iwch_alloc_stats;
+	dev->ibdev.get_hw_stats = iwch_get_mib;
 	dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
 	dev->ibdev.get_port_immutable = iwch_port_immutable;
 
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 7574f394..dd8a86b 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -446,20 +446,59 @@
 		       c4iw_dev->rdev.lldi.pdev->device);
 }
 
+/*
+ * Slot numbers of the counters this driver reports, split by IP version.
+ * The same values index both names[] and rdma_hw_stats::value[];
+ * NR_COUNTERS is the number of slots.
+ */
+enum counters {
+	IP4INSEGS = 0,
+	IP4OUTSEGS,
+	IP4RETRANSSEGS,
+	IP4OUTRSTS,
+	IP6INSEGS,
+	IP6OUTSEGS,
+	IP6RETRANSSEGS,
+	IP6OUTRSTS,
+	NR_COUNTERS
+};
+
+/*
+ * Counter names handed to rdma_alloc_hw_stats_struct(), one per
+ * enum counters slot.
+ */
+static const char * const names[] = {
+	[IP4INSEGS]		= "ip4InSegs",
+	[IP4OUTSEGS]		= "ip4OutSegs",
+	[IP4RETRANSSEGS]	= "ip4RetransSegs",
+	[IP4OUTRSTS]		= "ip4OutRsts",
+	[IP6INSEGS]		= "ip6InSegs",
+	[IP6OUTSEGS]		= "ip6OutSegs",
+	[IP6RETRANSSEGS]	= "ip6RetransSegs",
+	[IP6OUTRSTS]		= "ip6OutRsts",
+};
+
+/*
+ * alloc_hw_stats callback.  Hands out a stats block sized for
+ * NR_COUNTERS when asked for the device (port_num == 0) and NULL for any
+ * individual port.
+ */
+static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
+					      u8 port_num)
+{
+	/* names[] must provide exactly one entry per counter slot */
+	BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
+
+	if (port_num == 0)
+		return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
+						  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+
+	return NULL;
+}
+
 static int c4iw_get_mib(struct ib_device *ibdev,
-			union rdma_protocol_stats *stats)
+			struct rdma_hw_stats *stats,
+			u8 port, int index)	/* port and index are not read by this function */
 {
 	struct tp_tcp_stats v4, v6;
 	struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
 
 	cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
-	memset(stats, 0, sizeof *stats);
-	stats->iw.tcpInSegs = v4.tcp_in_segs + v6.tcp_in_segs;
-	stats->iw.tcpOutSegs = v4.tcp_out_segs + v6.tcp_out_segs;
-	stats->iw.tcpRetransSegs = v4.tcp_retrans_segs + v6.tcp_retrans_segs;
-	stats->iw.tcpOutRsts = v4.tcp_out_rsts + v6.tcp_out_rsts;
+	stats->value[IP4INSEGS] = v4.tcp_in_segs;	/* v4 and v6 are reported separately; the removed code summed them */
+	stats->value[IP4OUTSEGS] = v4.tcp_out_segs;
+	stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs;
+	stats->value[IP4OUTRSTS] = v4.tcp_out_rsts;
+	stats->value[IP6INSEGS] = v6.tcp_in_segs;
+	stats->value[IP6OUTSEGS] = v6.tcp_out_segs;
+	stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs;
+	stats->value[IP6OUTRSTS] = v6.tcp_out_rsts;
 
-	return 0;
+	return stats->num_counters;	/* every slot was written, so report the full count */
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
@@ -562,7 +601,8 @@
 	dev->ibdev.req_notify_cq = c4iw_arm_cq;
 	dev->ibdev.post_send = c4iw_post_send;
 	dev->ibdev.post_recv = c4iw_post_receive;
-	dev->ibdev.get_protocol_stats = c4iw_get_mib;
+	dev->ibdev.alloc_hw_stats = c4iw_alloc_stats;
+	dev->ibdev.get_hw_stats = c4iw_get_mib;
 	dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
 	dev->ibdev.get_port_immutable = c4iw_port_immutable;
 	dev->ibdev.drain_sq = c4iw_drain_sq;
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
similarity index 100%
rename from drivers/staging/rdma/hfi1/Kconfig
rename to drivers/infiniband/hw/hfi1/Kconfig
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
similarity index 88%
rename from drivers/staging/rdma/hfi1/Makefile
rename to drivers/infiniband/hw/hfi1/Makefile
index 8dc5938..9b5382c 100644
--- a/drivers/staging/rdma/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -7,7 +7,7 @@
 #
 obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 
-hfi1-y := affinity.o chip.o device.o diag.o driver.o efivar.o \
+hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
 	eprom.o file_ops.o firmware.o \
 	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
 	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/affinity.c
rename to drivers/infiniband/hw/hfi1/affinity.c
diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/affinity.h
rename to drivers/infiniband/hw/hfi1/affinity.h
diff --git a/drivers/staging/rdma/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/aspm.h
rename to drivers/infiniband/hw/hfi1/aspm.h
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/chip.c
rename to drivers/infiniband/hw/hfi1/chip.c
index dcae8e7..3b876da 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1037,6 +1037,7 @@
 static void dc_start(struct hfi1_devdata *);
 static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
 			   unsigned int *np);
+static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd);
 
 /*
  * Error interrupt table entry.  This is used as input to the interrupt
@@ -6105,7 +6106,7 @@
 	}
 
 	/* this access is valid only when the link is up */
-	if ((ppd->host_link_state & HLS_UP) == 0) {
+	if (ppd->host_link_state & HLS_DOWN) {
 		dd_dev_info(dd, "%s: link state %s not up\n",
 			    __func__, link_state_name(ppd->host_link_state));
 		ret = -EBUSY;
@@ -6961,6 +6962,8 @@
 	}
 
 	reset_neighbor_info(ppd);
+	if (ppd->mgmt_allowed)
+		remove_full_mgmt_pkey(ppd);
 
 	/* disable the port */
 	clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
@@ -7069,6 +7072,12 @@
 	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
 }
 
+static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+	ppd->pkeys[2] = 0;	/* NOTE(review): slot 2 presumably holds the full mgmt pkey - confirm */
+	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);	/* return value deliberately discarded */
+}
+
 /*
  * Convert the given link width to the OPA link width bitmask.
  */
@@ -7429,7 +7438,7 @@
 retry:
 	mutex_lock(&ppd->hls_lock);
 	/* only apply if the link is up */
-	if (!(ppd->host_link_state & HLS_UP)) {
+	if (ppd->host_link_state & HLS_DOWN) {
 		/* still going up..wait and retry */
 		if (ppd->host_link_state & HLS_GOING_UP) {
 			if (++tries < 1000) {
@@ -9212,9 +9221,6 @@
 
 	/* Reset the QSFP */
 	mask = (u64)QSFP_HFI0_RESET_N;
-	qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
-	qsfp_mask |= mask;
-	write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, qsfp_mask);
 
 	qsfp_mask = read_csr(dd,
 			     dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
@@ -9252,6 +9258,12 @@
 		dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
 			    __func__);
 
+	/*
+	 * The remaining alarms/warnings don't matter if the link is down.
+	 */
+	if (ppd->host_link_state & HLS_DOWN)
+		return 0;
+
 	if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
 	    (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
 		dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
@@ -9346,9 +9358,8 @@
 		return;
 
 	/*
-	 * Turn DC back on after cables has been
-	 * re-inserted. Up until now, the DC has been in
-	 * reset to save power.
+	 * Turn DC back on after cable has been re-inserted. Up until
+	 * now, the DC has been in reset to save power.
 	 */
 	dc_start(dd);
 
@@ -9480,7 +9491,15 @@
 			return ret;
 	}
 
-	/* tune the SERDES to a ballpark setting for
+	get_port_type(ppd);
+	if (ppd->port_type == PORT_TYPE_QSFP) {
+		set_qsfp_int_n(ppd, 0);
+		wait_for_qsfp_init(ppd);
+		set_qsfp_int_n(ppd, 1);
+	}
+
+	/*
+	 * Tune the SerDes to a ballpark setting for
 	 * optimal signal and bit error rate
 	 * Needs to be done before starting the link
 	 */
@@ -10074,7 +10093,7 @@
  */
 u32 driver_logical_state(struct hfi1_pportdata *ppd)
 {
-	if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
+	if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN))
 		return IB_PORT_DOWN;
 
 	switch (ppd->host_link_state & HLS_UP) {
@@ -14578,7 +14597,7 @@
 		   (reason), (ret))
 
 /*
- * Initialize the Avago Thermal sensor.
+ * Initialize the thermal sensor.
  *
  * After initialization, enable polling of thermal sensor through
  * SBus interface. In order for this to work, the SBus Master
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
similarity index 99%
rename from drivers/staging/rdma/hfi1/chip.h
rename to drivers/infiniband/hw/hfi1/chip.h
index 1948706..66a3279 100644
--- a/drivers/staging/rdma/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -398,6 +398,12 @@
 /* Lane ID for general configuration registers */
 #define GENERAL_CONFIG 4
 
+/* LINK_TUNING_PARAMETERS fields */
+#define TUNING_METHOD_SHIFT 24
+
+/* LINK_OPTIMIZATION_SETTINGS fields */
+#define ENABLE_EXT_DEV_CONFIG_SHIFT 24
+
 /* LOAD_DATA 8051 command shifts and fields */
 #define LOAD_DATA_FIELD_ID_SHIFT 40
 #define LOAD_DATA_FIELD_ID_MASK 0xfull
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/chip_registers.h
rename to drivers/infiniband/hw/hfi1/chip_registers.h
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
similarity index 98%
rename from drivers/staging/rdma/hfi1/common.h
rename to drivers/infiniband/hw/hfi1/common.h
index e9b6bb3..fcc9c21 100644
--- a/drivers/staging/rdma/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -178,7 +178,8 @@
 		     HFI1_CAP_PKEY_CHECK |		\
 		     HFI1_CAP_NO_INTEGRITY)
 
-#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \
+			     HFI1_USER_SWMINOR)
 
 #ifndef HFI1_KERN_TYPE
 #define HFI1_KERN_TYPE 0
@@ -349,6 +350,8 @@
 #define HFI1_BECN_MASK 1
 #define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
 
+#define HFI1_PSM_IOC_BASE_SEQ 0x0
+
 static inline __u64 rhf_to_cpu(const __le32 *rbuf)
 {
 	return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/debugfs.c
rename to drivers/infiniband/hw/hfi1/debugfs.c
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/debugfs.h
rename to drivers/infiniband/hw/hfi1/debugfs.h
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c
similarity index 94%
rename from drivers/staging/rdma/hfi1/device.c
rename to drivers/infiniband/hw/hfi1/device.c
index c05c39d..bf64b5a 100644
--- a/drivers/staging/rdma/hfi1/device.c
+++ b/drivers/infiniband/hw/hfi1/device.c
@@ -60,7 +60,8 @@
 int hfi1_cdev_init(int minor, const char *name,
 		   const struct file_operations *fops,
 		   struct cdev *cdev, struct device **devp,
-		   bool user_accessible)
+		   bool user_accessible,
+		   struct kobject *parent)
 {
 	const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
 	struct device *device = NULL;
@@ -68,6 +69,7 @@
 
 	cdev_init(cdev, fops);
 	cdev->owner = THIS_MODULE;
+	cdev->kobj.parent = parent;
 	kobject_set_name(&cdev->kobj, name);
 
 	ret = cdev_add(cdev, dev, 1);
@@ -82,13 +84,13 @@
 	else
 		device = device_create(class, NULL, dev, NULL, "%s", name);
 
-	if (!IS_ERR(device))
-		goto done;
-	ret = PTR_ERR(device);
-	device = NULL;
-	pr_err("Could not create device for minor %d, %s (err %d)\n",
-	       minor, name, -ret);
-	cdev_del(cdev);
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		device = NULL;
+		pr_err("Could not create device for minor %d, %s (err %d)\n",
+			minor, name, -ret);
+		cdev_del(cdev);
+	}
 done:
 	*devp = device;
 	return ret;
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h
similarity index 97%
rename from drivers/staging/rdma/hfi1/device.h
rename to drivers/infiniband/hw/hfi1/device.h
index 5bb3e83..c3ec19c 100644
--- a/drivers/staging/rdma/hfi1/device.h
+++ b/drivers/infiniband/hw/hfi1/device.h
@@ -50,7 +50,8 @@
 int hfi1_cdev_init(int minor, const char *name,
 		   const struct file_operations *fops,
 		   struct cdev *cdev, struct device **devp,
-		   bool user_accessible);
+		   bool user_accessible,
+		   struct kobject *parent);
 void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
 const char *class_name(void);
 int __init dev_init(void);
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/infiniband/hw/hfi1/dma.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/dma.c
rename to drivers/infiniband/hw/hfi1/dma.c
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/driver.c
rename to drivers/infiniband/hw/hfi1/driver.c
index 700c6fa..c75b0ae 100644
--- a/drivers/staging/rdma/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1161,7 +1161,7 @@
 	ppd->lmc = lmc;
 	hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
 
-	dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
+	dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid);
 
 	return 0;
 }
diff --git a/drivers/staging/rdma/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/efivar.c
rename to drivers/infiniband/hw/hfi1/efivar.c
diff --git a/drivers/staging/rdma/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/efivar.h
rename to drivers/infiniband/hw/hfi1/efivar.h
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/infiniband/hw/hfi1/eprom.c
similarity index 60%
copy from drivers/staging/rdma/hfi1/device.h
copy to drivers/infiniband/hw/hfi1/eprom.c
index 5bb3e83..36b7794 100644
--- a/drivers/staging/rdma/hfi1/device.h
+++ b/drivers/infiniband/hw/hfi1/eprom.c
@@ -1,5 +1,3 @@
-#ifndef _HFI1_DEVICE_H
-#define _HFI1_DEVICE_H
 /*
  * Copyright(c) 2015, 2016 Intel Corporation.
  *
@@ -46,14 +44,59 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
 
-int hfi1_cdev_init(int minor, const char *name,
-		   const struct file_operations *fops,
-		   struct cdev *cdev, struct device **devp,
-		   bool user_accessible);
-void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
-const char *class_name(void);
-int __init dev_init(void);
-void dev_cleanup(void);
+#define CMD_SHIFT 24
+#define CMD_RELEASE_POWERDOWN_NOID  (0xabU << CMD_SHIFT)
 
-#endif                          /* _HFI1_DEVICE_H */
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2	/* full speed */
+
+/*
+ * How long to wait for the EPROM to become available, in ms.
+ * The spec 32 Mb EPROM takes around 40s to erase then write.
+ * Double it for safety.
+ */
+#define EPROM_TIMEOUT 80000 /* ms */
+/*
+ * Initialize the EPROM handler.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+	int ret = 0;
+
+	/* only the discrete chip has an EPROM */
+	if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+		return 0;
+
+	/*
+	 * It is OK if both HFIs reset the EPROM as long as they don't
+	 * do it at the same time.
+	 */
+	ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
+	if (ret) {
+		dd_dev_err(dd,
+			   "%s: unable to acquire EPROM resource, no EPROM support\n",
+			   __func__);
+		goto done_asic;
+	}
+
+	/* reset EPROM to be sure it is in a good state */
+
+	/* set reset */
+	write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+	/* clear reset, set speed */
+	write_csr(dd, ASIC_EEP_CTL_STAT,
+		  EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+	/* wake the device with command "release powerdown NoID" */
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+	dd->eprom_available = true;
+	release_chip_resource(dd, CR_EPROM);
+done_asic:
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/eprom.h
rename to drivers/infiniband/hw/hfi1/eprom.h
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
similarity index 78%
rename from drivers/staging/rdma/hfi1/file_ops.c
rename to drivers/infiniband/hw/hfi1/file_ops.c
index c1c5bf8..7a5b0e6 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -72,8 +72,6 @@
  */
 static int hfi1_file_open(struct inode *, struct file *);
 static int hfi1_file_close(struct inode *, struct file *);
-static ssize_t hfi1_file_write(struct file *, const char __user *,
-			       size_t, loff_t *);
 static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
 static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
 static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
@@ -86,8 +84,7 @@
 static int get_base_info(struct file *, void __user *, __u32);
 static int setup_ctxt(struct file *);
 static int setup_subctxt(struct hfi1_ctxtdata *);
-static int get_user_context(struct file *, struct hfi1_user_info *,
-			    int, unsigned);
+static int get_user_context(struct file *, struct hfi1_user_info *, int);
 static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
 static int allocate_ctxt(struct file *, struct hfi1_devdata *,
 			 struct hfi1_user_info *);
@@ -97,13 +94,15 @@
 static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
 static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
 static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+			    unsigned long arg);
 
 static const struct file_operations hfi1_file_ops = {
 	.owner = THIS_MODULE,
-	.write = hfi1_file_write,
 	.write_iter = hfi1_write_iter,
 	.open = hfi1_file_open,
 	.release = hfi1_file_close,
+	.unlocked_ioctl = hfi1_file_ioctl,
 	.poll = hfi1_poll,
 	.mmap = hfi1_file_mmap,
 	.llseek = noop_llseek,
@@ -169,6 +168,13 @@
 
 static int hfi1_file_open(struct inode *inode, struct file *fp)
 {
+	struct hfi1_devdata *dd = container_of(inode->i_cdev,
+					       struct hfi1_devdata,
+					       user_cdev);
+
+	/* Just take a ref now. Not all opens result in a context assign */
+	kobject_get(&dd->kobj);
+
 	/* The real work is performed later in assign_ctxt() */
 	fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
 	if (fp->private_data) /* no cpu affinity by default */
@@ -176,127 +182,59 @@
 	return fp->private_data ? 0 : -ENOMEM;
 }
 
-static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
-			       size_t count, loff_t *offset)
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+			    unsigned long arg)
 {
-	const struct hfi1_cmd __user *ucmd;
 	struct hfi1_filedata *fd = fp->private_data;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_cmd cmd;
 	struct hfi1_user_info uinfo;
 	struct hfi1_tid_info tinfo;
+	int ret = 0;
 	unsigned long addr;
-	ssize_t consumed = 0, copy = 0, ret = 0;
-	void *dest = NULL;
-	__u64 user_val = 0;
-	int uctxt_required = 1;
-	int must_be_root = 0;
+	int uval = 0;
+	unsigned long ul_uval = 0;
+	u16 uval16 = 0;
 
-	/* FIXME: This interface cannot continue out of staging */
-	if (WARN_ON_ONCE(!ib_safe_file_access(fp)))
-		return -EACCES;
+	hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
+	if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
+	    cmd != HFI1_IOCTL_GET_VERS &&
+	    !uctxt)
+		return -EINVAL;
 
-	if (count < sizeof(cmd)) {
-		ret = -EINVAL;
-		goto bail;
-	}
+	switch (cmd) {
+	case HFI1_IOCTL_ASSIGN_CTXT:
+		if (copy_from_user(&uinfo,
+				   (struct hfi1_user_info __user *)arg,
+				   sizeof(uinfo)))
+			return -EFAULT;
 
-	ucmd = (const struct hfi1_cmd __user *)data;
-	if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
-		ret = -EFAULT;
-		goto bail;
-	}
-
-	consumed = sizeof(cmd);
-
-	switch (cmd.type) {
-	case HFI1_CMD_ASSIGN_CTXT:
-		uctxt_required = 0;	/* assigned user context not required */
-		copy = sizeof(uinfo);
-		dest = &uinfo;
-		break;
-	case HFI1_CMD_SDMA_STATUS_UPD:
-	case HFI1_CMD_CREDIT_UPD:
-		copy = 0;
-		break;
-	case HFI1_CMD_TID_UPDATE:
-	case HFI1_CMD_TID_FREE:
-	case HFI1_CMD_TID_INVAL_READ:
-		copy = sizeof(tinfo);
-		dest = &tinfo;
-		break;
-	case HFI1_CMD_USER_INFO:
-	case HFI1_CMD_RECV_CTRL:
-	case HFI1_CMD_POLL_TYPE:
-	case HFI1_CMD_ACK_EVENT:
-	case HFI1_CMD_CTXT_INFO:
-	case HFI1_CMD_SET_PKEY:
-	case HFI1_CMD_CTXT_RESET:
-		copy = 0;
-		user_val = cmd.addr;
-		break;
-	case HFI1_CMD_EP_INFO:
-	case HFI1_CMD_EP_ERASE_CHIP:
-	case HFI1_CMD_EP_ERASE_RANGE:
-	case HFI1_CMD_EP_READ_RANGE:
-	case HFI1_CMD_EP_WRITE_RANGE:
-		uctxt_required = 0;	/* assigned user context not required */
-		must_be_root = 1;	/* validate user */
-		copy = 0;
-		break;
-	default:
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* If the command comes with user data, copy it. */
-	if (copy) {
-		if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
-			ret = -EFAULT;
-			goto bail;
-		}
-		consumed += copy;
-	}
-
-	/*
-	 * Make sure there is a uctxt when needed.
-	 */
-	if (uctxt_required && !uctxt) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* only root can do these operations */
-	if (must_be_root && !capable(CAP_SYS_ADMIN)) {
-		ret = -EPERM;
-		goto bail;
-	}
-
-	switch (cmd.type) {
-	case HFI1_CMD_ASSIGN_CTXT:
 		ret = assign_ctxt(fp, &uinfo);
 		if (ret < 0)
-			goto bail;
-		ret = setup_ctxt(fp);
+			return ret;
+		ret = setup_ctxt(fp);
 		if (ret)
-			goto bail;
+			return ret;
 		ret = user_init(fp);
 		break;
-	case HFI1_CMD_CTXT_INFO:
-		ret = get_ctxt_info(fp, (void __user *)(unsigned long)
-				    user_val, cmd.len);
+	case HFI1_IOCTL_CTXT_INFO:
+		ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg,
+				    sizeof(struct hfi1_ctxt_info));
 		break;
-	case HFI1_CMD_USER_INFO:
-		ret = get_base_info(fp, (void __user *)(unsigned long)
-				    user_val, cmd.len);
+	case HFI1_IOCTL_USER_INFO:
+		ret = get_base_info(fp, (void __user *)(unsigned long)arg,
+				    sizeof(struct hfi1_base_info));
 		break;
-	case HFI1_CMD_SDMA_STATUS_UPD:
-		break;
-	case HFI1_CMD_CREDIT_UPD:
+	case HFI1_IOCTL_CREDIT_UPD:
 		if (uctxt && uctxt->sc)
 			sc_return_credits(uctxt->sc);
 		break;
-	case HFI1_CMD_TID_UPDATE:
+
+	case HFI1_IOCTL_TID_UPDATE:
+		if (copy_from_user(&tinfo,
+				   (struct hfi1_tid_info __user *)arg,
+				   sizeof(tinfo)))
+			return -EFAULT;
+
 		ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
 		if (!ret) {
 			/*
@@ -305,57 +243,82 @@
 			 * These fields are adjacent in the structure so
 			 * we can copy them at the same time.
 			 */
-			addr = (unsigned long)cmd.addr +
-				offsetof(struct hfi1_tid_info, tidcnt);
+			addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 			if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
 					 sizeof(tinfo.tidcnt) +
 					 sizeof(tinfo.length)))
 				ret = -EFAULT;
 		}
 		break;
-	case HFI1_CMD_TID_INVAL_READ:
-		ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
-		if (ret)
-			break;
-		addr = (unsigned long)cmd.addr +
-			offsetof(struct hfi1_tid_info, tidcnt);
-		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-				 sizeof(tinfo.tidcnt)))
-			ret = -EFAULT;
-		break;
-	case HFI1_CMD_TID_FREE:
+
+	case HFI1_IOCTL_TID_FREE:
+		if (copy_from_user(&tinfo,
+				   (struct hfi1_tid_info __user *)arg,
+				   sizeof(tinfo)))
+			return -EFAULT;
+
 		ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
 		if (ret)
 			break;
-		addr = (unsigned long)cmd.addr +
-			offsetof(struct hfi1_tid_info, tidcnt);
+		addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
 				 sizeof(tinfo.tidcnt)))
 			ret = -EFAULT;
 		break;
-	case HFI1_CMD_RECV_CTRL:
-		ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
+
+	case HFI1_IOCTL_TID_INVAL_READ:
+		if (copy_from_user(&tinfo,
+				   (struct hfi1_tid_info __user *)arg,
+				   sizeof(tinfo)))
+			return -EFAULT;
+
+		ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+		if (ret)
+			break;
+		addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+				 sizeof(tinfo.tidcnt)))
+			ret = -EFAULT;
 		break;
-	case HFI1_CMD_POLL_TYPE:
-		uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
+
+	case HFI1_IOCTL_RECV_CTRL:
+		ret = get_user(uval, (int __user *)arg);
+		if (ret != 0)
+			return -EFAULT;
+		ret = manage_rcvq(uctxt, fd->subctxt, uval);
 		break;
-	case HFI1_CMD_ACK_EVENT:
-		ret = user_event_ack(uctxt, fd->subctxt, user_val);
+
+	case HFI1_IOCTL_POLL_TYPE:
+		ret = get_user(uval, (int __user *)arg);
+		if (ret != 0)
+			return -EFAULT;
+		uctxt->poll_type = (typeof(uctxt->poll_type))uval;
 		break;
-	case HFI1_CMD_SET_PKEY:
+
+	case HFI1_IOCTL_ACK_EVENT:
+		ret = get_user(ul_uval, (unsigned long __user *)arg);
+		if (ret != 0)
+			return -EFAULT;
+		ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
+		break;
+
+	case HFI1_IOCTL_SET_PKEY:
+		ret = get_user(uval16, (u16 __user *)arg);
+		if (ret != 0)
+			return -EFAULT;
 		if (HFI1_CAP_IS_USET(PKEY_CHECK))
-			ret = set_ctxt_pkey(uctxt, fd->subctxt, user_val);
+			ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
 		else
-			ret = -EPERM;
+			return -EPERM;
 		break;
-	case HFI1_CMD_CTXT_RESET: {
+
+	case HFI1_IOCTL_CTXT_RESET: {
 		struct send_context *sc;
 		struct hfi1_devdata *dd;
 
-		if (!uctxt || !uctxt->dd || !uctxt->sc) {
-			ret = -EINVAL;
-			break;
-		}
+		if (!uctxt || !uctxt->dd || !uctxt->sc)
+			return -EINVAL;
+
 		/*
 		 * There is no protection here. User level has to
 		 * guarantee that no one will be writing to the send
@@ -373,10 +336,9 @@
 		wait_event_interruptible_timeout(
 			sc->halt_wait, (sc->flags & SCF_HALTED),
 			msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
-		if (!(sc->flags & SCF_HALTED)) {
-			ret = -ENOLCK;
-			break;
-		}
+		if (!(sc->flags & SCF_HALTED))
+			return -ENOLCK;
+
 		/*
 		 * If the send context was halted due to a Freeze,
 		 * wait until the device has been "unfrozen" before
@@ -387,18 +349,16 @@
 				dd->event_queue,
 				!(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
 				msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
-			if (dd->flags & HFI1_FROZEN) {
-				ret = -ENOLCK;
-				break;
-			}
-			if (dd->flags & HFI1_FORCED_FREEZE) {
+			if (dd->flags & HFI1_FROZEN)
+				return -ENOLCK;
+
+			if (dd->flags & HFI1_FORCED_FREEZE)
 				/*
 				 * Don't allow context reset if we are into
 				 * forced freeze
 				 */
-				ret = -ENODEV;
-				break;
-			}
+				return -ENODEV;
+
 			sc_disable(sc);
 			ret = sc_enable(sc);
 			hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
@@ -410,18 +370,17 @@
 			sc_return_credits(sc);
 		break;
 	}
-	case HFI1_CMD_EP_INFO:
-	case HFI1_CMD_EP_ERASE_CHIP:
-	case HFI1_CMD_EP_ERASE_RANGE:
-	case HFI1_CMD_EP_READ_RANGE:
-	case HFI1_CMD_EP_WRITE_RANGE:
-		ret = handle_eprom_command(fp, &cmd);
+
+	case HFI1_IOCTL_GET_VERS:
+		uval = HFI1_USER_SWVERSION;
+		if (put_user(uval, (int __user *)arg))
+			return -EFAULT;
 		break;
+
+	default:
+		return -EINVAL;
 	}
 
-	if (ret >= 0)
-		ret = consumed;
-bail:
 	return ret;
 }
 
@@ -738,7 +697,9 @@
 {
 	struct hfi1_filedata *fdata = fp->private_data;
 	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
-	struct hfi1_devdata *dd;
+	struct hfi1_devdata *dd = container_of(inode->i_cdev,
+					       struct hfi1_devdata,
+					       user_cdev);
 	unsigned long flags, *ev;
 
 	fp->private_data = NULL;
@@ -747,7 +708,6 @@
 		goto done;
 
 	hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
-	dd = uctxt->dd;
 	mutex_lock(&hfi1_mutex);
 
 	flush_wc();
@@ -813,6 +773,7 @@
 	mutex_unlock(&hfi1_mutex);
 	hfi1_free_ctxtdata(dd, uctxt);
 done:
+	kobject_put(&dd->kobj);
 	kfree(fdata);
 	return 0;
 }
@@ -836,7 +797,7 @@
 static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
 {
 	int i_minor, ret = 0;
-	unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS;
+	unsigned int swmajor, swminor;
 
 	swmajor = uinfo->userversion >> 16;
 	if (swmajor != HFI1_USER_SWMAJOR) {
@@ -846,9 +807,6 @@
 
 	swminor = uinfo->userversion & 0xffff;
 
-	if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
-		alg = uinfo->hfi1_alg;
-
 	mutex_lock(&hfi1_mutex);
 	/* First, lets check if we need to setup a shared context? */
 	if (uinfo->subctxt_cnt) {
@@ -868,7 +826,7 @@
 	 */
 	if (!ret) {
 		i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
-		ret = get_user_context(fp, uinfo, i_minor - 1, alg);
+		ret = get_user_context(fp, uinfo, i_minor);
 	}
 done_unlock:
 	mutex_unlock(&hfi1_mutex);
@@ -876,71 +834,26 @@
 	return ret;
 }
 
-/* return true if the device available for general use */
-static int usable_device(struct hfi1_devdata *dd)
-{
-	struct hfi1_pportdata *ppd = dd->pport;
-
-	return driver_lstate(ppd) == IB_PORT_ACTIVE;
-}
-
 static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
-			    int devno, unsigned alg)
+			    int devno)
 {
 	struct hfi1_devdata *dd = NULL;
-	int ret = 0, devmax, npresent, nup, dev;
+	int devmax, npresent, nup;
 
 	devmax = hfi1_count_units(&npresent, &nup);
-	if (!npresent) {
-		ret = -ENXIO;
-		goto done;
-	}
-	if (!nup) {
-		ret = -ENETDOWN;
-		goto done;
-	}
-	if (devno >= 0) {
-		dd = hfi1_lookup(devno);
-		if (!dd)
-			ret = -ENODEV;
-		else if (!dd->freectxts)
-			ret = -EBUSY;
-	} else {
-		struct hfi1_devdata *pdd;
+	if (!npresent)
+		return -ENXIO;
 
-		if (alg == HFI1_ALG_ACROSS) {
-			unsigned free = 0U;
+	if (!nup)
+		return -ENETDOWN;
 
-			for (dev = 0; dev < devmax; dev++) {
-				pdd = hfi1_lookup(dev);
-				if (!pdd)
-					continue;
-				if (!usable_device(pdd))
-					continue;
-				if (pdd->freectxts &&
-				    pdd->freectxts > free) {
-					dd = pdd;
-					free = pdd->freectxts;
-				}
-			}
-		} else {
-			for (dev = 0; dev < devmax; dev++) {
-				pdd = hfi1_lookup(dev);
-				if (!pdd)
-					continue;
-				if (!usable_device(pdd))
-					continue;
-				if (pdd->freectxts) {
-					dd = pdd;
-					break;
-				}
-			}
-		}
-		if (!dd)
-			ret = -EBUSY;
-	}
-done:
-	return ret ? ret : allocate_ctxt(fp, dd, uinfo);
+	dd = hfi1_lookup(devno);
+	if (!dd)
+		return -ENODEV;
+	else if (!dd->freectxts)
+		return -EBUSY;
+
+	return allocate_ctxt(fp, dd, uinfo);
 }
 
 static int find_shared_ctxt(struct file *fp,
@@ -1546,170 +1459,10 @@
 	return ret;
 }
 
-static int ui_open(struct inode *inode, struct file *filp)
-{
-	struct hfi1_devdata *dd;
-
-	dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
-	filp->private_data = dd; /* for other methods */
-	return 0;
-}
-
-static int ui_release(struct inode *inode, struct file *filp)
-{
-	/* nothing to do */
-	return 0;
-}
-
-static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
-{
-	struct hfi1_devdata *dd = filp->private_data;
-
-	return fixed_size_llseek(filp, offset, whence,
-		(dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE);
-}
-
-/* NOTE: assumes unsigned long is 8 bytes */
-static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
-		       loff_t *f_pos)
-{
-	struct hfi1_devdata *dd = filp->private_data;
-	void __iomem *base = dd->kregbase;
-	unsigned long total, csr_off,
-		barlen = (dd->kregend - dd->kregbase);
-	u64 data;
-
-	/* only read 8 byte quantities */
-	if ((count % 8) != 0)
-		return -EINVAL;
-	/* offset must be 8-byte aligned */
-	if ((*f_pos % 8) != 0)
-		return -EINVAL;
-	/* destination buffer must be 8-byte aligned */
-	if ((unsigned long)buf % 8 != 0)
-		return -EINVAL;
-	/* must be in range */
-	if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
-		return -EINVAL;
-	/* only set the base if we are not starting past the BAR */
-	if (*f_pos < barlen)
-		base += *f_pos;
-	csr_off = *f_pos;
-	for (total = 0; total < count; total += 8, csr_off += 8) {
-		/* accessing LCB CSRs requires more checks */
-		if (is_lcb_offset(csr_off)) {
-			if (read_lcb_csr(dd, csr_off, (u64 *)&data))
-				break; /* failed */
-		}
-		/*
-		 * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
-		 * false parity error.  Avoid the whole issue by not reading
-		 * them.  These registers are defined as having a read value
-		 * of 0.
-		 */
-		else if (csr_off == ASIC_GPIO_CLEAR ||
-			 csr_off == ASIC_GPIO_FORCE ||
-			 csr_off == ASIC_QSFP1_CLEAR ||
-			 csr_off == ASIC_QSFP1_FORCE ||
-			 csr_off == ASIC_QSFP2_CLEAR ||
-			 csr_off == ASIC_QSFP2_FORCE)
-			data = 0;
-		else if (csr_off >= barlen) {
-			/*
-			 * read_8051_data can read more than just 8 bytes at
-			 * a time. However, folding this into the loop and
-			 * handling the reads in 8 byte increments allows us
-			 * to smoothly transition from chip memory to 8051
-			 * memory.
-			 */
-			if (read_8051_data(dd,
-					   (u32)(csr_off - barlen),
-					   sizeof(data), &data))
-				break; /* failed */
-		} else
-			data = readq(base + total);
-		if (put_user(data, (unsigned long __user *)(buf + total)))
-			break;
-	}
-	*f_pos += total;
-	return total;
-}
-
-/* NOTE: assumes unsigned long is 8 bytes */
-static ssize_t ui_write(struct file *filp, const char __user *buf,
-			size_t count, loff_t *f_pos)
-{
-	struct hfi1_devdata *dd = filp->private_data;
-	void __iomem *base;
-	unsigned long total, data, csr_off;
-	int in_lcb;
-
-	/* only write 8 byte quantities */
-	if ((count % 8) != 0)
-		return -EINVAL;
-	/* offset must be 8-byte aligned */
-	if ((*f_pos % 8) != 0)
-		return -EINVAL;
-	/* source buffer must be 8-byte aligned */
-	if ((unsigned long)buf % 8 != 0)
-		return -EINVAL;
-	/* must be in range */
-	if (*f_pos + count > dd->kregend - dd->kregbase)
-		return -EINVAL;
-
-	base = (void __iomem *)dd->kregbase + *f_pos;
-	csr_off = *f_pos;
-	in_lcb = 0;
-	for (total = 0; total < count; total += 8, csr_off += 8) {
-		if (get_user(data, (unsigned long __user *)(buf + total)))
-			break;
-		/* accessing LCB CSRs requires a special procedure */
-		if (is_lcb_offset(csr_off)) {
-			if (!in_lcb) {
-				int ret = acquire_lcb_access(dd, 1);
-
-				if (ret)
-					break;
-				in_lcb = 1;
-			}
-		} else {
-			if (in_lcb) {
-				release_lcb_access(dd, 1);
-				in_lcb = 0;
-			}
-		}
-		writeq(data, base + total);
-	}
-	if (in_lcb)
-		release_lcb_access(dd, 1);
-	*f_pos += total;
-	return total;
-}
-
-static const struct file_operations ui_file_ops = {
-	.owner = THIS_MODULE,
-	.llseek = ui_lseek,
-	.read = ui_read,
-	.write = ui_write,
-	.open = ui_open,
-	.release = ui_release,
-};
-
-#define UI_OFFSET 192	/* device minor offset for UI devices */
-static int create_ui = 1;
-
-static struct cdev wildcard_cdev;
-static struct device *wildcard_device;
-
-static atomic_t user_count = ATOMIC_INIT(0);
-
 static void user_remove(struct hfi1_devdata *dd)
 {
-	if (atomic_dec_return(&user_count) == 0)
-		hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
 
 	hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
-	hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
 }
 
 static int user_add(struct hfi1_devdata *dd)
@@ -1717,34 +1470,13 @@
 	char name[10];
 	int ret;
 
-	if (atomic_inc_return(&user_count) == 1) {
-		ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
-				     &wildcard_cdev, &wildcard_device,
-				     true);
-		if (ret)
-			goto done;
-	}
-
 	snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
-	ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
+	ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
 			     &dd->user_cdev, &dd->user_device,
-			     true);
+			     true, &dd->kobj);
 	if (ret)
-		goto done;
+		user_remove(dd);
 
-	if (create_ui) {
-		snprintf(name, sizeof(name),
-			 "%s_ui%d", class_name(), dd->unit);
-		ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
-				     &dd->ui_cdev, &dd->ui_device,
-				     false);
-		if (ret)
-			goto done;
-	}
-
-	return 0;
-done:
-	user_remove(dd);
 	return ret;
 }
 
@@ -1753,13 +1485,7 @@
  */
 int hfi1_device_create(struct hfi1_devdata *dd)
 {
-	int r, ret;
-
-	r = user_add(dd);
-	ret = hfi1_diag_add(dd);
-	if (r && !ret)
-		ret = r;
-	return ret;
+	return user_add(dd);
 }
 
 /*
@@ -1769,5 +1495,4 @@
 void hfi1_device_remove(struct hfi1_devdata *dd)
 {
 	user_remove(dd);
-	hfi1_diag_remove(dd);
 }
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/firmware.c
rename to drivers/infiniband/hw/hfi1/firmware.c
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
similarity index 99%
rename from drivers/staging/rdma/hfi1/hfi.h
rename to drivers/infiniband/hw/hfi1/hfi.h
index 7b78d56..4417a0f 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -453,6 +453,7 @@
 #define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP)
 
 #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
+#define HLS_DOWN (~(HLS_UP))
 
 /* use this MTU size if none other is given */
 #define HFI1_DEFAULT_ACTIVE_MTU 10240
@@ -1168,6 +1169,7 @@
 	atomic_t aspm_disabled_cnt;
 
 	struct hfi1_affinity *affinity;
+	struct kobject kobj;
 };
 
 /* 8051 firmware version helper */
@@ -1882,9 +1884,8 @@
 		get_unit_name((dd)->unit), ##__VA_ARGS__)
 
 #define hfi1_dev_porterr(dd, port, fmt, ...) \
-	dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
-			get_unit_name((dd)->unit), (dd)->unit, (port), \
-			##__VA_ARGS__)
+	dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \
+			get_unit_name((dd)->unit), (port), ##__VA_ARGS__)
 
 /*
  * this is used for formatting hw error messages...
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
similarity index 98%
rename from drivers/staging/rdma/hfi1/init.c
rename to drivers/infiniband/hw/hfi1/init.c
index 502b7cf..5cc492e 100644
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -732,12 +732,12 @@
 		lastfail = hfi1_create_rcvhdrq(dd, rcd);
 		if (!lastfail)
 			lastfail = hfi1_setup_eagerbufs(rcd);
-		if (lastfail)
+		if (lastfail) {
 			dd_dev_err(dd,
 				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+			ret = lastfail;
+		}
 	}
-	if (lastfail)
-		ret = lastfail;
 
 	/* Allocate enough memory for user event notification. */
 	len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
@@ -989,8 +989,10 @@
 	dd->asic_data = NULL;
 }
 
-void hfi1_free_devdata(struct hfi1_devdata *dd)
+static void __hfi1_free_devdata(struct kobject *kobj)
 {
+	struct hfi1_devdata *dd =
+		container_of(kobj, struct hfi1_devdata, kobj);
 	unsigned long flags;
 
 	spin_lock_irqsave(&hfi1_devs_lock, flags);
@@ -1007,6 +1009,15 @@
 	rvt_dealloc_device(&dd->verbs_dev.rdi);
 }
 
+static struct kobj_type hfi1_devdata_type = {
+	.release = __hfi1_free_devdata,
+};
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+	kobject_put(&dd->kobj);
+}
+
 /*
  * Allocate our primary per-unit data structure.  Must be done via verbs
  * allocator, because the verbs cleanup process both does cleanup and
@@ -1102,6 +1113,7 @@
 			&pdev->dev,
 			"Could not alloc cpulist info, cpu affinity might be wrong\n");
 	}
+	kobject_init(&dd->kobj, &hfi1_devdata_type);
 	return dd;
 
 bail:
@@ -1300,7 +1312,7 @@
 
 		spin_lock(&ppd->cc_state_lock);
 		cc_state = get_cc_state(ppd);
-		rcu_assign_pointer(ppd->cc_state, NULL);
+		RCU_INIT_POINTER(ppd->cc_state, NULL);
 		spin_unlock(&ppd->cc_state_lock);
 
 		if (cc_state)
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/intr.c
rename to drivers/infiniband/hw/hfi1/intr.c
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/iowait.h
rename to drivers/infiniband/hw/hfi1/iowait.h
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
similarity index 98%
rename from drivers/staging/rdma/hfi1/mad.c
rename to drivers/infiniband/hw/hfi1/mad.c
index ed58cf2..2190295 100644
--- a/drivers/staging/rdma/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -1403,6 +1403,12 @@
 		if (key == okey)
 			continue;
 		/*
+		 * Don't update pkeys[2] if this HFI port was not given
+		 * MgmtAllowed by its neighbor and that neighbor is a switch.
+		 */
+		if (i == 2 && !ppd->mgmt_allowed && ppd->neighbor_type == 1)
+			continue;
+		/*
 		 * The SM gives us the complete PKey table. We have
 		 * to ensure that we put the PKeys in the matching
 		 * slots.
@@ -3363,6 +3369,50 @@
 	return reply((struct ib_mad_hdr *)smp);
 }
 
+/*
+ * Apply congestion control information stored in the ppd to the
+ * active structure.
+ */
+static void apply_cc_state(struct hfi1_pportdata *ppd)
+{
+	struct cc_state *old_cc_state, *new_cc_state;
+
+	new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+	if (!new_cc_state)
+		return;
+
+	/*
+	 * Hold the lock for updating *and* to prevent ppd information
+	 * from changing during the update.
+	 */
+	spin_lock(&ppd->cc_state_lock);
+
+	old_cc_state = get_cc_state(ppd);
+	if (!old_cc_state) {
+		/* never active, or shutting down */
+		spin_unlock(&ppd->cc_state_lock);
+		kfree(new_cc_state);
+		return;
+	}
+
+	*new_cc_state = *old_cc_state;
+
+	new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1;
+	memcpy(new_cc_state->cct.entries, ppd->ccti_entries,
+	       ppd->total_cct_entry * sizeof(struct ib_cc_table_entry));
+
+	new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+	new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+	memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+	       OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
+	rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+	spin_unlock(&ppd->cc_state_lock);
+
+	call_rcu(&old_cc_state->rcu, cc_state_reclaim);
+}
+
 static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
 				       struct ib_device *ibdev, u8 port,
 				       u32 *resp_len)
@@ -3374,6 +3424,11 @@
 	struct opa_congestion_setting_entry_shadow *entries;
 	int i;
 
+	/*
+	 * Save details from packet into the ppd.  Hold the cc_state_lock so
+	 * our information is consistent with anyone trying to apply the state.
+	 */
+	spin_lock(&ppd->cc_state_lock);
 	ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
 
 	entries = ppd->congestion_entries;
@@ -3384,6 +3439,10 @@
 			p->entries[i].trigger_threshold;
 		entries[i].ccti_min = p->entries[i].ccti_min;
 	}
+	spin_unlock(&ppd->cc_state_lock);
+
+	/* now apply the information */
+	apply_cc_state(ppd);
 
 	return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
 					   resp_len);
@@ -3526,7 +3585,6 @@
 	int i, j;
 	u32 sentry, eentry;
 	u16 ccti_limit;
-	struct cc_state *old_cc_state, *new_cc_state;
 
 	/* sanity check n_blocks, start_block */
 	if (n_blocks == 0 ||
@@ -3546,45 +3604,20 @@
 		return reply((struct ib_mad_hdr *)smp);
 	}
 
-	new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
-	if (!new_cc_state)
-		goto getit;
-
+	/*
+	 * Save details from packet into the ppd.  Hold the cc_state_lock so
+	 * our information is consistent with anyone trying to apply the state.
+	 */
 	spin_lock(&ppd->cc_state_lock);
-
-	old_cc_state = get_cc_state(ppd);
-
-	if (!old_cc_state) {
-		spin_unlock(&ppd->cc_state_lock);
-		kfree(new_cc_state);
-		return reply((struct ib_mad_hdr *)smp);
-	}
-
-	*new_cc_state = *old_cc_state;
-
-	new_cc_state->cct.ccti_limit = ccti_limit;
-
-	entries = ppd->ccti_entries;
 	ppd->total_cct_entry = ccti_limit + 1;
-
+	entries = ppd->ccti_entries;
 	for (j = 0, i = sentry; i < eentry; j++, i++)
 		entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
-
-	memcpy(new_cc_state->cct.entries, entries,
-	       eentry * sizeof(struct ib_cc_table_entry));
-
-	new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
-	new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
-	memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
-	       OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
-
-	rcu_assign_pointer(ppd->cc_state, new_cc_state);
-
 	spin_unlock(&ppd->cc_state_lock);
 
-	call_rcu(&old_cc_state->rcu, cc_state_reclaim);
+	/* now apply the information */
+	apply_cc_state(ppd);
 
-getit:
 	return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
 }
 
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/mad.h
rename to drivers/infiniband/hw/hfi1/mad.h
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
similarity index 95%
rename from drivers/staging/rdma/hfi1/mmu_rb.c
rename to drivers/infiniband/hw/hfi1/mmu_rb.c
index 2b0e91d..b7a80aa 100644
--- a/drivers/staging/rdma/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -45,6 +45,7 @@
  *
  */
 #include <linux/list.h>
+#include <linux/rculist.h>
 #include <linux/mmu_notifier.h>
 #include <linux/interval_tree_generic.h>
 
@@ -97,7 +98,6 @@
 int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
 {
 	struct mmu_rb_handler *handlr;
-	unsigned long flags;
 
 	if (!ops->invalidate)
 		return -EINVAL;
@@ -111,9 +111,9 @@
 	INIT_HLIST_NODE(&handlr->mn.hlist);
 	spin_lock_init(&handlr->lock);
 	handlr->mn.ops = &mn_opts;
-	spin_lock_irqsave(&mmu_rb_lock, flags);
-	list_add_tail(&handlr->list, &mmu_rb_handlers);
-	spin_unlock_irqrestore(&mmu_rb_lock, flags);
+	spin_lock(&mmu_rb_lock);
+	list_add_tail_rcu(&handlr->list, &mmu_rb_handlers);
+	spin_unlock(&mmu_rb_lock);
 
 	return mmu_notifier_register(&handlr->mn, current->mm);
 }
@@ -130,9 +130,10 @@
 	if (current->mm)
 		mmu_notifier_unregister(&handler->mn, current->mm);
 
-	spin_lock_irqsave(&mmu_rb_lock, flags);
-	list_del(&handler->list);
-	spin_unlock_irqrestore(&mmu_rb_lock, flags);
+	spin_lock(&mmu_rb_lock);
+	list_del_rcu(&handler->list);
+	spin_unlock(&mmu_rb_lock);
+	synchronize_rcu();
 
 	spin_lock_irqsave(&handler->lock, flags);
 	if (!RB_EMPTY_ROOT(root)) {
@@ -271,16 +272,15 @@
 static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
 {
 	struct mmu_rb_handler *handler;
-	unsigned long flags;
 
-	spin_lock_irqsave(&mmu_rb_lock, flags);
-	list_for_each_entry(handler, &mmu_rb_handlers, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) {
 		if (handler->root == root)
 			goto unlock;
 	}
 	handler = NULL;
 unlock:
-	spin_unlock_irqrestore(&mmu_rb_lock, flags);
+	rcu_read_unlock();
 	return handler;
 }
 
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/mmu_rb.h
rename to drivers/infiniband/hw/hfi1/mmu_rb.h
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/opa_compat.h
rename to drivers/infiniband/hw/hfi1/opa_compat.h
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/pcie.c
rename to drivers/infiniband/hw/hfi1/pcie.c
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/pio.c
rename to drivers/infiniband/hw/hfi1/pio.c
index c67b9ad..d5edb1a 100644
--- a/drivers/staging/rdma/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1835,8 +1835,7 @@
 	struct pio_vl_map *oldmap, *newmap;
 
 	if (!vl_scontexts) {
-		/* send context 0 reserved for VL15 */
-		for (i = 1; i < dd->num_send_contexts; i++)
+		for (i = 0; i < dd->num_send_contexts; i++)
 			if (dd->send_contexts[i].type == SC_KERNEL)
 				num_kernel_send_contexts++;
 		/* truncate divide */
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
similarity index 98%
rename from drivers/staging/rdma/hfi1/pio.h
rename to drivers/infiniband/hw/hfi1/pio.h
index 53a08ed..464cbd2 100644
--- a/drivers/staging/rdma/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -49,10 +49,10 @@
 
 /* send context types */
 #define SC_KERNEL 0
-#define SC_ACK    1
-#define SC_USER   2
-#define SC_VL15   3
-#define SC_MAX    4
+#define SC_VL15   1
+#define SC_ACK    2
+#define SC_USER   3	/* must be the last one: it may take all left */
+#define SC_MAX    4	/* count of send context types */
 
 /* invalid send context index */
 #define INVALID_SCI 0xff
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/pio_copy.c
rename to drivers/infiniband/hw/hfi1/pio_copy.c
diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c
similarity index 98%
rename from drivers/staging/rdma/hfi1/platform.c
rename to drivers/infiniband/hw/hfi1/platform.c
index 8fe8a20..03df932 100644
--- a/drivers/staging/rdma/hfi1/platform.c
+++ b/drivers/infiniband/hw/hfi1/platform.c
@@ -87,6 +87,17 @@
 	 */
 }
 
+/*
+ * Read the port type from the platform config port table into
+ * ppd->port_type; fall back to PORT_TYPE_UNKNOWN if the read fails.
+ */
+void get_port_type(struct hfi1_pportdata *ppd)
+{
+	if (get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+				      PORT_TABLE_PORT_TYPE, &ppd->port_type, 4))
+		ppd->port_type = PORT_TYPE_UNKNOWN;
+}
+
 int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
 {
 	u8 tx_ctrl_byte = on ? 0x0 : 0xF;
@@ -529,7 +540,8 @@
 	/* Enable external device config if channel is limiting active */
 	read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
 			 GENERAL_CONFIG, &config_data);
-	config_data |= limiting_active;
+	config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT);
+	config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT);
 	ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
 			       GENERAL_CONFIG, config_data);
 	if (ret != HCMD_SUCCESS)
@@ -542,7 +554,8 @@
 	/* Pass tuning method to 8051 */
 	read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
 			 &config_data);
-	config_data |= tuning_method;
+	config_data &= ~(0xff << TUNING_METHOD_SHIFT);
+	config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT);
 	ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
 			       config_data);
 	if (ret != HCMD_SUCCESS)
@@ -564,8 +577,8 @@
 		ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
 				       GENERAL_CONFIG, &config_data);
 		/* Clear, then set the external device config field */
-		config_data &= ~(0xFF << 24);
-		config_data |= (external_device_config << 24);
+		config_data &= ~(u32)0xFF;
+		config_data |= external_device_config;
 		ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
 				       GENERAL_CONFIG, config_data);
 		if (ret != HCMD_SUCCESS)
@@ -784,12 +797,6 @@
 		return;
 	}
 
-	ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-					PORT_TABLE_PORT_TYPE, &ppd->port_type,
-					4);
-	if (ret)
-		ppd->port_type = PORT_TYPE_UNKNOWN;
-
 	switch (ppd->port_type) {
 	case PORT_TYPE_DISCONNECTED:
 		ppd->offline_disabled_reason =
diff --git a/drivers/staging/rdma/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h
similarity index 99%
rename from drivers/staging/rdma/hfi1/platform.h
rename to drivers/infiniband/hw/hfi1/platform.h
index 19620cf..e2c2161 100644
--- a/drivers/staging/rdma/hfi1/platform.h
+++ b/drivers/infiniband/hw/hfi1/platform.h
@@ -298,6 +298,7 @@
 /* platform.c */
 void get_platform_config(struct hfi1_devdata *dd);
 void free_platform_config(struct hfi1_devdata *dd);
+void get_port_type(struct hfi1_pportdata *ppd);
 int set_qsfp_tx(struct hfi1_pportdata *ppd, int on);
 void tune_serdes(struct hfi1_pportdata *ppd);
 
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
similarity index 98%
rename from drivers/staging/rdma/hfi1/qp.c
rename to drivers/infiniband/hw/hfi1/qp.c
index 91eb423..1a942ff 100644
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -49,7 +49,6 @@
 #include <linux/vmalloc.h>
 #include <linux/hash.h>
 #include <linux/module.h>
-#include <linux/random.h>
 #include <linux/seq_file.h>
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_qp.h>
@@ -161,9 +160,6 @@
  * This function is what we would push to the core layer if we wanted to be a
  * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
  * to blindly pass the MTU enum value from the PathRecord to us.
- *
- * The actual flag used to determine "8k MTU" will change and is currently
- * unknown.
  */
 static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
 {
@@ -516,6 +512,7 @@
 static void iowait_sdma_drained(struct iowait *wait)
 {
 	struct rvt_qp *qp = iowait_to_qp(wait);
+	unsigned long flags;
 
 	/*
 	 * This happens when the send engine notes
@@ -523,12 +520,12 @@
 	 * do the flush work until that QP's
 	 * sdma work has finished.
 	 */
-	spin_lock(&qp->s_lock);
+	spin_lock_irqsave(&qp->s_lock, flags);
 	if (qp->s_flags & RVT_S_WAIT_DMA) {
 		qp->s_flags &= ~RVT_S_WAIT_DMA;
 		hfi1_schedule_send(qp);
 	}
-	spin_unlock(&qp->s_lock);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
 /**
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/qp.h
rename to drivers/infiniband/hw/hfi1/qp.h
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/qsfp.c
rename to drivers/infiniband/hw/hfi1/qsfp.c
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/qsfp.h
rename to drivers/infiniband/hw/hfi1/qsfp.h
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/rc.c
rename to drivers/infiniband/hw/hfi1/rc.c
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/ruc.c
rename to drivers/infiniband/hw/hfi1/ruc.c
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/sdma.c
rename to drivers/infiniband/hw/hfi1/sdma.c
index abb8ebc..f9befc0 100644
--- a/drivers/staging/rdma/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -134,6 +134,7 @@
 	[sdma_state_s99_running]                = "s99_Running",
 };
 
+#ifdef CONFIG_SDMA_VERBOSITY
 static const char * const sdma_event_names[] = {
 	[sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
 	[sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
@@ -150,6 +151,7 @@
 	[sdma_event_e85_link_down]    = "e85_LinkDown",
 	[sdma_event_e90_sw_halted]    = "e90_SwHalted",
 };
+#endif
 
 static const struct sdma_set_state_action sdma_action_table[] = {
 	[sdma_state_s00_hw_down] = {
@@ -376,7 +378,7 @@
 	sdma_txclean(sde->dd, tx);
 	if (complete)
 		(*complete)(tx, res);
-	if (iowait_sdma_dec(wait) && wait)
+	if (wait && iowait_sdma_dec(wait))
 		iowait_drain_wakeup(wait);
 }
 
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/sdma.h
rename to drivers/infiniband/hw/hfi1/sdma.h
diff --git a/drivers/staging/rdma/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/sdma_txreq.h
rename to drivers/infiniband/hw/hfi1/sdma_txreq.h
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/sysfs.c
rename to drivers/infiniband/hw/hfi1/sysfs.c
index 8cd6df8..91fc2ae 100644
--- a/drivers/staging/rdma/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -721,8 +721,8 @@
 	}
 
 	dd_dev_info(dd,
-		    "IB%u: Congestion Control Agent enabled for port %d\n",
-		    dd->unit, port_num);
+		    "Congestion Control Agent enabled for port %d\n",
+		    port_num);
 
 	return 0;
 
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
similarity index 96%
rename from drivers/staging/rdma/hfi1/trace.c
rename to drivers/infiniband/hw/hfi1/trace.c
index 8b62fef..79b2952 100644
--- a/drivers/staging/rdma/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -66,6 +66,7 @@
 #define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
 #define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
 #define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define IETH_PRN "ieth rkey 0x%.8x"
 #define ATOMICACKETH_PRN "origdata %lld"
 #define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
 
@@ -166,6 +167,12 @@
 				 be32_to_cpu(eh->ud.deth[0]),
 				 be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK);
 		break;
+	/* ieth */
+	case OP(RC, SEND_LAST_WITH_INVALIDATE):
+	case OP(RC, SEND_ONLY_WITH_INVALIDATE):
+		trace_seq_printf(p, IETH_PRN,
+				 be32_to_cpu(eh->ieth));
+		break;
 	}
 	trace_seq_putc(p, 0);
 	return ret;
@@ -233,3 +240,4 @@
 __hfi1_trace_fn(RCVCTRL);
 __hfi1_trace_fn(TID);
 __hfi1_trace_fn(MMU);
+__hfi1_trace_fn(IOCTL);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
similarity index 99%
rename from drivers/staging/rdma/hfi1/trace.h
rename to drivers/infiniband/hw/hfi1/trace.h
index 963dc94..28c1d08 100644
--- a/drivers/staging/rdma/hfi1/trace.h
+++ b/drivers/infiniband/hw/hfi1/trace.h
@@ -74,8 +74,8 @@
 
 TRACE_EVENT(hfi1_rcvhdr,
 	    TP_PROTO(struct hfi1_devdata *dd,
-		     u64 eflags,
 		     u32 ctxt,
+		     u64 eflags,
 		     u32 etype,
 		     u32 hlen,
 		     u32 tlen,
@@ -392,6 +392,8 @@
 	ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
 	ib_opcode_name(RC_COMPARE_SWAP),                   \
 	ib_opcode_name(RC_FETCH_ADD),                      \
+	ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
+	ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
 	ib_opcode_name(UC_SEND_FIRST),                     \
 	ib_opcode_name(UC_SEND_MIDDLE),                    \
 	ib_opcode_name(UC_SEND_LAST),                      \
@@ -1341,6 +1343,7 @@
 __hfi1_trace_def(RCVCTRL);
 __hfi1_trace_def(TID);
 __hfi1_trace_def(MMU);
+__hfi1_trace_def(IOCTL);
 
 #define hfi1_cdbg(which, fmt, ...) \
 	__hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/twsi.c
rename to drivers/infiniband/hw/hfi1/twsi.c
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/twsi.h
rename to drivers/infiniband/hw/hfi1/twsi.h
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/uc.c
rename to drivers/infiniband/hw/hfi1/uc.c
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/ud.c
rename to drivers/infiniband/hw/hfi1/ud.c
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/user_exp_rcv.c
rename to drivers/infiniband/hw/hfi1/user_exp_rcv.c
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/user_exp_rcv.h
rename to drivers/infiniband/hw/hfi1/user_exp_rcv.h
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/user_pages.c
rename to drivers/infiniband/hw/hfi1/user_pages.c
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/user_sdma.c
rename to drivers/infiniband/hw/hfi1/user_sdma.c
index 0014c9c..29f4795 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -166,6 +166,8 @@
 
 #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
 
+struct sdma_mmu_node;
+
 struct user_sdma_iovec {
 	struct list_head list;
 	struct iovec iov;
@@ -178,6 +180,7 @@
 	 * which we last left off.
 	 */
 	u64 offset;
+	struct sdma_mmu_node *node;
 };
 
 #define SDMA_CACHE_NODE_EVICT BIT(0)
@@ -507,6 +510,7 @@
 	struct sdma_req_info info;
 	struct user_sdma_request *req;
 	u8 opcode, sc, vl;
+	int req_queued = 0;
 
 	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 		hfi1_cdbg(
@@ -703,6 +707,7 @@
 
 	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 	atomic_inc(&pq->n_reqs);
+	req_queued = 1;
 	/* Send the first N packets in the request to buy us some time */
 	ret = user_sdma_send_pkts(req, pcount);
 	if (unlikely(ret < 0 && ret != -EBUSY)) {
@@ -747,7 +752,8 @@
 	return 0;
 free_req:
 	user_sdma_free_request(req, true);
-	pq_update(pq);
+	if (req_queued)
+		pq_update(pq);
 	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
 	return ret;
 }
@@ -1153,6 +1159,7 @@
 	}
 	iovec->pages = node->pages;
 	iovec->npages = npages;
+	iovec->node = node;
 
 	ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
 	if (ret) {
@@ -1519,18 +1526,13 @@
 	}
 	if (req->data_iovs) {
 		struct sdma_mmu_node *node;
-		struct mmu_rb_node *mnode;
 		int i;
 
 		for (i = 0; i < req->data_iovs; i++) {
-			mnode = hfi1_mmu_rb_search(
-				&req->pq->sdma_rb_root,
-				(unsigned long)req->iovs[i].iov.iov_base,
-				req->iovs[i].iov.iov_len);
-			if (!mnode || IS_ERR(mnode))
+			node = req->iovs[i].node;
+			if (!node)
 				continue;
 
-			node = container_of(mnode, struct sdma_mmu_node, rb);
 			if (unpin)
 				hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
 						   &node->rb);
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/user_sdma.h
rename to drivers/infiniband/hw/hfi1/user_sdma.h
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
similarity index 99%
rename from drivers/staging/rdma/hfi1/verbs.c
rename to drivers/infiniband/hw/hfi1/verbs.c
index 9cdc85f..849c4b9 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -52,7 +52,6 @@
 #include <linux/utsname.h>
 #include <linux/rculist.h>
 #include <linux/mm.h>
-#include <linux/random.h>
 #include <linux/vmalloc.h>
 
 #include "hfi.h"
@@ -336,6 +335,8 @@
 	[IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
 	[IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
 	[IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
+	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
+	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
 	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
@@ -946,7 +947,6 @@
 
 			dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
 			dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
-			dev->n_piowait++;
 			qp->s_flags |= flag;
 			was_empty = list_empty(&sc->piowait);
 			list_add_tail(&priv->s_iowait.list, &sc->piowait);
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
similarity index 99%
rename from drivers/staging/rdma/hfi1/verbs.h
rename to drivers/infiniband/hw/hfi1/verbs.h
index 3ee2239..4883567 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -152,6 +152,7 @@
 	} at;
 	__be32 imm_data;
 	__be32 aeth;
+	__be32 ieth;
 	struct ib_atomic_eth atomic_eth;
 }  __packed;
 
diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
similarity index 100%
rename from drivers/staging/rdma/hfi1/verbs_txreq.c
rename to drivers/infiniband/hw/hfi1/verbs_txreq.c
diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
similarity index 100%
rename from drivers/staging/rdma/hfi1/verbs_txreq.h
rename to drivers/infiniband/hw/hfi1/verbs_txreq.h
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 4a740f7..02a735b 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2361,58 +2361,130 @@
 	return 0;
 }
 
+static const char * const i40iw_hw_stat_names[] = {
+	/* 32bit names, indexed directly by I40IW_HW_STAT_INDEX_* */
+	[I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards",
+	[I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts",
+	[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes",
+	[I40IW_HW_STAT_INDEX_IP6RXDISCARD] = "ip6InDiscards",
+	[I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts",
+	[I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes",
+	[I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs",
+	[I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors",
+	[I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors",
+	/* 64bit names, offset by I40IW_HW_STAT_INDEX_MAX_32 */
+	[I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4InOctets",
+	[I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4InPkts",
+	[I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4InReasmRqd",
+	[I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4InMcastPkts",
+	[I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4OutOctets",
+	[I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4OutPkts",
+	[I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4OutSegRqd",
+	[I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip4OutMcastPkts",
+	[I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6InOctets",
+	[I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6InPkts",
+	[I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6InReasmRqd",
+	[I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6InMcastPkts",
+	[I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6OutOctets",
+	[I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6OutPkts",
+	[I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6OutSegRqd",
+	[I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"ip6OutMcastPkts",
+	[I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"tcpInSegs",
+	[I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] =
+		"tcpOutSegs",
+	[I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwInRdmaReads",
+	[I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwInRdmaSends",
+	[I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwInRdmaWrites",
+	[I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwOutRdmaReads",
+	[I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwOutRdmaSends",
+	[I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwOutRdmaWrites",
+	[I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwRdmaBnd",
+	[I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] =
+		"iwRdmaInv"
+};
+
 /**
- * i40iw_get_protocol_stats - Populates the rdma_stats structure
- * @ibdev: ib dev struct
- * @stats: iw protocol stats struct
+ * i40iw_alloc_hw_stats - Allocate a hw stats structure
+ * @ibdev: device pointer from stack
+ * @port_num: port number
  */
-static int i40iw_get_protocol_stats(struct ib_device *ibdev,
-				    union rdma_protocol_stats *stats)
+static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev,
+						  u8 port_num)
+{
+	struct i40iw_sc_dev *sc_dev = &to_iwdev(ibdev)->sc_dev;
+	unsigned long lifespan;
+	int total = I40IW_HW_STAT_INDEX_MAX_32 + I40IW_HW_STAT_INDEX_MAX_64;
+
+	BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) !=
+		     (I40IW_HW_STAT_INDEX_MAX_32 +
+		      I40IW_HW_STAT_INDEX_MAX_64));
+
+	/*
+	 * VFs only update once per second; PFs keep the default update
+	 * lifespan.
+	 */
+	if (sc_dev->is_pf)
+		lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN;
+	else
+		lifespan = 1000;
+
+	return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, total, lifespan);
+}
+
+/**
+ * i40iw_get_hw_stats - Populates the rdma_hw_stats structure
+ * @ibdev: device pointer from stack
+ * @stats: stats pointer from stack
+ * @port_num: port number
+ * @index: which hw counter the stack is requesting we update
+ */
+static int i40iw_get_hw_stats(struct ib_device *ibdev,
+			      struct rdma_hw_stats *stats,
+			      u8 port_num, int index)
 {
 	struct i40iw_device *iwdev = to_iwdev(ibdev);
 	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
 	struct i40iw_dev_pestat *devstat = &dev->dev_pestat;
 	struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats;
-	struct timespec curr_time;
-	static struct timespec last_rd_time = {0, 0};
 	unsigned long flags;
 
-	curr_time = current_kernel_time();
-	memset(stats, 0, sizeof(*stats));
-
 	if (dev->is_pf) {
 		spin_lock_irqsave(&devstat->stats_lock, flags);
 		devstat->ops.iw_hw_stat_read_all(devstat,
 			&devstat->hw_stats);
 		spin_unlock_irqrestore(&devstat->stats_lock, flags);
 	} else {
-		if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1)
-			if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
-				return -ENOSYS;
+		if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
+			return -ENOSYS;
 	}
 
-	stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] +
-				 hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXPKTS];
-	stats->iw.ipInTruncatedPkts = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] +
-				      hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC];
-	stats->iw.ipInDiscards = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] +
-				 hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD];
-	stats->iw.ipOutNoRoutes = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] +
-				  hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE];
-	stats->iw.ipReasmReqds = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] +
-				 hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS];
-	stats->iw.ipFragCreates = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] +
-				  hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS];
-	stats->iw.ipInMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] +
-				  hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS];
-	stats->iw.ipOutMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] +
-				   hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXMCPKTS];
-	stats->iw.tcpOutSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPTXSEG];
-	stats->iw.tcpInSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPRXSEGS];
-	stats->iw.tcpRetransSegs = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_TCPRTXSEG];
+	memcpy(&stats->value[0], hw_stats, sizeof(*hw_stats));
 
-	last_rd_time = curr_time;
-	return 0;
+	return stats->num_counters;
 }
 
 /**
@@ -2551,7 +2623,8 @@
 	iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr;
 	iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr;
 	iwibdev->ibdev.dereg_mr = i40iw_dereg_mr;
-	iwibdev->ibdev.get_protocol_stats = i40iw_get_protocol_stats;
+	iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats;
+	iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats;
 	iwibdev->ibdev.query_device = i40iw_query_device;
 	iwibdev->ibdev.create_ah = i40iw_create_ah;
 	iwibdev->ibdev.destroy_ah = i40iw_destroy_ah;
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index 82d7c4b..ce40340 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -1308,21 +1308,6 @@
 	SYM_LSB(IntMask, fldname##17IntMask)), \
 	.msg = #fldname "_C", .sz = sizeof(#fldname "_C") }
 
-static const struct  qib_hwerror_msgs qib_7322_intr_msgs[] = {
-	INTR_AUTO_P(SDmaInt),
-	INTR_AUTO_P(SDmaProgressInt),
-	INTR_AUTO_P(SDmaIdleInt),
-	INTR_AUTO_P(SDmaCleanupDone),
-	INTR_AUTO_C(RcvUrg),
-	INTR_AUTO_P(ErrInt),
-	INTR_AUTO(ErrInt),      /* non-port-specific errs */
-	INTR_AUTO(AssertGPIOInt),
-	INTR_AUTO_P(SendDoneInt),
-	INTR_AUTO(SendBufAvailInt),
-	INTR_AUTO_C(RcvAvail),
-	{ .mask = 0, .sz = 0 }
-};
-
 #define TXSYMPTOM_AUTO_P(fldname) \
 	{ .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \
 	.msg = #fldname, .sz = sizeof(#fldname) }
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index 0bd1837..d2ac298 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -1172,11 +1172,13 @@
 	 * Set the most significant bit of CM2 to indicate support for
 	 * congestion statistics
 	 */
-	p->reserved[0] = dd->psxmitwait_supported << 7;
+	ib_set_cpi_capmask2(p,
+			    dd->psxmitwait_supported <<
+			    (31 - IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE));
 	/*
 	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
 	 */
-	p->resp_time_value = 18;
+	ib_set_cpi_resp_time(p, 18);
 
 	return reply((struct ib_smp *) pmp);
 }
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 6888f03..4f87815 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -159,6 +159,7 @@
 		} at;
 		__be32 imm_data;
 		__be32 aeth;
+		__be32 ieth;
 		struct ib_atomic_eth atomic_eth;
 	} u;
 } __packed;
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index b1ffc8b..6ca6fa8 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -525,6 +525,7 @@
 		return PTR_ERR(task);
 	}
 
+	set_user_nice(task, MIN_NICE);
 	cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
 	kthread_bind(task, cpu);
 	wake_up_process(task);
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 0ff765b..0f4d450 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -124,11 +124,13 @@
 			    int count)
 {
 	int m, i = 0;
+	struct rvt_dev_info *dev = ib_to_rvt(pd->device);
 
 	mr->mapsz = 0;
 	m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 	for (; i < m; i++) {
-		mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+		mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
+					  dev->dparms.node);
 		if (!mr->map[i]) {
 			rvt_deinit_mregion(mr);
 			return -ENOMEM;
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 0f12c21..5fa4d4d 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -397,6 +397,7 @@
 static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
 {
 	unsigned n;
+	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
 
 	if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
 		rvt_put_ss(&qp->s_rdma_read_sge);
@@ -431,7 +432,7 @@
 	if (qp->ibqp.qp_type != IB_QPT_RC)
 		return;
 
-	for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+	for (n = 0; n < rvt_max_atomic(rdi); n++) {
 		struct rvt_ack_entry *e = &qp->s_ack_queue[n];
 
 		if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
@@ -569,7 +570,12 @@
 	qp->s_ssn = 1;
 	qp->s_lsn = 0;
 	qp->s_mig_state = IB_MIG_MIGRATED;
-	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+	if (qp->s_ack_queue)
+		memset(
+			qp->s_ack_queue,
+			0,
+			rvt_max_atomic(rdi) *
+				sizeof(*qp->s_ack_queue));
 	qp->r_head_ack_queue = 0;
 	qp->s_tail_ack_queue = 0;
 	qp->s_num_rd_atomic = 0;
@@ -653,9 +659,9 @@
 		if (gfp == GFP_NOIO)
 			swq = __vmalloc(
 				(init_attr->cap.max_send_wr + 1) * sz,
-				gfp, PAGE_KERNEL);
+				gfp | __GFP_ZERO, PAGE_KERNEL);
 		else
-			swq = vmalloc_node(
+			swq = vzalloc_node(
 				(init_attr->cap.max_send_wr + 1) * sz,
 				rdi->dparms.node);
 		if (!swq)
@@ -677,6 +683,16 @@
 			goto bail_swq;
 
 		RCU_INIT_POINTER(qp->next, NULL);
+		if (init_attr->qp_type == IB_QPT_RC) {
+			qp->s_ack_queue =
+				kzalloc_node(
+					sizeof(*qp->s_ack_queue) *
+					 rvt_max_atomic(rdi),
+					gfp,
+					rdi->dparms.node);
+			if (!qp->s_ack_queue)
+				goto bail_qp;
+		}
 
 		/*
 		 * Driver needs to set up it's private QP structure and do any
@@ -704,9 +720,9 @@
 				qp->r_rq.wq = __vmalloc(
 						sizeof(struct rvt_rwq) +
 						qp->r_rq.size * sz,
-						gfp, PAGE_KERNEL);
+						gfp | __GFP_ZERO, PAGE_KERNEL);
 			else
-				qp->r_rq.wq = vmalloc_node(
+				qp->r_rq.wq = vzalloc_node(
 						sizeof(struct rvt_rwq) +
 						qp->r_rq.size * sz,
 						rdi->dparms.node);
@@ -857,6 +873,7 @@
 	rdi->driver_f.qp_priv_free(rdi, qp);
 
 bail_qp:
+	kfree(qp->s_ack_queue);
 	kfree(qp);
 
 bail_swq:
@@ -1284,6 +1301,7 @@
 		vfree(qp->r_rq.wq);
 	vfree(qp->s_wq);
 	rdi->driver_f.qp_priv_free(rdi, qp);
+	kfree(qp->s_ack_queue);
 	kfree(qp);
 	return 0;
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index caec8e9..bab7db6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -92,6 +92,8 @@
 	IPOIB_FLAG_UMCAST	  = 10,
 	IPOIB_STOP_NEIGH_GC	  = 11,
 	IPOIB_NEIGH_TBL_FLUSH	  = 12,
+	IPOIB_FLAG_DEV_ADDR_SET	  = 13,
+	IPOIB_FLAG_DEV_ADDR_CTRL  = 14,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -392,6 +394,7 @@
 	struct ipoib_ethtool_st ethtool;
 	struct timer_list poll_timer;
 	unsigned max_send_sge;
+	bool sm_fullmember_sendonly_support;
 };
 
 struct ipoib_ah {
@@ -476,6 +479,7 @@
 
 void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
+int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 418e5a1..45c40a1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -997,6 +997,106 @@
 	return 0;
 }
 
+/*
+ * Returns true if the device address of the ipoib interface has changed and the
+ * new address is a valid one (i.e. in the GID table); returns false otherwise.
+ */
+static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
+{
+	union ib_gid search_gid;
+	union ib_gid gid0;
+	union ib_gid *netdev_gid;
+	int err;
+	u16 index;
+	u8 port;
+	bool ret = false;
+
+	netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
+	if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL))
+		return false;
+
+	netif_addr_lock(priv->dev);
+
+	/* The subnet prefix may have changed, update it now so we won't have
+	 * to do it later
+	 */
+	priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
+	netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix;
+	search_gid.global.subnet_prefix = gid0.global.subnet_prefix;
+
+	search_gid.global.interface_id = priv->local_gid.global.interface_id;
+
+	netif_addr_unlock(priv->dev);
+
+	err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB,
+			  priv->dev, &port, &index);
+
+	netif_addr_lock(priv->dev);
+
+	if (search_gid.global.interface_id !=
+	    priv->local_gid.global.interface_id)
+		/* There was a change while we were looking up the gid, bail
+		 * here and let the next work sort this out
+		 */
+		goto out;
+
+	/* The next section of code needs some background:
+	 * Per IB spec the port GUID can't change if the HCA is powered on.
+	 * port GUID is the basis for GID at index 0 which is the basis for
+	 * the default device address of a ipoib interface.
+	 *
+	 * so it seems the flow should be:
+	 * if user_changed_dev_addr && gid in gid tbl
+	 *	set bit dev_addr_set
+	 *	return true
+	 * else
+	 *	return false
+	 *
+	 * The issue is that there are devices that don't follow the spec:
+	 * they change the port GUID when the HCA is powered, so in order
+	 * not to break userspace applications, we need to check if the
+	 * user wanted to control the device address and we assume that
+	 * if he sets the device address back to be based on GID index 0,
+	 * he no longer wishes to control it.
+	 *
+	 * If the user doesn't control the device address,
+	 * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed, it means
+	 * the port GUID has changed and GID at index 0 has changed,
+	 * so we need to change priv->local_gid and priv->dev->dev_addr
+	 * to reflect the new GID.
+	 */
+	if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+		if (!err && port == priv->port) {
+			set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
+			if (index == 0)
+				clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
+					  &priv->flags);
+			else
+				set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
+			ret = true;
+		} else {
+			ret = false;
+		}
+	} else {
+		if (!err && port == priv->port) {
+			ret = true;
+		} else {
+			if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
+				memcpy(&priv->local_gid, &gid0,
+				       sizeof(priv->local_gid));
+				memcpy(priv->dev->dev_addr + 4, &gid0,
+				       sizeof(priv->local_gid));
+				ret = true;
+			}
+		}
+	}
+
+out:
+	netif_addr_unlock(priv->dev);
+
+	return ret;
+}
+
 static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
 				enum ipoib_flush_level level,
 				int nesting)
@@ -1018,6 +1118,9 @@
 
 	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
 	    level != IPOIB_FLUSH_HEAVY) {
+		/* Make sure the dev_addr is set even if not flushing */
+		if (level == IPOIB_FLUSH_LIGHT)
+			ipoib_dev_addr_changed_valid(priv);
 		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
 		return;
 	}
@@ -1029,7 +1132,8 @@
 				update_parent_pkey(priv);
 			else
 				update_child_pkey(priv);
-		}
+		} else if (level == IPOIB_FLUSH_LIGHT)
+			ipoib_dev_addr_changed_valid(priv);
 		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
 		return;
 	}
@@ -1081,7 +1185,8 @@
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
 		if (level >= IPOIB_FLUSH_NORMAL)
 			ipoib_ib_dev_up(dev);
-		ipoib_mcast_restart_task(&priv->restart_task);
+		if (ipoib_dev_addr_changed_valid(priv))
+			ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index b940ef1..2d7c163 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -99,6 +99,7 @@
 		struct ib_device *dev, u8 port, u16 pkey,
 		const union ib_gid *gid, const struct sockaddr *addr,
 		void *client_data);
+static int ipoib_set_mac(struct net_device *dev, void *addr);
 
 static struct ib_client ipoib_client = {
 	.name   = "ipoib",
@@ -117,6 +118,8 @@
 
 	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
+	priv->sm_fullmember_sendonly_support = false;
+
 	if (ipoib_ib_dev_open(dev)) {
 		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
 			return 0;
@@ -629,6 +632,77 @@
 	spin_unlock_irq(&priv->lock);
 }
 
+struct classport_info_context {
+	struct ipoib_dev_priv	*priv;
+	struct completion	done;
+	struct ib_sa_query	*sa_query;
+};
+
+static void classport_info_query_cb(int status, struct ib_class_port_info *rec,
+				    void *context)
+{
+	struct classport_info_context *cb_ctx = context;
+	struct ipoib_dev_priv *priv;
+
+	WARN_ON(!context);
+
+	priv = cb_ctx->priv;
+
+	if (status || !rec) {
+		pr_debug("device: %s failed query classport_info status: %d\n",
+			 priv->dev->name, status);
+		/* keeps the default, will try next mcast_restart */
+		priv->sm_fullmember_sendonly_support = false;
+		goto out;
+	}
+
+	if (ib_get_cpi_capmask2(rec) &
+	    IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) {
+		pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n",
+			 priv->dev->name);
+		priv->sm_fullmember_sendonly_support = true;
+	} else {
+		pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n",
+			 priv->dev->name);
+		priv->sm_fullmember_sendonly_support = false;
+	}
+
+out:
+	complete(&cb_ctx->done);
+}
+
+int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv)
+{
+	struct classport_info_context *callback_context;
+	int ret;
+
+	callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL);
+	if (!callback_context)
+		return -ENOMEM;
+
+	callback_context->priv = priv;
+	init_completion(&callback_context->done);
+
+	ret = ib_sa_classport_info_rec_query(&ipoib_sa_client,
+					     priv->ca, priv->port, 3000,
+					     GFP_KERNEL,
+					     classport_info_query_cb,
+					     callback_context,
+					     &callback_context->sa_query);
+	if (ret < 0) {
+		pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n",
+			priv->dev->name, ret);
+		kfree(callback_context);
+		return ret;
+	}
+
+	/* waiting for the callback to finish before returning */
+	wait_for_completion(&callback_context->done);
+	kfree(callback_context);
+
+	return ret;
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1649,6 +1723,7 @@
 	.ndo_get_vf_config	 = ipoib_get_vf_config,
 	.ndo_get_vf_stats	 = ipoib_get_vf_stats,
 	.ndo_set_vf_guid	 = ipoib_set_vf_guid,
+	.ndo_set_mac_address	 = ipoib_set_mac,
 };
 
 static const struct net_device_ops ipoib_netdev_ops_vf = {
@@ -1771,6 +1846,70 @@
 	return device_create_file(&dev->dev, &dev_attr_umcast);
 }
 
+static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
+{
+	struct ipoib_dev_priv *child_priv;
+	struct net_device *netdev = priv->dev;
+
+	netif_addr_lock(netdev);
+
+	memcpy(&priv->local_gid.global.interface_id,
+	       &gid->global.interface_id,
+	       sizeof(gid->global.interface_id));
+	memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
+	clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
+
+	netif_addr_unlock(netdev);
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		down_read(&priv->vlan_rwsem);
+		list_for_each_entry(child_priv, &priv->child_intfs, list)
+			set_base_guid(child_priv, gid);
+		up_read(&priv->vlan_rwsem);
+	}
+}
+
+static int ipoib_check_lladdr(struct net_device *dev,
+			      struct sockaddr_storage *ss)
+{
+	union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
+	int ret = 0;
+
+	netif_addr_lock(dev);
+
+	/* Make sure the QPN, reserved and subnet prefix match the current
+	 * lladdr; this also makes sure the lladdr is unicast.
+	 */
+	if (memcmp(dev->dev_addr, ss->__data,
+		   4 + sizeof(gid->global.subnet_prefix)) ||
+	    gid->global.interface_id == 0)
+		ret = -EINVAL;
+
+	netif_addr_unlock(dev);
+
+	return ret;
+}
+
+static int ipoib_set_mac(struct net_device *dev, void *addr)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sockaddr_storage *ss = addr;
+	int ret;
+
+	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
+		return -EBUSY;
+
+	ret = ipoib_check_lladdr(dev, ss);
+	if (ret)
+		return ret;
+
+	set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
+
+	queue_work(ipoib_workqueue, &priv->flush_light);
+
+	return 0;
+}
+
 static ssize_t create_child(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf, size_t count)
@@ -1894,6 +2033,7 @@
 		goto device_init_failed;
 	} else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+	set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
 
 	result = ipoib_dev_init(priv->dev, hca, port);
 	if (result < 0) {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 2588931..82fbc94 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -64,6 +64,9 @@
 	unsigned int       send_only;
 };
 
+/* join state that allows creating mcg with sendonly member request */
+#define SENDONLY_FULLMEMBER_JOIN	8
+
 /*
  * This should be called with the priv->lock held
  */
@@ -326,12 +329,23 @@
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   carrier_on_task);
 	struct ib_port_attr attr;
+	int ret;
 
 	if (ib_query_port(priv->ca, priv->port, &attr) ||
 	    attr.state != IB_PORT_ACTIVE) {
 		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
 		return;
 	}
+	/*
+	 * Check whether we can join sendonly MCGs with the sendonly-fullmember
+	 * join state. This is done here, after the successful join to the
+	 * broadcast group, because the broadcast group must always be joined
+	 * first and is always re-joined if the SM changes substantially.
+	 */
+	ret = ipoib_check_sm_sendonly_fullmember_support(priv);
+	if (ret < 0)
+		pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n",
+			 priv->dev->name, ret);
 
 	/*
 	 * Take rtnl_lock to avoid racing with ipoib_stop() and
@@ -515,22 +529,20 @@
 		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
 
 		/*
-		 * Send-only IB Multicast joins do not work at the core
-		 * IB layer yet, so we can't use them here.  However,
-		 * we are emulating an Ethernet multicast send, which
-		 * does not require a multicast subscription and will
-		 * still send properly.  The most appropriate thing to
+		 * Send-only IB Multicast joins work at the core IB layer but
+		 * require specific SM support.
+		 * We can use such joins here only if the current SM supports that feature.
+		 * However, if not, we emulate an Ethernet multicast send,
+		 * which does not require a multicast subscription and will
+		 * still send properly. The most appropriate thing to
 		 * do is to create the group if it doesn't exist as that
 		 * most closely emulates the behavior, from a user space
-		 * application perspecitive, of Ethernet multicast
-		 * operation.  For now, we do a full join, maybe later
-		 * when the core IB layers support send only joins we
-		 * will use them.
+		 * application perspective, of Ethernet multicast operation.
 		 */
-#if 0
-		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-			rec.join_state = 4;
-#endif
+		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+		    priv->sm_fullmember_sendonly_support)
+			/* SM supports sendonly-fullmember, otherwise fall back to full-member */
+			rec.join_state = SENDONLY_FULLMEMBER_JOIN;
 	}
 	spin_unlock_irq(&priv->lock);
 
@@ -570,11 +582,13 @@
 		return;
 	}
 	priv->local_lid = port_attr.lid;
+	netif_addr_lock(dev);
 
-	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
-		ipoib_warn(priv, "ib_query_gid() failed\n");
-	else
-		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+	if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+		netif_addr_unlock(dev);
+		return;
+	}
+	netif_addr_unlock(dev);
 
 	spin_lock_irq(&priv->lock);
 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index b809c37..1e7cbba 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -307,5 +307,8 @@
 		queue_work(ipoib_workqueue, &priv->flush_normal);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
 		queue_work(ipoib_workqueue, &priv->flush_heavy);
+	} else if (record->event == IB_EVENT_GID_CHANGE &&
+		   !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+		queue_work(ipoib_workqueue, &priv->flush_light);
 	}
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index fca1a88..64a3559 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -68,6 +68,8 @@
 	priv->pkey = pkey;
 
 	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+	memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid));
+	set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
 	priv->dev->broadcast[8] = pkey >> 8;
 	priv->dev->broadcast[9] = pkey & 0xff;
 
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 2843f1a..887ebad 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -254,8 +254,8 @@
 	memset(cif, 0, sizeof(*cif));
 	cif->base_version = 1;
 	cif->class_version = 1;
-	cif->resp_time_value = 20;
 
+	ib_set_cpi_resp_time(cif, 20);
 	mad->mad_hdr.status = 0;
 }
 
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 5bac28a..7c197d1 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -66,8 +66,6 @@
 
 source "drivers/staging/media/Kconfig"
 
-source "drivers/staging/rdma/Kconfig"
-
 source "drivers/staging/android/Kconfig"
 
 source "drivers/staging/board/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index a954242..a470c72 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -23,7 +23,6 @@
 obj-$(CONFIG_USB_EMXX)		+= emxx_udc/
 obj-$(CONFIG_SPEAKUP)		+= speakup/
 obj-$(CONFIG_MFD_NVEC)		+= nvec/
-obj-$(CONFIG_STAGING_RDMA)	+= rdma/
 obj-$(CONFIG_ANDROID)		+= android/
 obj-$(CONFIG_STAGING_BOARD)	+= board/
 obj-$(CONFIG_LTE_GDM724X)	+= gdm724x/
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
deleted file mode 100644
index f1f3eca..0000000
--- a/drivers/staging/rdma/Kconfig
+++ /dev/null
@@ -1,27 +0,0 @@
-menuconfig STAGING_RDMA
-        tristate "RDMA staging drivers"
-	depends on INFINIBAND
-	depends on PCI || BROKEN
-	depends on HAS_IOMEM
-	depends on NET
-	depends on INET
-        default n
-        ---help---
-          This option allows you to select a number of RDMA drivers that
-	  fall into one of two categories: deprecated drivers being held
-	  here before finally being removed or new drivers that still need
-	  some work before being moved to the normal RDMA driver area.
-
-          If you wish to work on these drivers, to help improve them, or
-          to report problems you have with them, please use the
-	  linux-rdma@vger.kernel.org mailing list.
-
-          If in doubt, say N here.
-
-
-# Please keep entries in alphabetic order
-if STAGING_RDMA
-
-source "drivers/staging/rdma/hfi1/Kconfig"
-
-endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
deleted file mode 100644
index 8c7fc1d..0000000
--- a/drivers/staging/rdma/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# Entries for RDMA_STAGING tree
-obj-$(CONFIG_INFINIBAND_HFI1)	+= hfi1/
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
deleted file mode 100644
index 4c6f1d7..0000000
--- a/drivers/staging/rdma/hfi1/TODO
+++ /dev/null
@@ -1,6 +0,0 @@
-July, 2015
-
-- Remove unneeded file entries in sysfs
-- Remove software processing of IB protocol and place in library for use
-  by qib, ipath (if still present), hfi1, and eventually soft-roce
-- Replace incorrect uAPI
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
deleted file mode 100644
index bb2409a..0000000
--- a/drivers/staging/rdma/hfi1/diag.c
+++ /dev/null
@@ -1,1925 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains support for diagnostic functions.  It is accessed by
- * opening the hfi1_diag device, normally minor number 129.  Diagnostic use
- * of the chip may render the chip or board unusable until the driver
- * is unloaded, or in some cases, until the system is rebooted.
- *
- * Accesses to the chip through this interface are not similar to going
- * through the /sys/bus/pci resource mmap interface.
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/vmalloc.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <rdma/ib_smi.h>
-#include "hfi.h"
-#include "device.h"
-#include "common.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define snoop_dbg(fmt, ...) \
-	hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
-
-/* Snoop option mask */
-#define SNOOP_DROP_SEND		BIT(0)
-#define SNOOP_USE_METADATA	BIT(1)
-#define SNOOP_SET_VL0TOVL15     BIT(2)
-
-static u8 snoop_flags;
-
-/*
- * Extract packet length from LRH header.
- * This is in Dwords so multiply by 4 to get size in bytes
- */
-#define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0xFFF)) << 2)
-
-enum hfi1_filter_status {
-	HFI1_FILTER_HIT,
-	HFI1_FILTER_ERR,
-	HFI1_FILTER_MISS
-};
-
-/* snoop processing functions */
-rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
-	[RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
-	[RHF_RCV_TYPE_EAGER]    = snoop_recv_handler,
-	[RHF_RCV_TYPE_IB]       = snoop_recv_handler,
-	[RHF_RCV_TYPE_ERROR]    = snoop_recv_handler,
-	[RHF_RCV_TYPE_BYPASS]   = snoop_recv_handler,
-	[RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
-	[RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
-	[RHF_RCV_TYPE_INVALID7] = process_receive_invalid
-};
-
-/* Snoop packet structure */
-struct snoop_packet {
-	struct list_head list;
-	u32 total_len;
-	u8 data[];
-};
-
-/* Do not make these an enum or it will blow up the capture_md */
-#define PKT_DIR_EGRESS 0x0
-#define PKT_DIR_INGRESS 0x1
-
-/* Packet capture metadata returned to the user with the packet. */
-struct capture_md {
-	u8 port;
-	u8 dir;
-	u8 reserved[6];
-	union {
-		u64 pbc;
-		u64 rhf;
-	} u;
-};
-
-static atomic_t diagpkt_count = ATOMIC_INIT(0);
-static struct cdev diagpkt_cdev;
-static struct device *diagpkt_device;
-
-static ssize_t diagpkt_write(struct file *fp, const char __user *data,
-			     size_t count, loff_t *off);
-
-static const struct file_operations diagpkt_file_ops = {
-	.owner = THIS_MODULE,
-	.write = diagpkt_write,
-	.llseek = noop_llseek,
-};
-
-/*
- * This is used for communication with user space for snoop extended IOCTLs
- */
-struct hfi1_link_info {
-	__be64 node_guid;
-	u8 port_mode;
-	u8 port_state;
-	u16 link_speed_active;
-	u16 link_width_active;
-	u16 vl15_init;
-	u8 port_number;
-	/*
-	 * Add padding to make this a full IB SMP payload. Note: changing the
-	 * size of this structure will make the IOCTLs created with _IOWR
-	 * change.
-	 * Be sure to run tests on all IOCTLs when making changes to this
-	 * structure.
-	 */
-	u8 res[47];
-};
-
-/*
- * This starts our ioctl sequence numbers *way* off from the ones
- * defined in ib_core.
- */
-#define SNOOP_CAPTURE_VERSION 0x1
-
-#define IB_IOCTL_MAGIC          0x1b /* See Documentation/ioctl-number.txt */
-#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
-#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
-
-#define HFI1_SNOOP_IOCGETLINKSTATE \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
-#define HFI1_SNOOP_IOCSETLINKSTATE \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 1)
-#define HFI1_SNOOP_IOCCLEARQUEUE \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 2)
-#define HFI1_SNOOP_IOCCLEARFILTER \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 3)
-#define HFI1_SNOOP_IOCSETFILTER \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 4)
-#define HFI1_SNOOP_IOCGETVERSION \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 5)
-#define HFI1_SNOOP_IOCSET_OPTS \
-	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6)
-
-/*
- * These offsets +6/+7 could change, but these are already known and used
- * IOCTL numbers so don't change them without a good reason.
- */
-#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
-	_IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6, \
-		struct hfi1_link_info)
-#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
-	_IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 7, \
-		struct hfi1_link_info)
-
-static int hfi1_snoop_open(struct inode *in, struct file *fp);
-static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
-			       size_t pkt_len, loff_t *off);
-static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
-				size_t count, loff_t *off);
-static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
-static unsigned int hfi1_snoop_poll(struct file *fp,
-				    struct poll_table_struct *wait);
-static int hfi1_snoop_release(struct inode *in, struct file *fp);
-
-struct hfi1_packet_filter_command {
-	int opcode;
-	int length;
-	void *value_ptr;
-};
-
-/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
-#define HFI1_SNOOP_INGRESS 0x1
-#define HFI1_SNOOP_EGRESS  0x2
-
-enum hfi1_packet_filter_opcodes {
-	FILTER_BY_LID,
-	FILTER_BY_DLID,
-	FILTER_BY_MAD_MGMT_CLASS,
-	FILTER_BY_QP_NUMBER,
-	FILTER_BY_PKT_TYPE,
-	FILTER_BY_SERVICE_LEVEL,
-	FILTER_BY_PKEY,
-	FILTER_BY_DIRECTION,
-};
-
-static const struct file_operations snoop_file_ops = {
-	.owner = THIS_MODULE,
-	.open = hfi1_snoop_open,
-	.read = hfi1_snoop_read,
-	.unlocked_ioctl = hfi1_ioctl,
-	.poll = hfi1_snoop_poll,
-	.write = hfi1_snoop_write,
-	.release = hfi1_snoop_release
-};
-
-struct hfi1_filter_array {
-	int (*filter)(void *, void *, void *);
-};
-
-static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
-				      void *value);
-static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
-				     void *value);
-static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
-					void *value);
-static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
-
-static const struct hfi1_filter_array hfi1_filters[] = {
-	{ hfi1_filter_lid },
-	{ hfi1_filter_dlid },
-	{ hfi1_filter_mad_mgmt_class },
-	{ hfi1_filter_qp_number },
-	{ hfi1_filter_ibpacket_type },
-	{ hfi1_filter_ib_service_level },
-	{ hfi1_filter_ib_pkey },
-	{ hfi1_filter_direction },
-};
-
-#define HFI1_MAX_FILTERS	ARRAY_SIZE(hfi1_filters)
-#define HFI1_DIAG_MINOR_BASE	129
-
-static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
-
-int hfi1_diag_add(struct hfi1_devdata *dd)
-{
-	char name[16];
-	int ret = 0;
-
-	snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
-		 dd->unit);
-	/*
-	 * Do this for each device as opposed to the normal diagpkt
-	 * interface which is one per host
-	 */
-	ret = hfi1_snoop_add(dd, name);
-	if (ret)
-		dd_dev_err(dd, "Unable to init snoop/capture device");
-
-	snprintf(name, sizeof(name), "%s_diagpkt", class_name());
-	if (atomic_inc_return(&diagpkt_count) == 1) {
-		ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
-				     &diagpkt_file_ops, &diagpkt_cdev,
-				     &diagpkt_device, false);
-	}
-
-	return ret;
-}
-
-/* this must be called w/ dd->snoop_in_lock held */
-static void drain_snoop_list(struct list_head *queue)
-{
-	struct list_head *pos, *q;
-	struct snoop_packet *packet;
-
-	list_for_each_safe(pos, q, queue) {
-		packet = list_entry(pos, struct snoop_packet, list);
-		list_del(pos);
-		kfree(packet);
-	}
-}
-
-static void hfi1_snoop_remove(struct hfi1_devdata *dd)
-{
-	unsigned long flags = 0;
-
-	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-	drain_snoop_list(&dd->hfi1_snoop.queue);
-	hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
-	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-}
-
-void hfi1_diag_remove(struct hfi1_devdata *dd)
-{
-	hfi1_snoop_remove(dd);
-	if (atomic_dec_and_test(&diagpkt_count))
-		hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
-	hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
-}
-
-/*
- * Allocated structure shared between the credit return mechanism and
- * diagpkt_send().
- */
-struct diagpkt_wait {
-	struct completion credits_returned;
-	int code;
-	atomic_t count;
-};
-
-/*
- * When each side is finished with the structure, they call this.
- * The last user frees the structure.
- */
-static void put_diagpkt_wait(struct diagpkt_wait *wait)
-{
-	if (atomic_dec_and_test(&wait->count))
-		kfree(wait);
-}
-
-/*
- * Callback from the credit return code.  Set the complete, which
- * will let diapkt_send() continue.
- */
-static void diagpkt_complete(void *arg, int code)
-{
-	struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
-
-	wait->code = code;
-	complete(&wait->credits_returned);
-	put_diagpkt_wait(wait);	/* finished with the structure */
-}
-
-/**
- * diagpkt_send - send a packet
- * @dp: diag packet descriptor
- */
-static ssize_t diagpkt_send(struct diag_pkt *dp)
-{
-	struct hfi1_devdata *dd;
-	struct send_context *sc;
-	struct pio_buf *pbuf;
-	u32 *tmpbuf = NULL;
-	ssize_t ret = 0;
-	u32 pkt_len, total_len;
-	pio_release_cb credit_cb = NULL;
-	void *credit_arg = NULL;
-	struct diagpkt_wait *wait = NULL;
-	int trycount = 0;
-
-	dd = hfi1_lookup(dp->unit);
-	if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
-		ret = -ENODEV;
-		goto bail;
-	}
-	if (!(dd->flags & HFI1_INITTED)) {
-		/* no hardware, freeze, etc. */
-		ret = -ENODEV;
-		goto bail;
-	}
-
-	if (dp->version != _DIAG_PKT_VERS) {
-		dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
-			   dp->version);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* send count must be an exact number of dwords */
-	if (dp->len & 3) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* there is only port 1 */
-	if (dp->port != 1) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* need a valid context */
-	if (dp->sw_index >= dd->num_send_contexts) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	/* can only use kernel contexts */
-	if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
-	    dd->send_contexts[dp->sw_index].type != SC_VL15) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	/* must be allocated */
-	sc = dd->send_contexts[dp->sw_index].sc;
-	if (!sc) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	/* must be enabled */
-	if (!(sc->flags & SCF_ENABLED)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* allocate a buffer and copy the data in */
-	tmpbuf = vmalloc(dp->len);
-	if (!tmpbuf) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	if (copy_from_user(tmpbuf,
-			   (const void __user *)(unsigned long)dp->data,
-			   dp->len)) {
-		ret = -EFAULT;
-		goto bail;
-	}
-
-	/*
-	 * pkt_len is how much data we have to write, includes header and data.
-	 * total_len is length of the packet in Dwords plus the PBC should not
-	 * include the CRC.
-	 */
-	pkt_len = dp->len >> 2;
-	total_len = pkt_len + 2; /* PBC + packet */
-
-	/* if 0, fill in a default */
-	if (dp->pbc == 0) {
-		struct hfi1_pportdata *ppd = dd->pport;
-
-		hfi1_cdbg(PKT, "Generating PBC");
-		dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
-	} else {
-		hfi1_cdbg(PKT, "Using passed in PBC");
-	}
-
-	hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
-
-	/*
-	 * The caller wants to wait until the packet is sent and to
-	 * check for errors.  The best we can do is wait until
-	 * the buffer credits are returned and check if any packet
-	 * error has occurred.  If there are any late errors, this
-	 * could miss it.  If there are other senders who generate
-	 * an error, this may find it.  However, in general, it
-	 * should catch most.
-	 */
-	if (dp->flags & F_DIAGPKT_WAIT) {
-		/* always force a credit return */
-		dp->pbc |= PBC_CREDIT_RETURN;
-		/* turn on credit return interrupts */
-		sc_add_credit_return_intr(sc);
-		wait = kmalloc(sizeof(*wait), GFP_KERNEL);
-		if (!wait) {
-			ret = -ENOMEM;
-			goto bail;
-		}
-		init_completion(&wait->credits_returned);
-		atomic_set(&wait->count, 2);
-		wait->code = PRC_OK;
-
-		credit_cb = diagpkt_complete;
-		credit_arg = wait;
-	}
-
-retry:
-	pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
-	if (!pbuf) {
-		if (trycount == 0) {
-			/* force a credit return and try again */
-			sc_return_credits(sc);
-			trycount = 1;
-			goto retry;
-		}
-		/*
-		 * No send buffer means no credit callback.  Undo
-		 * the wait set-up that was done above.  We free wait
-		 * because the callback will never be called.
-		 */
-		if (dp->flags & F_DIAGPKT_WAIT) {
-			sc_del_credit_return_intr(sc);
-			kfree(wait);
-			wait = NULL;
-		}
-		ret = -ENOSPC;
-		goto bail;
-	}
-
-	pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
-	/* no flush needed as the HW knows the packet size */
-
-	ret = sizeof(*dp);
-
-	if (dp->flags & F_DIAGPKT_WAIT) {
-		/* wait for credit return */
-		ret = wait_for_completion_interruptible(
-						&wait->credits_returned);
-		/*
-		 * If the wait returns an error, the wait was interrupted,
-		 * e.g. with a ^C in the user program.  The callback is
-		 * still pending.  This is OK as the wait structure is
-		 * kmalloc'ed and the structure will free itself when
-		 * all users are done with it.
-		 *
-		 * A context disable occurs on a send context restart, so
-		 * include that in the list of errors below to check for.
-		 * NOTE: PRC_FILL_ERR is at best informational and cannot
-		 * be depended on.
-		 */
-		if (!ret && (((wait->code & PRC_STATUS_ERR) ||
-			      (wait->code & PRC_FILL_ERR) ||
-			      (wait->code & PRC_SC_DISABLE))))
-			ret = -EIO;
-
-		put_diagpkt_wait(wait);	/* finished with the structure */
-		sc_del_credit_return_intr(sc);
-	}
-
-bail:
-	vfree(tmpbuf);
-	return ret;
-}
-
-/*
- * write() handler for the diagpkt device.
- *
- * Expects exactly one struct diag_pkt from user space (-EINVAL for any other
- * size, -EFAULT if it cannot be copied).  When the caller supplied a non-zero
- * PBC, the VL encoded in it selects the send context and, if that context
- * exists, the packet's sw_index is overridden to match.  The packet is then
- * handed to diagpkt_send() and its return value is passed back unchanged.
- */
-static ssize_t diagpkt_write(struct file *fp, const char __user *data,
-			     size_t count, loff_t *off)
-{
-	struct diag_pkt dp;
-	struct hfi1_devdata *dd;
-	struct send_context *sc;
-	u8 vl;
-
-	if (count != sizeof(dp))
-		return -EINVAL;
-	if (copy_from_user(&dp, data, sizeof(dp)))
-		return -EFAULT;
-
-	/* No PBC supplied: nothing to derive, send as given */
-	if (!dp.pbc)
-		return diagpkt_send(&dp);
-
-	dd = hfi1_lookup(dp.unit);
-	if (!dd)
-		return -ENODEV;
-
-	/* The Send Context is derived from the PbcVL value */
-	vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
-	sc = dd->vld[vl].sc;
-	if (sc) {
-		dp.sw_index = sc->sw_index;
-		hfi1_cdbg(
-		       PKT,
-		       "Packet sent over VL %d via Send Context %u(%u)",
-		       vl, sc->sw_index, sc->hw_context);
-	}
-
-	return diagpkt_send(&dp);
-}
-
-/*
- * Initialise the per-device snoop state (mode flag, lock, packet queue and
- * wait queue) and create the snoop/capture character device called @name.
- *
- * Returns 0 on success or the error from hfi1_cdev_init(); on failure the
- * error is logged and hfi1_cdev_cleanup() is called before returning.
- */
-static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
-{
-	int rc;
-
-	dd->hfi1_snoop.mode_flag = 0;
-	spin_lock_init(&dd->hfi1_snoop.snoop_lock);
-	INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
-	init_waitqueue_head(&dd->hfi1_snoop.waitq);
-
-	rc = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
-			    &snoop_file_ops,
-			    &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev,
-			    false);
-	if (!rc)
-		return 0;
-
-	dd_dev_err(dd, "Couldn't create %s device: %d", name, rc);
-	hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
-
-	return rc;
-}
-
-/*
- * Map a snoop/capture device inode to its hfi1 device: the unit number is
- * the inode's minor number minus HFI1_SNOOP_CAPTURE_BASE.  Returns whatever
- * hfi1_lookup() returns for that unit.
- */
-static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
-{
-	int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
-
-	return hfi1_lookup(unit);
-}
-
-/*
- * Clear or restore send context integrity checks.
- *
- * Walks every allocated send context under dd->sc_lock and calls
- * set_pio_integrity() on it.  hfi1_init_ctxt() is additionally applied only
- * when the NO_INTEGRITY capability is not set and the device is not currently
- * in snoop mode.
- */
-static void adjust_integrity_checks(struct hfi1_devdata *dd)
-{
-	unsigned long irq_flags;
-	int idx;
-
-	spin_lock_irqsave(&dd->sc_lock, irq_flags);
-	for (idx = 0; idx < dd->num_send_contexts; idx++) {
-		struct send_context *sc = dd->send_contexts[idx].sc;
-		int enable;
-
-		/* slot not allocated */
-		if (!sc)
-			continue;
-
-		enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
-			 dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
-
-		set_pio_integrity(sc);
-
-		/* take HFI_CAP_* flags into account */
-		if (enable)
-			hfi1_init_ctxt(sc);
-	}
-	spin_unlock_irqrestore(&dd->sc_lock, irq_flags);
-}
-
-/*
- * open() handler for the snoop/capture device.
- *
- * The file access mode picks the operating mode: O_RDONLY selects capture,
- * O_RDWR selects snoop, anything else is rejected with -EINVAL.  Only one
- * opener at a time is allowed (-EBUSY otherwise).  On success the packet
- * queue is drained, the filter pointers are reset and the device's receive
- * and send hooks are redirected to the snoop handlers.
- *
- * Returns 0 on success or a negative errno.
- */
-static int hfi1_snoop_open(struct inode *in, struct file *fp)
-{
-	struct hfi1_devdata *dd;
-	unsigned long irq_flags = 0;
-	int mode;
-	int rc;
-
-	mutex_lock(&hfi1_mutex);
-
-	dd = hfi1_dd_from_sc_inode(in);
-	if (!dd) {
-		rc = -ENODEV;
-		goto unlock_mutex;
-	}
-
-	/* File mode determines snoop or capture. */
-	switch (fp->f_flags & O_ACCMODE) {
-	case O_RDONLY:
-		snoop_dbg("Capture Enabled");
-		mode = HFI1_PORT_CAPTURE_MODE;
-		break;
-	case O_RDWR:
-		snoop_dbg("Snoop Enabled");
-		mode = HFI1_PORT_SNOOP_MODE;
-		break;
-	default:
-		snoop_dbg("Invalid");
-		rc = -EINVAL;
-		goto unlock_mutex;
-	}
-
-	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, irq_flags);
-
-	/* We are not supporting snoop and capture at the same time. */
-	if (dd->hfi1_snoop.mode_flag) {
-		rc = -EBUSY;
-		goto unlock_snoop;
-	}
-
-	dd->hfi1_snoop.mode_flag = mode;
-	drain_snoop_list(&dd->hfi1_snoop.queue);
-
-	dd->hfi1_snoop.filter_callback = NULL;
-	dd->hfi1_snoop.filter_value = NULL;
-
-	if (mode == HFI1_PORT_SNOOP_MODE) {
-		/*
-		 * Send side packet integrity checks are not helpful when
-		 * snooping, so clear them now that snoop mode is on; they are
-		 * re-enabled when we stop snooping.
-		 */
-		adjust_integrity_checks(dd);
-
-		/*
-		 * We also do not want to be doing the DLID LMC check for
-		 * ingressed packets: save the current register value and
-		 * write it back with the low 32 bits cleared.
-		 */
-		dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
-		write_csr(dd, DCC_CFG_PORT_CONFIG1,
-			  (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
-	}
-
-	/*
-	 * As soon as these function pointers are set the recv and send
-	 * handlers are active, which is why the queue was drained and the
-	 * filter values were initialised above.  A packet arriving in the
-	 * meantime is allocated and then waits on snoop_lock before being
-	 * queued.
-	 */
-	dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
-	dd->process_pio_send = snoop_send_pio_handler;
-	dd->process_dma_send = snoop_send_pio_handler;
-	dd->pio_inline_send = snoop_inline_pio_send;
-
-	rc = 0;
-
-unlock_snoop:
-	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, irq_flags);
-unlock_mutex:
-	mutex_unlock(&hfi1_mutex);
-
-	return rc;
-}
-
-/*
- * release() handler for the snoop/capture device.
- *
- * Under snoop_lock: clears the mode flag, drains the packet queue, undoes the
- * snoop-mode register changes (integrity checks and DCC_CFG_PORT_CONFIG1),
- * frees the filter value and points the receive/send hooks back at the
- * normal handlers.
- *
- * Returns 0, or -ENODEV if the inode does not map to a device.
- */
-static int hfi1_snoop_release(struct inode *in, struct file *fp)
-{
-	struct hfi1_devdata *dd = hfi1_dd_from_sc_inode(in);
-	unsigned long irq_flags = 0;
-	int prev_mode;
-
-	if (!dd)
-		return -ENODEV;
-
-	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, irq_flags);
-
-	/* clear the snoop mode before re-adjusting send context CSRs */
-	prev_mode = dd->hfi1_snoop.mode_flag;
-	dd->hfi1_snoop.mode_flag = 0;
-
-	/* We are done with the queue: drop whatever is still on it */
-	drain_snoop_list(&dd->hfi1_snoop.queue);
-
-	if (prev_mode == HFI1_PORT_SNOOP_MODE) {
-		/* restore the integrity checks now that snoop mode is clear */
-		adjust_integrity_checks(dd);
-
-		/*
-		 * Put back the DCC_CFG_PORT_CONFIG1 value saved when the
-		 * snoop device was opened, so DLID checking on incoming
-		 * packets resumes.
-		 */
-		write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
-	}
-
-	/* Forget the filter and release its value buffer */
-	dd->hfi1_snoop.filter_callback = NULL;
-	kfree(dd->hfi1_snoop.filter_value);
-	dd->hfi1_snoop.filter_value = NULL;
-
-	/*
-	 * User is done snooping and capturing: return control to the normal
-	 * handlers, including SDMA handling.
-	 */
-	dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
-	dd->process_pio_send = hfi1_verbs_send_pio;
-	dd->process_dma_send = hfi1_verbs_send_dma;
-	dd->pio_inline_send = pio_copy;
-
-	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, irq_flags);
-
-	snoop_dbg("snoop/capture device released");
-
-	return 0;
-}
-
-/*
- * poll() handler for the snoop/capture device.
- *
- * Returns a poll event mask: POLLIN | POLLRDNORM when at least one packet is
- * queued, 0 when the queue is empty, and POLLERR when the inode does not map
- * to a device (the return type is an event mask, so a negative errno must
- * not be returned here).
- */
-static unsigned int hfi1_snoop_poll(struct file *fp,
-				    struct poll_table_struct *wait)
-{
-	unsigned int mask = 0;
-	unsigned long flags = 0;
-	struct hfi1_devdata *dd;
-
-	dd = hfi1_dd_from_sc_inode(fp->f_inode);
-	if (!dd)
-		return POLLERR;
-
-	/*
-	 * Register on the wait queue before sampling the list and outside
-	 * the spinlock: poll_wait() may allocate and therefore sleep, which
-	 * is not allowed with snoop_lock held and interrupts disabled.
-	 */
-	poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
-
-	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-	if (!list_empty(&dd->hfi1_snoop.queue))
-		mask |= POLLIN | POLLRDNORM;
-	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-
-	return mask;
-}
-
-/*
- * write() handler for the snoop device: inject one packet from user space.
- *
- * Two input layouts are supported, selected by the SNOOP_USE_METADATA bit of
- * snoop_flags:
- *  - without metadata, @data is the raw packet.  The first two bytes are
- *    inspected to derive the VL, from which the send context and a PBC are
- *    generated here;
- *  - with metadata, @data starts with a 64-bit PBC supplied by the caller,
- *    followed by the packet.
- *
- * The packet is sent through diagpkt_send().
- *
- * Returns the number of payload bytes accepted on success, -ENODEV if the
- * inode maps to no device, -EINVAL if the buffer is too short for the
- * selected layout, cannot be read, or does not resolve to a send context,
- * or the error reported by diagpkt_send().
- */
-static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
-				size_t count, loff_t *off)
-{
-	struct diag_pkt dpkt;
-	struct hfi1_devdata *dd;
-	ssize_t ret;	/* signed: carries negative errnos from diagpkt_send */
-	u8 lrh[2];
-	u8 sl, sc5, sc4, vl;
-	struct send_context *sc;
-	u32 len;
-	u64 pbc;
-	struct hfi1_ibport *ibp;
-	struct hfi1_pportdata *ppd;
-
-	dd = hfi1_dd_from_sc_inode(fp->f_inode);
-	if (!dd)
-		return -ENODEV;
-
-	ppd = dd->pport;
-	snoop_dbg("received %lu bytes from user", count);
-
-	memset(&dpkt, 0, sizeof(struct diag_pkt));
-	dpkt.version = _DIAG_PKT_VERS;
-	dpkt.unit = dd->unit;
-	dpkt.port = 1;
-
-	if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
-		/*
-		 * We need to generate the PBC and not let diagpkt_send do it,
-		 * to do this we need the VL and the length in dwords.
-		 * The VL can be determined by using the SL and looking up the
-		 * SC. Then the SC can be converted into VL. The exception to
-		 * this is those packets which are from an SMI queue pair.
-		 * Since we can't detect anything about the QP here we have to
-		 * rely on the SC. If its 0xF then we assume its SMI and
-		 * do not look at the SL.
-		 */
-
-		/* Both header bytes read below must be inside the buffer */
-		if (count < sizeof(lrh))
-			return -EINVAL;
-
-		/* -EINVAL (not -EFAULT) is the errno this path always used */
-		if (copy_from_user(lrh, data, sizeof(lrh)))
-			return -EINVAL;
-
-		sc4 = (lrh[0] >> 4) & 0xf;
-		if (sc4 == 0xF) {
-			snoop_dbg("Detected VL15 packet ignoring SL in packet");
-			vl = sc4;
-		} else {
-			sl = (lrh[1] >> 4) & 0xf;
-			ibp = to_iport(&dd->verbs_dev.rdi.ibdev, 1);
-			sc5 = ibp->sl_to_sc[sl];
-			vl = sc_to_vlt(dd, sc5);
-			if (vl != sc4) {
-				snoop_dbg("VL %d does not match SC %d of packet",
-					  vl, sc4);
-				return -EINVAL;
-			}
-		}
-
-		sc = dd->vld[vl].sc; /* Look up the context based on VL */
-		if (sc) {
-			dpkt.sw_index = sc->sw_index;
-			snoop_dbg("Sending on context %u(%u)", sc->sw_index,
-				  sc->hw_context);
-		} else {
-			snoop_dbg("Could not find context for vl %d", vl);
-			return -EINVAL;
-		}
-
-		len = (count >> 2) + 2; /* Add in PBC */
-		pbc = create_pbc(ppd, 0, 0, vl, len);
-	} else {
-		/*
-		 * The buffer must at least hold the PBC; otherwise the
-		 * subtraction below would wrap the unsigned count.
-		 */
-		if (count < sizeof(pbc))
-			return -EINVAL;
-
-		if (copy_from_user(&pbc, data, sizeof(pbc)))
-			return -EINVAL;
-		vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
-		sc = dd->vld[vl].sc; /* Look up the context based on VL */
-		if (sc) {
-			dpkt.sw_index = sc->sw_index;
-		} else {
-			snoop_dbg("Could not find context for vl %d", vl);
-			return -EINVAL;
-		}
-		/* Skip past the PBC: the rest is the packet itself */
-		data += sizeof(pbc);
-		count -= sizeof(pbc);
-	}
-	dpkt.len = count;
-	dpkt.data = (unsigned long)data;
-
-	snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
-		  (pbc >> 12) & 0xf,
-		  (pbc & 0xfff));
-
-	dpkt.pbc = pbc;
-	ret = diagpkt_send(&dpkt);
-	/*
-	 * diagpkt_send only returns number of bytes in the diagpkt so patch
-	 * that up here before returning.
-	 */
-	if (ret == (ssize_t)sizeof(dpkt))
-		return count;
-
-	return ret;
-}
-
-/*
- * read() handler for the snoop/capture device: hand one queued packet to
- * user space.
- *
- * Blocks until a packet is queued unless the file is O_NONBLOCK (-EAGAIN);
- * an interrupted wait returns -EINTR.  The first packet is removed from the
- * queue and copied out if @pkt_len is large enough for it.  The packet is
- * freed in every case, including when the buffer is too small (-EINVAL) or
- * the copy faults (-EFAULT).
- *
- * Returns the packet's total length on success or a negative errno.
- */
-static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
-			       size_t pkt_len, loff_t *off)
-{
-	struct hfi1_devdata *dd;
-	struct snoop_packet *pkt;
-	unsigned long irq_flags = 0;
-	ssize_t rc;
-
-	dd = hfi1_dd_from_sc_inode(fp->f_inode);
-	if (!dd)
-		return -ENODEV;
-
-	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, irq_flags);
-
-	/* Leave this loop only with the lock held and a non-empty queue */
-	for (;;) {
-		if (!list_empty(&dd->hfi1_snoop.queue))
-			break;
-
-		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, irq_flags);
-
-		if (fp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-
-		if (wait_event_interruptible(
-				dd->hfi1_snoop.waitq,
-				!list_empty(&dd->hfi1_snoop.queue)))
-			return -EINTR;
-
-		spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, irq_flags);
-	}
-
-	/* Detach the oldest packet; it is ours once the lock is dropped */
-	pkt = list_entry(dd->hfi1_snoop.queue.next,
-			 struct snoop_packet, list);
-	list_del(&pkt->list);
-	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, irq_flags);
-
-	if (pkt_len < pkt->total_len)
-		rc = -EINVAL;
-	else if (copy_to_user(data, pkt->data, pkt->total_len))
-		rc = -EFAULT;
-	else
-		rc = pkt->total_len;
-
-	kfree(pkt);
-
-	return rc;
-}
-
-/**
- * hfi1_assign_snoop_link_credits -- Set up credits for VL15 and others
- * @ppd : ptr to hfi1 port data
- * @value : options from user space; bits 31:16 carry the total credit count
- *
- * Assumes the rest of the CM credit registers are zero from a
- * previous global or credit reset.
- * The shared count is left at zero for both global and all VLs; in snoop
- * mode ideally we don't use shared credits.
- * VL15 is given half of dd->vl15_init credits; if the total is smaller than
- * that, an error is returned.
- * The remaining credits are divided evenly across the TXE_NUM_DATA_VL data
- * VLs; if each of them would get fewer than 34 credits (at least
- * 2048 + 128 bytes) an error is returned.
- * The credit registers will be reset to zero on link negotiation or link up
- * so this function should be activated from user space only if the port has
- * gone past link negotiation and link up.
- *
- * Return -- result of set_buffer_control() if the checks pass, else -EINVAL
- *
- */
-static long hfi1_assign_snoop_link_credits(struct hfi1_pportdata *ppd,
-					   int value)
-{
-#define  OPA_MIN_PER_VL_CREDITS  34  /* 2048 + 128 bytes */
-	struct hfi1_devdata *dd = ppd->dd;
-	struct buffer_control bc;
-	u16 total_credits = (value >> 16) & 0xffff;
-	u16 vl15_credits = dd->vl15_init / 2;
-	u16 per_vl_credits;
-	__be16 be_per_vl_credits;
-	int vl;
-
-	/* The link must be up and VL15 must fit in the requested total */
-	if (!(ppd->host_link_state & HLS_UP) || total_credits < vl15_credits)
-		goto err_exit;
-
-	per_vl_credits = (total_credits - vl15_credits) / TXE_NUM_DATA_VL;
-	if (per_vl_credits < OPA_MIN_PER_VL_CREDITS)
-		goto err_exit;
-
-	/* Everything not set below (including shared limits) stays zero */
-	memset(&bc, 0, sizeof(bc));
-
-	be_per_vl_credits = cpu_to_be16(per_vl_credits);
-	for (vl = 0; vl < TXE_NUM_DATA_VL; vl++)
-		bc.vl[vl].dedicated = be_per_vl_credits;
-	bc.vl[15].dedicated = cpu_to_be16(vl15_credits);
-
-	return set_buffer_control(ppd, &bc);
-
-err_exit:
-	snoop_dbg("port_state = 0x%x, total_credits = %d, vl15_credits = %d",
-		  ppd->host_link_state, total_credits, vl15_credits);
-
-	return -EINVAL;
-}
-
-/*
- * ioctl() handler for the snoop/capture device.
- *
- * The caller must have CAP_SYS_ADMIN.  While the device is open in capture
- * mode only the clear-queue, clear-filter and set-filter commands are
- * accepted; everything else returns -EINVAL.
- *
- * Commands:
- *  HFI1_SNOOP_IOCSETLINKSTATE_EXTRA  request a link state change described
- *                                    by a struct hfi1_link_info, then report
- *                                    the resulting state in that struct
- *  HFI1_SNOOP_IOCGETLINKSTATE        arg points at an int: port index in,
- *                                    (phys state << 4 | link state) out
- *  HFI1_SNOOP_IOCGETLINKSTATE_EXTRA  same, using struct hfi1_link_info
- *  HFI1_SNOOP_IOCCLEARQUEUE          drop all queued packets
- *  HFI1_SNOOP_IOCCLEARFILTER         remove the active filter
- *  HFI1_SNOOP_IOCSETFILTER           install a filter from a
- *                                    struct hfi1_packet_filter_command
- *  HFI1_SNOOP_IOCGETVERSION          return SNOOP_CAPTURE_VERSION
- *  HFI1_SNOOP_IOCSET_OPTS            set snoop_flags / link credits
- *
- * Returns 0 on success, -ENOTTY for an unknown command or another negative
- * errno on failure.
- */
-static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
-{
-	struct hfi1_devdata *dd;
-	void *filter_value = NULL;
-	size_t filter_alloc_len;
-	long ret = 0;
-	int value = 0;
-	u8 phys_state = 0;
-	u8 link_state = 0;
-	u16 dev_state = 0;
-	unsigned long flags = 0;
-	struct hfi1_packet_filter_command filter_cmd = {0};
-	int mode_flag = 0;
-	struct hfi1_pportdata *ppd = NULL;
-	unsigned int index;
-	struct hfi1_link_info link_info;
-	int read_cmd, write_cmd, read_ok, write_ok;
-
-	dd = hfi1_dd_from_sc_inode(fp->f_inode);
-	if (!dd)
-		return -ENODEV;
-
-	mode_flag = dd->hfi1_snoop.mode_flag;
-	read_cmd = _IOC_DIR(cmd) & _IOC_READ;
-	write_cmd = _IOC_DIR(cmd) & _IOC_WRITE;
-	write_ok = access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd));
-	read_ok = access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd));
-
-	/*
-	 * _IOC_READ means the kernel writes to user memory and _IOC_WRITE
-	 * means it reads from it, hence the crossed checks.
-	 */
-	if ((read_cmd && !write_ok) || (write_cmd && !read_ok))
-		return -EFAULT;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
-	    (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
-	    (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
-	    (cmd != HFI1_SNOOP_IOCSETFILTER))
-		/* Capture devices are allowed only 3 operations
-		 * 1.Clear capture queue
-		 * 2.Clear capture filter
-		 * 3.Set capture filter
-		 * Other are invalid.
-		 */
-		return -EINVAL;
-
-	switch (cmd) {
-	case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
-		memset(&link_info, 0, sizeof(link_info));
-
-		if (copy_from_user(&link_info,
-				   (struct hfi1_link_info __user *)arg,
-				   sizeof(link_info)))
-			return -EFAULT;
-
-		value = link_info.port_state;
-		index = link_info.port_number;
-		if (index > dd->num_pports - 1)
-			return -EINVAL;
-
-		ppd = &dd->pport[index];
-		if (!ppd)
-			return -EINVAL;
-
-		/* What we want to transition to */
-		phys_state = (value >> 4) & 0xF;
-		link_state = value & 0xF;
-		snoop_dbg("Setting link state 0x%x", value);
-
-		switch (link_state) {
-		case IB_PORT_NOP:
-			if (phys_state == 0)
-				break;
-				/* fall through */
-		case IB_PORT_DOWN:
-			switch (phys_state) {
-			case 0:
-				dev_state = HLS_DN_DOWNDEF;
-				break;
-			case 2:
-				dev_state = HLS_DN_POLL;
-				break;
-			case 3:
-				dev_state = HLS_DN_DISABLE;
-				break;
-			default:
-				return -EINVAL;
-			}
-			ret = set_link_state(ppd, dev_state);
-			break;
-		case IB_PORT_ARMED:
-			ret = set_link_state(ppd, HLS_UP_ARMED);
-			if (!ret)
-				send_idle_sma(dd, SMA_IDLE_ARM);
-			break;
-		case IB_PORT_ACTIVE:
-			ret = set_link_state(ppd, HLS_UP_ACTIVE);
-			if (!ret)
-				send_idle_sma(dd, SMA_IDLE_ACTIVE);
-			break;
-		default:
-			return -EINVAL;
-		}
-
-		if (ret)
-			break;
-		/* fall through to report the resulting state */
-	case HFI1_SNOOP_IOCGETLINKSTATE:
-	case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
-		if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
-			memset(&link_info, 0, sizeof(link_info));
-			if (copy_from_user(&link_info,
-					   (struct hfi1_link_info __user *)arg,
-					   sizeof(link_info)))
-				return -EFAULT;
-			index = link_info.port_number;
-		} else if (cmd == HFI1_SNOOP_IOCGETLINKSTATE) {
-			ret = __get_user(index, (int __user *)arg);
-			if (ret !=  0)
-				break;
-		}
-		/*
-		 * For HFI1_SNOOP_IOCSETLINKSTATE_EXTRA, arg points at a
-		 * struct hfi1_link_info, not at a bare int, so the index must
-		 * not be re-read here: link_info and index were already
-		 * filled in from user space above and are reused as is.
-		 */
-
-		if (index > dd->num_pports - 1)
-			return -EINVAL;
-
-		ppd = &dd->pport[index];
-		if (!ppd)
-			return -EINVAL;
-
-		/* Report physical state in the high nibble, link state low */
-		value = hfi1_ibphys_portstate(ppd);
-		value <<= 4;
-		value |= driver_lstate(ppd);
-
-		snoop_dbg("Link port | Link State: %d", value);
-
-		if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
-		    (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
-			link_info.port_state = value;
-			link_info.node_guid = cpu_to_be64(ppd->guid);
-			link_info.link_speed_active =
-						ppd->link_speed_active;
-			link_info.link_width_active =
-						ppd->link_width_active;
-			if (copy_to_user((struct hfi1_link_info __user *)arg,
-					 &link_info, sizeof(link_info)))
-				return -EFAULT;
-		} else {
-			ret = __put_user(value, (int __user *)arg);
-		}
-		break;
-
-	case HFI1_SNOOP_IOCCLEARQUEUE:
-		snoop_dbg("Clearing snoop queue");
-		spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-		drain_snoop_list(&dd->hfi1_snoop.queue);
-		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-		break;
-
-	case HFI1_SNOOP_IOCCLEARFILTER:
-		snoop_dbg("Clearing filter");
-		spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-		if (dd->hfi1_snoop.filter_callback) {
-			/* Drain packets first */
-			drain_snoop_list(&dd->hfi1_snoop.queue);
-			dd->hfi1_snoop.filter_callback = NULL;
-		}
-		kfree(dd->hfi1_snoop.filter_value);
-		dd->hfi1_snoop.filter_value = NULL;
-		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-		break;
-
-	case HFI1_SNOOP_IOCSETFILTER:
-		snoop_dbg("Setting filter");
-		/* just copy command structure */
-		if (copy_from_user(&filter_cmd, (void __user *)arg,
-				   sizeof(filter_cmd)))
-			return -EFAULT;
-
-		if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
-			pr_alert("Invalid opcode in request\n");
-			return -EINVAL;
-		}
-
-		snoop_dbg("Opcode %d Len %d Ptr %p",
-			  filter_cmd.opcode, filter_cmd.length,
-			  filter_cmd.value_ptr);
-
-		/*
-		 * A zero-length value would make kcalloc() hand back a
-		 * non-NULL pointer that must not be dereferenced, which the
-		 * filters' NULL checks cannot catch.
-		 */
-		if (!filter_cmd.length)
-			return -EINVAL;
-
-		/*
-		 * The filter callbacks read the value as a u8, u16 or u32.
-		 * Always allocate (zeroed) room for the widest of those so a
-		 * short user value cannot cause a read past the buffer.
-		 */
-		filter_alloc_len = filter_cmd.length;
-		if (filter_alloc_len < sizeof(u32))
-			filter_alloc_len = sizeof(u32);
-
-		filter_value = kcalloc(filter_alloc_len, sizeof(u8),
-				       GFP_KERNEL);
-		if (!filter_value)
-			return -ENOMEM;
-
-		/* copy remaining data from userspace */
-		if (copy_from_user((u8 *)filter_value,
-				   (void __user *)filter_cmd.value_ptr,
-				   filter_cmd.length)) {
-			kfree(filter_value);
-			return -EFAULT;
-		}
-		/* Drain packets first */
-		spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-		drain_snoop_list(&dd->hfi1_snoop.queue);
-		dd->hfi1_snoop.filter_callback =
-			hfi1_filters[filter_cmd.opcode].filter;
-		/* just in case we see back to back sets */
-		kfree(dd->hfi1_snoop.filter_value);
-		dd->hfi1_snoop.filter_value = filter_value;
-		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-		break;
-	case HFI1_SNOOP_IOCGETVERSION:
-		value = SNOOP_CAPTURE_VERSION;
-		snoop_dbg("Getting version: %d", value);
-		ret = __put_user(value, (int __user *)arg);
-		break;
-	case HFI1_SNOOP_IOCSET_OPTS:
-		snoop_flags = 0;
-		ret = __get_user(value, (int __user *)arg);
-		if (ret != 0)
-			break;
-
-		snoop_dbg("Setting snoop option %d", value);
-		if (value & SNOOP_DROP_SEND)
-			snoop_flags |= SNOOP_DROP_SEND;
-		if (value & SNOOP_USE_METADATA)
-			snoop_flags |= SNOOP_USE_METADATA;
-		if (value & (SNOOP_SET_VL0TOVL15)) {
-			ppd = &dd->pport[0];  /* first port will do */
-			ret = hfi1_assign_snoop_link_credits(ppd, value);
-		}
-		break;
-	default:
-		return -ENOTTY;
-	}
-
-	return ret;
-}
-
-/*
- * Queue @packet for the snoop/capture reader and wake any waiter.
- *
- * Ownership of @packet passes to this function: it is either placed on the
- * device's snoop queue (from where a reader dequeues and frees it) or, if
- * the device is no longer in snoop or capture mode, freed here.  Callers
- * must not touch @packet after this call.
- */
-static void snoop_list_add_tail(struct snoop_packet *packet,
-				struct hfi1_devdata *dd)
-{
-	unsigned long flags = 0;
-	bool queued = false;
-
-	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-	if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
-		   (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
-		list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
-		queued = true;
-		snoop_dbg("Added packet to list");
-	}
-
-	/*
-	 * Technically the snoop device could have been closed while we were
-	 * waiting on the above lock and it is gone now. The snoop mode_flag
-	 * prevents us from adding the packet to the queue in that case.
-	 */
-
-	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-
-	/* Nobody will ever dequeue an unqueued packet, so release it now */
-	if (unlikely(!queued))
-		kfree(packet);
-
-	wake_up_interruptible(&dd->hfi1_snoop.waitq);
-}
-
-/*
- * Common argument check for the filter callbacks.  Returns 0 when @val is
- * non-NULL; otherwise logs which argument (@msg) was missing and returns
- * HFI1_FILTER_ERR.
- */
-static inline int hfi1_filter_check(void *val, const char *msg)
-{
-	if (val)
-		return 0;
-
-	snoop_dbg("Error invalid %s value for filter", msg);
-	return HFI1_FILTER_ERR;
-}
-
-/*
- * Filter on LID: hit when the u16 at @value equals LRH word 3 of @ibhdr
- * (converted from big endian), which this filter treats as the source LID.
- * @packet_data is unused.  Returns HFI1_FILTER_ERR if @ibhdr or @value is
- * NULL.
- */
-static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (!rc)
-		rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	/* matches slid? */
-	return (be16_to_cpu(hdr->lrh[3]) == *((u16 *)value)) ?
-		HFI1_FILTER_HIT : HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on destination LID: hit when the u16 at @value equals LRH word 1
- * of @ibhdr (converted from big endian).  @packet_data is unused.  Returns
- * HFI1_FILTER_ERR if @ibhdr or @value is NULL.
- */
-static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (!rc)
-		rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	return (be16_to_cpu(hdr->lrh[1]) == *((u16 *)value)) ?
-		HFI1_FILTER_HIT : HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on MAD management class.
- *
- * Not valid for outgoing packets: the send handler passes NULL for
- * @packet_data, which makes this filter return HFI1_FILTER_ERR.
- *
- * Only packets whose BTH QPN is 0 or 1 are examined; for those the u8 at
- * @value is compared with the mgmt_class of the ib_smp at @packet_data.
- * Any other QPN yields HFI1_FILTER_ERR.
- */
-static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
-				      void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	struct hfi1_other_headers *ohdr;
-	struct ib_smp *smp = packet_data;
-	u32 qpn;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (rc)
-		return rc;
-	rc = hfi1_filter_check(packet_data, "packet_data");
-	if (rc)
-		return rc;
-	rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	/* Check for GRH: without one it is LRH + BTH + DETH */
-	ohdr = ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) ?
-		&hdr->u.oth :	/* LRH + BTH + DETH */
-		&hdr->u.l.oth;	/* LRH + GRH + BTH + DETH */
-
-	qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
-	if (qpn > 1)
-		return HFI1_FILTER_ERR;
-
-	if (smp->mgmt_class == *((u8 *)value))
-		return HFI1_FILTER_HIT;
-
-	return HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on QP number: hit when the u32 at @value equals the low 24 bits of
- * BTH word 1.  @packet_data is unused.  Returns HFI1_FILTER_ERR if @ibhdr
- * or @value is NULL.
- */
-static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	struct hfi1_other_headers *ohdr;
-	u32 qpn;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (!rc)
-		rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	/* Check for GRH */
-	if ((be16_to_cpu(hdr->lrh[0]) & 3) != HFI1_LRH_BTH)
-		ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
-	else
-		ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
-
-	qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
-
-	return (qpn == *((u32 *)value)) ? HFI1_FILTER_HIT : HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on IB packet type: hit when the u8 at @value equals the top three
- * bits of the BTH opcode (the most significant byte of BTH word 0).
- * @packet_data is unused.  Returns HFI1_FILTER_ERR if @ibhdr or @value is
- * NULL, or if the LRH next-header field is neither HFI1_LRH_BTH nor
- * HFI1_LRH_GRH.
- */
-static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
-				     void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	struct hfi1_other_headers *ohdr;
-	u8 opcode;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (!rc)
-		rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	switch (be16_to_cpu(hdr->lrh[0]) & 3) {
-	case HFI1_LRH_BTH:
-		ohdr = &hdr->u.oth;
-		break;
-	case HFI1_LRH_GRH:
-		ohdr = &hdr->u.l.oth;
-		break;
-	default:
-		return HFI1_FILTER_ERR;
-	}
-
-	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-
-	if (((opcode >> 5) & 0x7) == *((u8 *)value))
-		return HFI1_FILTER_HIT;
-
-	return HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on service level: hit when the u8 at @value equals bits 7:4 of LRH
- * word 0 (converted from big endian).  @packet_data is unused.  Returns
- * HFI1_FILTER_ERR if @ibhdr or @value is NULL.
- */
-static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
-					void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (!rc)
-		rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	return (((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF) == (*((u8 *)value))) ?
-		HFI1_FILTER_HIT : HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on P_Key: hit when the low 15 bits of the u16 at @value equal the
- * low 15 bits of BTH word 0.  @packet_data is unused.  Returns
- * HFI1_FILTER_ERR if @ibhdr or @value is NULL, or if the LRH next-header
- * field is neither HFI1_LRH_BTH nor HFI1_LRH_GRH.
- */
-static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
-{
-	struct hfi1_ib_header *hdr = ibhdr;
-	struct hfi1_other_headers *ohdr;
-	u32 lnh;
-	int rc;
-
-	rc = hfi1_filter_check(ibhdr, "header");
-	if (!rc)
-		rc = hfi1_filter_check(value, "user");
-	if (rc)
-		return rc;
-
-	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-	switch (lnh) {
-	case HFI1_LRH_BTH:
-		ohdr = &hdr->u.oth;
-		break;
-	case HFI1_LRH_GRH:
-		ohdr = &hdr->u.l.oth;
-		break;
-	default:
-		return HFI1_FILTER_ERR;
-	}
-
-	/*
-	 * P_key is a 16-bit entity, however the top most bit indicates the
-	 * type of membership: 0 for limited and 1 for full.  Limited members
-	 * cannot accept information from other limited members, but
-	 * communication is allowed between every other combination of
-	 * membership.  Hence the top-most bit is left out of the comparison.
-	 */
-	if (((be32_to_cpu(ohdr->bth[0])) & 0x7FFF) ==
-		(*(u16 *)value & 0x7FFF))
-		return HFI1_FILTER_HIT;
-
-	return HFI1_FILTER_MISS;
-}
-
-/*
- * Filter on packet direction.
- *
- * If packet_data is NULL then this is coming from one of the send functions.
- * Thus we know if its an ingressed or egressed packet.  The u8 at @value is
- * a mask of HFI1_SNOOP_INGRESS / HFI1_SNOOP_EGRESS; the filter hits when the
- * bit for the packet's direction is set.  @ibhdr is unused.
- *
- * Returns HFI1_FILTER_HIT, HFI1_FILTER_MISS, or HFI1_FILTER_ERR when @value
- * is NULL.
- */
-static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
-{
-	u8 user_dir;
-	int ret;
-
-	ret = hfi1_filter_check(value, "user");
-	if (ret)
-		return ret;
-
-	/* Only read the user value once it is known to be non-NULL */
-	user_dir = *(u8 *)value;
-
-	if (packet_data) {
-		/* Incoming packet */
-		if (user_dir & HFI1_SNOOP_INGRESS)
-			return HFI1_FILTER_HIT;
-	} else {
-		/* Outgoing packet */
-		if (user_dir & HFI1_SNOOP_EGRESS)
-			return HFI1_FILTER_HIT;
-	}
-
-	return HFI1_FILTER_MISS;
-}
-
-/*
- * Allocate a snoop packet: the structure that is stored on the snoop queue,
- * not to be confused with an hfi packet type.
- *
- * The allocation is zeroed, atomic and silent on failure, and is sized for
- * the structure itself plus @hdr_len + @data_len + @md_len trailing bytes.
- * Returns the packet with its list head initialised, or NULL if the
- * allocation failed.
- */
-static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
-						  u32 data_len,
-						  u32 md_len)
-{
-	struct snoop_packet *pkt;
-
-	pkt = kzalloc(sizeof(*pkt) + hdr_len + data_len + md_len,
-		      GFP_ATOMIC | __GFP_NOWARN);
-	if (unlikely(!pkt))
-		return pkt;
-
-	INIT_LIST_HEAD(&pkt->list);
-
-	return pkt;
-}
-
-/*
- * Receive-side entry point used while snoop/capture is active.
- *
- * Instead of having snoop and capture code intermixed with the recv
- * functions, both the interrupt handler and hfi1_ib_rcv() are hijacked to
- * land in here; when snoop/capture is not enabled the call goes through as
- * before. This gives us a single point to constrain all of the snoop recv
- * logic. There is nothing special that needs to happen for bypass packets.
- * This routine should not try to look into the packet, it just copies it.
- * There is no guarantee for filters when it comes to bypass packets as there
- * is no specific support. Bottom line is this routine does not even know
- * what a bypass packet is.
- *
- * A packet that passes the filter (or any packet when no filter is set) is
- * copied, optionally prefixed with a struct capture_md, and queued for the
- * reader.  In snoop mode the original is then dropped (RHF_RCV_CONTINUE);
- * in every other case it is passed on to the normal receive handler for its
- * type and that handler's result is returned.
- */
-int snoop_recv_handler(struct hfi1_packet *packet)
-{
-	struct hfi1_pportdata *ppd = packet->rcd->ppd;
-	struct hfi1_devdata *dd = ppd->dd;
-	struct hfi1_ib_header *hdr = packet->hdr;
-	int header_size = packet->hlen;
-	void *data = packet->ebuf;
-	u32 tlen = packet->tlen;
-	struct snoop_packet *s_packet;
-	struct capture_md md;
-	u32 md_len = 0;
-	int snoop_mode = 0;
-	int ret;
-
-	snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
-		  data);
-
-	trace_snoop_capture(dd, header_size, hdr, tlen - header_size,
-			    data);
-
-	if (dd->hfi1_snoop.filter_callback) {
-		ret = dd->hfi1_snoop.filter_callback(hdr, data,
-					dd->hfi1_snoop.filter_value);
-	} else {
-		snoop_dbg("filter not set");
-		ret = HFI1_FILTER_HIT;
-	}
-
-	/* Anything but a hit goes straight to the normal handler */
-	switch (ret) {
-	case HFI1_FILTER_HIT:
-		break;
-	case HFI1_FILTER_ERR:
-		snoop_dbg("Error in filter call");
-		goto normal_rcv;
-	case HFI1_FILTER_MISS:
-		snoop_dbg("Filter Miss");
-		goto normal_rcv;
-	default:
-		goto normal_rcv;
-	}
-
-	/* Metadata is prepended when capturing, or on request when snooping */
-	if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-		snoop_mode = 1;
-	if ((snoop_mode == 0) ||
-	    unlikely(snoop_flags & SNOOP_USE_METADATA))
-		md_len = sizeof(struct capture_md);
-
-	s_packet = allocate_snoop_packet(header_size,
-					 tlen - header_size,
-					 md_len);
-	if (unlikely(!s_packet)) {
-		dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
-		goto normal_rcv;
-	}
-
-	if (md_len > 0) {
-		memset(&md, 0, sizeof(struct capture_md));
-		md.port = 1;
-		md.dir = PKT_DIR_INGRESS;
-		md.u.rhf = packet->rhf;
-		memcpy(s_packet->data, &md, md_len);
-	}
-
-	/* We should always have a header */
-	if (!hdr) {
-		dd_dev_err(dd, "Unable to copy header to snoop/capture packet\n");
-		kfree(s_packet);
-		goto normal_rcv;
-	}
-	memcpy(s_packet->data + md_len, hdr, header_size);
-
-	/*
-	 * Packets with no data are possible. In that case the last 4 bytes,
-	 * which are normally included with data buffers and are counted in
-	 * tlen, are not copied. Since the buffer came from kzalloc they are
-	 * already zero; if we ever stop using kzalloc they must be zeroed
-	 * explicitly.
-	 */
-	if (data)
-		memcpy(s_packet->data + header_size + md_len, data,
-		       tlen - header_size);
-
-	s_packet->total_len = tlen + md_len;
-	snoop_list_add_tail(s_packet, dd);
-
-	snoop_dbg("Capturing packet");
-
-	/*
-	 * If we are snooping the packet, not capturing, then throw it away
-	 * after adding the copy to the list.
-	 */
-	if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
-		snoop_dbg("Throwing packet away");
-		/*
-		 * If we are dropping the packet we still may need to
-		 * handle the case where error flags are set, this is
-		 * normally done by the type specific handler but that
-		 * won't be called in this case.
-		 */
-		if (unlikely(rhf_err_flags(packet->rhf)))
-			handle_eflags(packet);
-
-		/* throw the packet on the floor */
-		return RHF_RCV_CONTINUE;
-	}
-
-normal_rcv:
-	/*
-	 * We do not care what type of packet came in here - just pass it off
-	 * to the normal handler.
-	 */
-	return dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
-			(packet);
-}
-
-/*
- * SDMA send hook for snoop/capture.
- *
- * Snooping or capturing SDMA sends is not supported: this only logs that
- * fact and forwards the request to hfi1_verbs_send_dma() with a PBC of 0
- * (the @pbc argument is not passed on), returning its result.
- */
-int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-			   u64 pbc)
-{
-	int rc;
-
-	pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n");
-	snoop_dbg("Unsupported Operation");
-
-	rc = hfi1_verbs_send_dma(qp, ps, 0);
-
-	return rc;
-}
-
-/*
- * Handle snooping and capturing packets when pio is being used. Does not handle
- * bypass packets. The only way to send a bypass packet currently is to use the
- * diagpkt interface. When that interface is enable snoop/capture is not.
- */
-int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-			   u64 pbc)
-{
-	u32 hdrwords = qp->s_hdrwords;
-	struct rvt_sge_state *ss = qp->s_cur_sge;
-	u32 len = qp->s_cur_size;
-	u32 dwords = (len + 3) >> 2;
-	u32 plen = hdrwords + dwords + 2; /* includes pbc */
-	struct hfi1_pportdata *ppd = ps->ppd;
-	struct snoop_packet *s_packet = NULL;
-	u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
-	u32 length = 0;
-	struct rvt_sge_state temp_ss;
-	void *data = NULL;
-	void *data_start = NULL;
-	int ret;
-	int snoop_mode = 0;
-	int md_len = 0;
-	struct capture_md md;
-	u32 vl;
-	u32 hdr_len = hdrwords << 2;
-	u32 tlen = HFI1_GET_PKT_LEN(&ps->s_txreq->phdr.hdr);
-
-	md.u.pbc = 0;
-
-	snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
-		  hdrwords, len, plen, dwords, tlen);
-	if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-		snoop_mode = 1;
-	if ((snoop_mode == 0) ||
-	    unlikely(snoop_flags & SNOOP_USE_METADATA))
-		md_len = sizeof(struct capture_md);
-
-	/* not using ss->total_len as arg 2 b/c that does not count CRC */
-	s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
-
-	if (unlikely(!s_packet)) {
-		dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
-		goto out;
-	}
-
-	s_packet->total_len = tlen + md_len;
-
-	if (md_len > 0) {
-		memset(&md, 0, sizeof(struct capture_md));
-		md.port = 1;
-		md.dir = PKT_DIR_EGRESS;
-		if (likely(pbc == 0)) {
-			vl = be16_to_cpu(ps->s_txreq->phdr.hdr.lrh[0]) >> 12;
-			md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
-		} else {
-			md.u.pbc = 0;
-		}
-		memcpy(s_packet->data, &md, md_len);
-	} else {
-		md.u.pbc = pbc;
-	}
-
-	/* Copy header */
-	if (likely(hdr)) {
-		memcpy(s_packet->data + md_len, hdr, hdr_len);
-	} else {
-		dd_dev_err(ppd->dd,
-			   "Unable to copy header to snoop/capture packet\n");
-		kfree(s_packet);
-		goto out;
-	}
-
-	if (ss) {
-		data = s_packet->data + hdr_len + md_len;
-		data_start = data;
-
-		/*
-		 * Copy SGE State
-		 * The update_sge() function below will not modify the
-		 * individual SGEs in the array. It will make a copy each time
-		 * and operate on that. So we only need to copy this instance
-		 * and it won't impact PIO.
-		 */
-		temp_ss = *ss;
-		length = len;
-
-		snoop_dbg("Need to copy %d bytes", length);
-		while (length) {
-			void *addr = temp_ss.sge.vaddr;
-			u32 slen = temp_ss.sge.length;
-
-			if (slen > length) {
-				slen = length;
-				snoop_dbg("slen %d > len %d", slen, length);
-			}
-			snoop_dbg("copy %d to %p", slen, addr);
-			memcpy(data, addr, slen);
-			update_sge(&temp_ss, slen);
-			length -= slen;
-			data += slen;
-			snoop_dbg("data is now %p bytes left %d", data, length);
-		}
-		snoop_dbg("Completed SGE copy");
-	}
-
-	/*
-	 * Why do the filter check down here? Because the event tracing has its
-	 * own filtering and we need to have the walked the SGE list.
-	 */
-	if (!ppd->dd->hfi1_snoop.filter_callback) {
-		snoop_dbg("filter not set\n");
-		ret = HFI1_FILTER_HIT;
-	} else {
-		ret = ppd->dd->hfi1_snoop.filter_callback(
-					&ps->s_txreq->phdr.hdr,
-					NULL,
-					ppd->dd->hfi1_snoop.filter_value);
-	}
-
-	switch (ret) {
-	case HFI1_FILTER_ERR:
-		snoop_dbg("Error in filter call");
-		/* fall through */
-	case HFI1_FILTER_MISS:
-		snoop_dbg("Filter Miss");
-		kfree(s_packet);
-		break;
-	case HFI1_FILTER_HIT:
-		snoop_dbg("Capturing packet");
-		snoop_list_add_tail(s_packet, ppd->dd);
-
-		if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
-			     (ppd->dd->hfi1_snoop.mode_flag &
-			      HFI1_PORT_SNOOP_MODE))) {
-			unsigned long flags;
-
-			snoop_dbg("Dropping packet");
-			if (qp->s_wqe) {
-				spin_lock_irqsave(&qp->s_lock, flags);
-				hfi1_send_complete(
-					qp,
-					qp->s_wqe,
-					IB_WC_SUCCESS);
-				spin_unlock_irqrestore(&qp->s_lock, flags);
-			} else if (qp->ibqp.qp_type == IB_QPT_RC) {
-				spin_lock_irqsave(&qp->s_lock, flags);
-				hfi1_rc_send_complete(qp,
-						      &ps->s_txreq->phdr.hdr);
-				spin_unlock_irqrestore(&qp->s_lock, flags);
-			}
-
-			/*
-			 * If snoop is dropping the packet we need to put the
-			 * txreq back because no one else will.
-			 */
-			hfi1_put_txreq(ps->s_txreq);
-			return 0;
-		}
-		break;
-	default:
-		kfree(s_packet);
-		break;
-	}
-out:
-	return hfi1_verbs_send_pio(qp, ps, md.u.pbc);
-}
-
-/*
- * Callers of this must pass a hfi1_ib_header type for the from ptr. Currently
- * this can be used anywhere, but the intention is for inline ACKs for RC and
- * CCA packets. We don't restrict this usage though.
- */
-void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-			   u64 pbc, const void *from, size_t count)
-{
-	int snoop_mode = 0;
-	int md_len = 0;
-	struct capture_md md;
-	struct snoop_packet *s_packet = NULL;
-
-	/*
-	 * count is in dwords so we need to convert to bytes.
-	 * We also need to account for CRC which would be tacked on by hardware.
-	 */
-	int packet_len = (count << 2) + 4;
-	int ret;
-
-	snoop_dbg("ACK OUT: len %d", packet_len);
-
-	if (!dd->hfi1_snoop.filter_callback) {
-		snoop_dbg("filter not set");
-		ret = HFI1_FILTER_HIT;
-	} else {
-		ret = dd->hfi1_snoop.filter_callback(
-				(struct hfi1_ib_header *)from,
-				NULL,
-				dd->hfi1_snoop.filter_value);
-	}
-
-	switch (ret) {
-	case HFI1_FILTER_ERR:
-		snoop_dbg("Error in filter call");
-		/* fall through */
-	case HFI1_FILTER_MISS:
-		snoop_dbg("Filter Miss");
-		break;
-	case HFI1_FILTER_HIT:
-		snoop_dbg("Capturing packet");
-		if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-			snoop_mode = 1;
-		if ((snoop_mode == 0) ||
-		    unlikely(snoop_flags & SNOOP_USE_METADATA))
-			md_len = sizeof(struct capture_md);
-
-		s_packet = allocate_snoop_packet(packet_len, 0, md_len);
-
-		if (unlikely(!s_packet)) {
-			dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
-			goto inline_pio_out;
-		}
-
-		s_packet->total_len = packet_len + md_len;
-
-		/* Fill in the metadata for the packet */
-		if (md_len > 0) {
-			memset(&md, 0, sizeof(struct capture_md));
-			md.port = 1;
-			md.dir = PKT_DIR_EGRESS;
-			md.u.pbc = pbc;
-			memcpy(s_packet->data, &md, md_len);
-		}
-
-		/* Add the packet data which is a single buffer */
-		memcpy(s_packet->data + md_len, from, packet_len);
-
-		snoop_list_add_tail(s_packet, dd);
-
-		if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
-			snoop_dbg("Dropping packet");
-			return;
-		}
-		break;
-	default:
-		break;
-	}
-
-inline_pio_out:
-	pio_copy(dd, pbuf, pbc, from, count);
-}
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
deleted file mode 100644
index bd87715..0000000
--- a/drivers/staging/rdma/hfi1/eprom.c
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/delay.h>
-#include "hfi.h"
-#include "common.h"
-#include "eprom.h"
-
-/*
- * The EPROM is logically divided into three partitions:
- *	partition 0: the first 128K, visible from PCI ROM BAR
- *	partition 1: 4K config file (sector size)
- *	partition 2: the rest
- */
-#define P0_SIZE (128 * 1024)
-#define P1_SIZE   (4 * 1024)
-#define P1_START P0_SIZE
-#define P2_START (P0_SIZE + P1_SIZE)
-
-/* erase sizes supported by the controller */
-#define SIZE_4KB (4 * 1024)
-#define MASK_4KB (SIZE_4KB - 1)
-
-#define SIZE_32KB (32 * 1024)
-#define MASK_32KB (SIZE_32KB - 1)
-
-#define SIZE_64KB (64 * 1024)
-#define MASK_64KB (SIZE_64KB - 1)
-
-/* controller page size, in bytes */
-#define EP_PAGE_SIZE 256
-#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
-
-/* controller commands */
-#define CMD_SHIFT 24
-#define CMD_NOP			    (0)
-#define CMD_PAGE_PROGRAM(addr)	    ((0x02 << CMD_SHIFT) | addr)
-#define CMD_READ_DATA(addr)	    ((0x03 << CMD_SHIFT) | addr)
-#define CMD_READ_SR1		    ((0x05 << CMD_SHIFT))
-#define CMD_WRITE_ENABLE	    ((0x06 << CMD_SHIFT))
-#define CMD_SECTOR_ERASE_4KB(addr)  ((0x20 << CMD_SHIFT) | addr)
-#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
-#define CMD_CHIP_ERASE		    ((0x60 << CMD_SHIFT))
-#define CMD_READ_MANUF_DEV_ID	    ((0x90 << CMD_SHIFT))
-#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT))
-#define CMD_SECTOR_ERASE_64KB(addr) ((0xd8 << CMD_SHIFT) | addr)
-
-/* controller interface speeds */
-#define EP_SPEED_FULL 0x2	/* full speed */
-
-/* controller status register 1 bits */
-#define SR1_BUSY 0x1ull		/* the BUSY bit in SR1 */
-
-/* sleep length while waiting for controller */
-#define WAIT_SLEEP_US 100	/* must be larger than 5 (see usage) */
-#define COUNT_DELAY_SEC(n) ((n) * (1000000 / WAIT_SLEEP_US))
-
-/* GPIO pins */
-#define EPROM_WP_N BIT_ULL(14)	/* EPROM write line */
-
-/*
- * How long to wait for the EPROM to become available, in ms.
- * The spec 32 Mb EPROM takes around 40s to erase then write.
- * Double it for safety.
- */
-#define EPROM_TIMEOUT 80000 /* ms */
-
-/*
- * Turn on external enable line that allows writing on the flash.
- */
-static void write_enable(struct hfi1_devdata *dd)
-{
-	/* raise signal */
-	write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
-	/* raise enable */
-	write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
-}
-
-/*
- * Turn off external enable line that allows writing on the flash.
- */
-static void write_disable(struct hfi1_devdata *dd)
-{
-	/* lower signal */
-	write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
-	/* lower enable */
-	write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
-}
-
-/*
- * Wait for the device to become not busy.  Must be called after all
- * write or erase operations.
- */
-static int wait_for_not_busy(struct hfi1_devdata *dd)
-{
-	unsigned long count = 0;
-	u64 reg;
-	int ret = 0;
-
-	/* starts page mode */
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
-	while (1) {
-		udelay(WAIT_SLEEP_US);
-		usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
-		count++;
-		reg = read_csr(dd, ASIC_EEP_DATA);
-		if ((reg & SR1_BUSY) == 0)
-			break;
-		/* 200s is the largest time for a 128Mb device */
-		if (count > COUNT_DELAY_SEC(200)) {
-			dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
-			ret = -ETIMEDOUT;
-			break; /* break, not goto - must stop page mode */
-		}
-	}
-
-	/* stop page mode with a NOP */
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
-
-	return ret;
-}
-
-/*
- * Read the device ID from the SPI controller.
- */
-static u32 read_device_id(struct hfi1_devdata *dd)
-{
-	/* read the Manufacture Device ID */
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
-	return (u32)read_csr(dd, ASIC_EEP_DATA);
-}
-
-/*
- * Erase the whole flash.
- */
-static int erase_chip(struct hfi1_devdata *dd)
-{
-	int ret;
-
-	write_enable(dd);
-
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
-	ret = wait_for_not_busy(dd);
-
-	write_disable(dd);
-
-	return ret;
-}
-
-/*
- * Erase a range.
- */
-static int erase_range(struct hfi1_devdata *dd, u32 start, u32 len)
-{
-	u32 end = start + len;
-	int ret = 0;
-
-	if (end < start)
-		return -EINVAL;
-
-	/* check the end points for the minimum erase */
-	if ((start & MASK_4KB) || (end & MASK_4KB)) {
-		dd_dev_err(dd,
-			   "%s: non-aligned range (0x%x,0x%x) for a 4KB erase\n",
-			   __func__, start, end);
-		return -EINVAL;
-	}
-
-	write_enable(dd);
-
-	while (start < end) {
-		write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-		/* check in order of largest to smallest */
-		if (((start & MASK_64KB) == 0) && (start + SIZE_64KB <= end)) {
-			write_csr(dd, ASIC_EEP_ADDR_CMD,
-				  CMD_SECTOR_ERASE_64KB(start));
-			start += SIZE_64KB;
-		} else if (((start & MASK_32KB) == 0) &&
-			   (start + SIZE_32KB <= end)) {
-			write_csr(dd, ASIC_EEP_ADDR_CMD,
-				  CMD_SECTOR_ERASE_32KB(start));
-			start += SIZE_32KB;
-		} else {	/* 4KB will work */
-			write_csr(dd, ASIC_EEP_ADDR_CMD,
-				  CMD_SECTOR_ERASE_4KB(start));
-			start += SIZE_4KB;
-		}
-		ret = wait_for_not_busy(dd);
-		if (ret)
-			goto done;
-	}
-
-done:
-	write_disable(dd);
-
-	return ret;
-}
-
-/*
- * Read a 256 byte (64 dword) EPROM page.
- * All callers have verified the offset is at a page boundary.
- */
-static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
-{
-	int i;
-
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
-	for (i = 0; i < EP_PAGE_SIZE / sizeof(u32); i++)
-		result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
-}
-
-/*
- * Read length bytes starting at offset.  Copy to user address addr.
- */
-static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
-{
-	u32 offset;
-	u32 buffer[EP_PAGE_SIZE / sizeof(u32)];
-	int ret = 0;
-
-	/* reject anything not on an EPROM page boundary */
-	if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
-		return -EINVAL;
-
-	for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
-		read_page(dd, start + offset, buffer);
-		if (copy_to_user((void __user *)(addr + offset),
-				 buffer, EP_PAGE_SIZE)) {
-			ret = -EFAULT;
-			goto done;
-		}
-	}
-
-done:
-	return ret;
-}
-
-/*
- * Write a 256 byte (64 dword) EPROM page.
- * All callers have verified the offset is at a page boundary.
- */
-static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
-{
-	int i;
-
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-	write_csr(dd, ASIC_EEP_DATA, data[0]);
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
-	for (i = 1; i < EP_PAGE_SIZE / sizeof(u32); i++)
-		write_csr(dd, ASIC_EEP_DATA, data[i]);
-	/* will close the open page */
-	return wait_for_not_busy(dd);
-}
-
-/*
- * Write length bytes starting at offset.  Read from user address addr.
- */
-static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
-{
-	u32 offset;
-	u32 buffer[EP_PAGE_SIZE / sizeof(u32)];
-	int ret = 0;
-
-	/* reject anything not on an EPROM page boundary */
-	if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
-		return -EINVAL;
-
-	write_enable(dd);
-
-	for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
-		if (copy_from_user(buffer, (void __user *)(addr + offset),
-				   EP_PAGE_SIZE)) {
-			ret = -EFAULT;
-			goto done;
-		}
-		ret = write_page(dd, start + offset, buffer);
-		if (ret)
-			goto done;
-	}
-
-done:
-	write_disable(dd);
-	return ret;
-}
-
-/* convert an range composite to a length, in bytes */
-static inline u32 extract_rlen(u32 composite)
-{
-	return (composite & 0xffff) * EP_PAGE_SIZE;
-}
-
-/* convert an range composite to a start, in bytes */
-static inline u32 extract_rstart(u32 composite)
-{
-	return (composite >> 16) * EP_PAGE_SIZE;
-}
-
-/*
- * Perform the given operation on the EPROM.  Called from user space.  The
- * user credentials have already been checked.
- *
- * Return 0 on success, -ERRNO on error
- */
-int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd)
-{
-	struct hfi1_devdata *dd;
-	u32 dev_id;
-	u32 rlen;	/* range length */
-	u32 rstart;	/* range start */
-	int i_minor;
-	int ret = 0;
-
-	/*
-	 * Map the device file to device data using the relative minor.
-	 * The device file minor number is the unit number + 1.  0 is
-	 * the generic device file - reject it.
-	 */
-	i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
-	if (i_minor <= 0)
-		return -EINVAL;
-	dd = hfi1_lookup(i_minor - 1);
-	if (!dd) {
-		pr_err("%s: cannot find unit %d!\n", __func__, i_minor);
-		return -EINVAL;
-	}
-
-	/* some devices do not have an EPROM */
-	if (!dd->eprom_available)
-		return -EOPNOTSUPP;
-
-	ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
-	if (ret) {
-		dd_dev_err(dd, "%s: unable to acquire EPROM resource\n",
-			   __func__);
-		goto done_asic;
-	}
-
-	dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
-		    __func__, cmd->type, cmd->len, cmd->addr);
-
-	switch (cmd->type) {
-	case HFI1_CMD_EP_INFO:
-		if (cmd->len != sizeof(u32)) {
-			ret = -ERANGE;
-			break;
-		}
-		dev_id = read_device_id(dd);
-		/* addr points to a u32 user buffer */
-		if (copy_to_user((void __user *)cmd->addr, &dev_id,
-				 sizeof(u32)))
-			ret = -EFAULT;
-		break;
-
-	case HFI1_CMD_EP_ERASE_CHIP:
-		ret = erase_chip(dd);
-		break;
-
-	case HFI1_CMD_EP_ERASE_RANGE:
-		rlen = extract_rlen(cmd->len);
-		rstart = extract_rstart(cmd->len);
-		ret = erase_range(dd, rstart, rlen);
-		break;
-
-	case HFI1_CMD_EP_READ_RANGE:
-		rlen = extract_rlen(cmd->len);
-		rstart = extract_rstart(cmd->len);
-		ret = read_length(dd, rstart, rlen, cmd->addr);
-		break;
-
-	case HFI1_CMD_EP_WRITE_RANGE:
-		rlen = extract_rlen(cmd->len);
-		rstart = extract_rstart(cmd->len);
-		ret = write_length(dd, rstart, rlen, cmd->addr);
-		break;
-
-	default:
-		dd_dev_err(dd, "%s: unexpected command %d\n",
-			   __func__, cmd->type);
-		ret = -EINVAL;
-		break;
-	}
-
-	release_chip_resource(dd, CR_EPROM);
-done_asic:
-	return ret;
-}
-
-/*
- * Initialize the EPROM handler.
- */
-int eprom_init(struct hfi1_devdata *dd)
-{
-	int ret = 0;
-
-	/* only the discrete chip has an EPROM */
-	if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
-		return 0;
-
-	/*
-	 * It is OK if both HFIs reset the EPROM as long as they don't
-	 * do it at the same time.
-	 */
-	ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
-	if (ret) {
-		dd_dev_err(dd,
-			   "%s: unable to acquire EPROM resource, no EPROM support\n",
-			   __func__);
-		goto done_asic;
-	}
-
-	/* reset EPROM to be sure it is in a good state */
-
-	/* set reset */
-	write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
-	/* clear reset, set speed */
-	write_csr(dd, ASIC_EEP_CTL_STAT,
-		  EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
-
-	/* wake the device with command "release powerdown NoID" */
-	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
-
-	dd->eprom_available = true;
-	release_chip_resource(dd, CR_EPROM);
-done_asic:
-	return ret;
-}
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 37dd534c..c8a773f 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -239,12 +239,15 @@
 
 #define IB_MGMT_CLASSPORTINFO_ATTR_ID	cpu_to_be16(0x0001)
 
+#define IB_CLASS_PORT_INFO_RESP_TIME_MASK	0x1F
+#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5
+
 struct ib_class_port_info {
 	u8			base_version;
 	u8			class_version;
 	__be16			capability_mask;
-	u8			reserved[3];
-	u8			resp_time_value;
+	  /* 27 bits for cap_mask2, 5 bits for resp_time */
+	__be32			cap_mask2_resp_time;
 	u8			redirect_gid[16];
 	__be32			redirect_tcslfl;
 	__be16			redirect_lid;
@@ -259,6 +262,59 @@
 	__be32			trap_qkey;
 };
 
+/**
+ * ib_get_cpi_resp_time - Returns the resp_time value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi)
+{
+	return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) &
+		    IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_set_cpi_resp_time - Sets the response time in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi,
+					u8 rtime)
+{
+	cpi->cap_mask2_resp_time =
+		(cpi->cap_mask2_resp_time &
+		 cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+		cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_get_cpi_capmask2 - Returns the capmask2 value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi)
+{
+	return (be32_to_cpu(cpi->cap_mask2_resp_time) >>
+		IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+/**
+ * ib_set_cpi_capmask2 - Sets the capmask2 in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @capmask2: The capmask2 to set.
+ */
+static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi,
+				       u32 capmask2)
+{
+	cpi->cap_mask2_resp_time =
+		(cpi->cap_mask2_resp_time &
+		 cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+		cpu_to_be32(capmask2 <<
+			    IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
 struct ib_mad_notice_attr {
 	u8 generic_type;
 	u8 prod_type_msb;
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index 0f3daae..b13419c 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -103,6 +103,9 @@
 	IB_OPCODE_ATOMIC_ACKNOWLEDGE                = 0x12,
 	IB_OPCODE_COMPARE_SWAP                      = 0x13,
 	IB_OPCODE_FETCH_ADD                         = 0x14,
+	/* opcode 0x15 is reserved */
+	IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
+	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
 
 	/* real constants follow -- see comment about above IB_OPCODE()
 	   macro for more details */
@@ -129,6 +132,8 @@
 	IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
 	IB_OPCODE(RC, COMPARE_SWAP),
 	IB_OPCODE(RC, FETCH_ADD),
+	IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+	IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
 
 	/* UC */
 	IB_OPCODE(UC, SEND_FIRST),
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index cdc1c81..3840416 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -94,6 +94,8 @@
 	IB_SA_BEST = 3
 };
 
+#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT	BIT(12)
+
 /*
  * Structures for SA records are named "struct ib_sa_xxx_rec."  No
  * attempt is made to pack structures to match the physical layout of
@@ -439,4 +441,14 @@
 			      void *context,
 			      struct ib_sa_query **sa_query);
 
+/* Support get SA ClassPortInfo */
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+				   struct ib_device *device, u8 port_num,
+				   int timeout_ms, gfp_t gfp_mask,
+				   void (*callback)(int status,
+						    struct ib_class_port_info *resp,
+						    void *context),
+				   void *context,
+				   struct ib_sa_query **sa_query);
+
 #endif /* IB_SA_H */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index fc0320c..432bed5 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -403,56 +403,55 @@
 	IB_SPEED_EDR	= 32
 };
 
-struct ib_protocol_stats {
-	/* TBD... */
+/**
+ * struct rdma_hw_stats
+ * @timestamp - Used by the core code to track when the last update was
+ * @lifespan - Used by the core code to determine how old the counters
+ *   should be before being updated again.  Stored in jiffies, defaults
+ *   to 10 milliseconds, drivers can override the default by specifying
+ *   their own value during their allocation routine.
+ * @names - Array of pointers to static names used for the counters in
+ *   directory.
+ * @num_counters - How many hardware counters there are.  If names is
+ *   shorter than this number, a kernel oops will result.  Driver authors
+ *   are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@names) < num_counters)
+ *   in their code to prevent this.
+ * @value - Array of u64 counters that are accessed by the sysfs code and
+ *   filled in by the drivers get_stats routine
+ */
+struct rdma_hw_stats {
+	unsigned long	timestamp;
+	unsigned long	lifespan;
+	const char * const *names;
+	int		num_counters;
+	u64		value[];
 };
 
-struct iw_protocol_stats {
-	u64	ipInReceives;
-	u64	ipInHdrErrors;
-	u64	ipInTooBigErrors;
-	u64	ipInNoRoutes;
-	u64	ipInAddrErrors;
-	u64	ipInUnknownProtos;
-	u64	ipInTruncatedPkts;
-	u64	ipInDiscards;
-	u64	ipInDelivers;
-	u64	ipOutForwDatagrams;
-	u64	ipOutRequests;
-	u64	ipOutDiscards;
-	u64	ipOutNoRoutes;
-	u64	ipReasmTimeout;
-	u64	ipReasmReqds;
-	u64	ipReasmOKs;
-	u64	ipReasmFails;
-	u64	ipFragOKs;
-	u64	ipFragFails;
-	u64	ipFragCreates;
-	u64	ipInMcastPkts;
-	u64	ipOutMcastPkts;
-	u64	ipInBcastPkts;
-	u64	ipOutBcastPkts;
+#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10
+/**
+ * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct
+ *   for drivers.
+ * @names - Array of static const char *
+ * @num_counters - How many elements in array
+ * @lifespan - How many milliseconds between updates
+ */
+static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
+		const char * const *names, int num_counters,
+		unsigned long lifespan)
+{
+	struct rdma_hw_stats *stats;
 
-	u64	tcpRtoAlgorithm;
-	u64	tcpRtoMin;
-	u64	tcpRtoMax;
-	u64	tcpMaxConn;
-	u64	tcpActiveOpens;
-	u64	tcpPassiveOpens;
-	u64	tcpAttemptFails;
-	u64	tcpEstabResets;
-	u64	tcpCurrEstab;
-	u64	tcpInSegs;
-	u64	tcpOutSegs;
-	u64	tcpRetransSegs;
-	u64	tcpInErrs;
-	u64	tcpOutRsts;
-};
+	stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64),
+			GFP_KERNEL);
+	if (!stats)
+		return NULL;
+	stats->names = names;
+	stats->num_counters = num_counters;
+	stats->lifespan = msecs_to_jiffies(lifespan);
 
-union rdma_protocol_stats {
-	struct ib_protocol_stats	ib;
-	struct iw_protocol_stats	iw;
-};
+	return stats;
+}
+
 
 /* Define bits for the various functionality this port needs to be supported by
  * the core.
@@ -1707,8 +1706,29 @@
 
 	struct iw_cm_verbs	     *iwcm;
 
-	int		           (*get_protocol_stats)(struct ib_device *device,
-							 union rdma_protocol_stats *stats);
+	/**
+	 * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the
+	 *   driver initialized data.  The struct is kfree()'ed by the sysfs
+	 *   core when the device is removed.  A lifespan of -1 in the return
+	 *   struct tells the core to set a default lifespan.
+	 */
+	struct rdma_hw_stats      *(*alloc_hw_stats)(struct ib_device *device,
+						     u8 port_num);
+	/**
+	 * get_hw_stats - Fill in the counter value(s) in the stats struct.
+	 * @index - The index in the value array we wish to have updated, or
+	 *   num_counters if we want all stats updated
+	 * Return codes -
+	 *   < 0 - Error, no counters updated
+	 *   index - Updated the single counter pointed to by index
+	 *   num_counters - Updated all counters (will reset the timestamp
+	 *     and prevent further calls for lifespan milliseconds)
+	 * Drivers are allowed to update all counters in lieu of just the
+	 *   one given in index at their option
+	 */
+	int		           (*get_hw_stats)(struct ib_device *device,
+						   struct rdma_hw_stats *stats,
+						   u8 port, int index);
 	int		           (*query_device)(struct ib_device *device,
 						   struct ib_device_attr *device_attr,
 						   struct ib_udata *udata);
@@ -1926,6 +1946,8 @@
 	u8                           node_type;
 	u8                           phys_port_cnt;
 	struct ib_device_attr        attrs;
+	struct attribute_group	     *hw_stats_ag;
+	struct rdma_hw_stats         *hw_stats;
 
 	/**
 	 * The following mandatory functions are used only at device
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index d57ceee..16274e2 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -149,15 +149,15 @@
 	int qpn_res_end;
 	int nports;
 	int npkeys;
-	u8 qos_shift;
 	char cq_name[RVT_CQN_MAX];
 	int node;
-	int max_rdma_atomic;
 	int psn_mask;
 	int psn_shift;
 	int psn_modify_mask;
 	u32 core_cap_flags;
 	u32 max_mad_size;
+	u8 qos_shift;
+	u8 max_rdma_atomic;
 };
 
 /* Protection domain */
@@ -426,6 +426,15 @@
 }
 
 /*
+ * Return the max atomic suitable for determining
+ * the size of the ack ring buffer in a QP.
+ */
+static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
+{
+	return rdi->dparms.max_rdma_atomic + 1;
+}
+
+/*
  * Return the indexed PKEY from the port PKEY table.
  */
 static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi,
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 0e1ff2a..6d23b87 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -211,8 +211,6 @@
 	unsigned size;
 };
 
-#define RVT_MAX_RDMA_ATOMIC	16
-
 /*
  * This structure holds the information that the send tasklet needs
  * to send a RDMA read response or atomic operation.
@@ -282,8 +280,7 @@
 	atomic_t refcount ____cacheline_aligned_in_smp;
 	wait_queue_head_t wait;
 
-	struct rvt_ack_entry s_ack_queue[RVT_MAX_RDMA_ATOMIC + 1]
-		____cacheline_aligned_in_smp;
+	struct rvt_ack_entry *s_ack_queue;
 	struct rvt_sge_state s_rdma_read_sge;
 
 	spinlock_t r_lock ____cacheline_aligned_in_smp;      /* used for APM */
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index a533cec..98bebf8 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -66,7 +66,7 @@
  * The major version changes when data structures change in an incompatible
  * way. The driver must be the same for initialization to succeed.
  */
-#define HFI1_USER_SWMAJOR 5
+#define HFI1_USER_SWMAJOR 6
 
 /*
  * Minor version differences are always compatible
@@ -75,7 +75,12 @@
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define HFI1_USER_SWMINOR 0
+#define HFI1_USER_SWMINOR 1
+
+/*
+ * We will encode the major/minor inside a single 32bit version number.
+ */
+#define HFI1_SWMAJOR_SHIFT 16
 
 /*
  * Set of HW and driver capability/feature bits.
@@ -107,19 +112,6 @@
 #define HFI1_RCVHDR_ENTSIZE_16   (1UL << 1)
 #define HFI1_RCVDHR_ENTSIZE_32   (1UL << 2)
 
-/*
- * If the unit is specified via open, HFI choice is fixed.  If port is
- * specified, it's also fixed.  Otherwise we try to spread contexts
- * across ports and HFIs, using different algorithms.  WITHIN is
- * the old default, prior to this mechanism.
- */
-#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then
-			  * ports; this is the default */
-#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin
-			  * active ports within), then next HFI */
-#define HFI1_ALG_COUNT  2 /* number of algorithm choices */
-
-
 /* User commands. */
 #define HFI1_CMD_ASSIGN_CTXT     1	/* allocate HFI and context */
 #define HFI1_CMD_CTXT_INFO       2	/* find out what resources we got */
@@ -127,7 +119,6 @@
 #define HFI1_CMD_TID_UPDATE      4	/* update expected TID entries */
 #define HFI1_CMD_TID_FREE        5	/* free expected TID entries */
 #define HFI1_CMD_CREDIT_UPD      6	/* force an update of PIO credit */
-#define HFI1_CMD_SDMA_STATUS_UPD 7      /* force update of SDMA status ring */
 
 #define HFI1_CMD_RECV_CTRL       8	/* control receipt of packets */
 #define HFI1_CMD_POLL_TYPE       9	/* set the kind of polling we want */
@@ -135,13 +126,46 @@
 #define HFI1_CMD_SET_PKEY        11     /* set context's pkey */
 #define HFI1_CMD_CTXT_RESET      12     /* reset context's HW send context */
 #define HFI1_CMD_TID_INVAL_READ  13     /* read TID cache invalidations */
-/* separate EPROM commands from normal PSM commands */
-#define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
-#define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
-/* range 66-74 no longer used */
-#define HFI1_CMD_EP_ERASE_RANGE  75      /* erase EPROM range */
-#define HFI1_CMD_EP_READ_RANGE   76      /* read EPROM range */
-#define HFI1_CMD_EP_WRITE_RANGE  77      /* write EPROM range */
+#define HFI1_CMD_GET_VERS	 14	/* get the version of the user cdev */
+
+/*
+ * User IOCTLs cannot go above 128. If they do, see common.h and change the
+ * base for the snoop ioctl.
+ */
+#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */
+
+/*
+ * Make the ioctls occupy the 0xe0-0xef portion of the IB range
+ */
+#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0)
+
+struct hfi1_cmd;
+#define HFI1_IOCTL_ASSIGN_CTXT \
+	_IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info)
+#define HFI1_IOCTL_CTXT_INFO \
+	_IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info)
+#define HFI1_IOCTL_USER_INFO \
+	_IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info)
+#define HFI1_IOCTL_TID_UPDATE \
+	_IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info)
+#define HFI1_IOCTL_TID_FREE \
+	_IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info)
+#define HFI1_IOCTL_CREDIT_UPD \
+	_IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD))
+#define HFI1_IOCTL_RECV_CTRL \
+	_IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int)
+#define HFI1_IOCTL_POLL_TYPE \
+	_IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int)
+#define HFI1_IOCTL_ACK_EVENT \
+	_IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long)
+#define HFI1_IOCTL_SET_PKEY \
+	_IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16)
+#define HFI1_IOCTL_CTXT_RESET \
+	_IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET))
+#define HFI1_IOCTL_TID_INVAL_READ \
+	_IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info)
+#define HFI1_IOCTL_GET_VERS \
+	_IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int)
 
 #define _HFI1_EVENT_FROZEN_BIT         0
 #define _HFI1_EVENT_LINKDOWN_BIT       1
@@ -199,9 +223,7 @@
 	 * Should be set to HFI1_USER_SWVERSION.
 	 */
 	__u32 userversion;
-	__u16 pad;
-	/* HFI selection algorithm, if unit has not selected */
-	__u16 hfi1_alg;
+	__u32 pad;
 	/*
 	 * If two or more processes wish to share a context, each process
 	 * must set the subcontext_cnt and subcontext_id to the same
@@ -243,12 +265,6 @@
 	__u32 length;
 };
 
-struct hfi1_cmd {
-	__u32 type;        /* command type */
-	__u32 len;         /* length of struct pointed to by add */
-	__u64 addr;        /* pointer to user structure */
-};
-
 enum hfi1_sdma_comp_state {
 	FREE = 0,
 	QUEUED,
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 6e373d1..02fe839 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -135,10 +135,12 @@
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
  *   SET_TIMEOUT - The local service requests the client to set the timeout.
+ *   IP_RESOLVE - The client requests the local service to resolve an IP to GID.
  */
 enum {
 	RDMA_NL_LS_OP_RESOLVE = 0,
 	RDMA_NL_LS_OP_SET_TIMEOUT,
+	RDMA_NL_LS_OP_IP_RESOLVE,
 	RDMA_NL_LS_NUM_OPS
 };
 
@@ -176,6 +178,10 @@
 	__u8 path_use;
 };
 
+struct rdma_ls_ip_resolve_header {
+	__u32 ifindex;
+};
+
 /* Local service attribute type */
 #define RDMA_NLA_F_MANDATORY	(1 << 13)
 #define RDMA_NLA_TYPE_MASK	(~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \
@@ -193,6 +199,8 @@
  *   TCLASS          u8
  *   PKEY            u16                        cpu
  *   QOS_CLASS       u16                        cpu
+ *   IPV4            u32                        BE
+ *   IPV6            u8[16]                     BE
  */
 enum {
 	LS_NLA_TYPE_UNSPEC = 0,
@@ -204,6 +212,8 @@
 	LS_NLA_TYPE_TCLASS,
 	LS_NLA_TYPE_PKEY,
 	LS_NLA_TYPE_QOS_CLASS,
+	LS_NLA_TYPE_IPV4,
+	LS_NLA_TYPE_IPV6,
 	LS_NLA_TYPE_MAX
 };