[PATCH] IB/mthca: add fast memory region implementation

Implement fast memory regions (FMRs), where the driver writes directly
into the HCA's translation tables rather than going through a firmware
command.  For Tavor, the MTTs used for FMRs are separate from regular
MTTs and are reserved at driver initialization; this limits the amount
of virtual memory needed to map them.  Arbel has no such limitation,
and all MTTs and MPTs may be used either for FMRs or for regular MRs.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <roland@topspin.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
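
For context, consumers drive this code through the generic FMR verbs
rather than calling the mthca entry points directly.  The sketch below
is illustrative only; it assumes the ib_alloc_fmr/ib_map_phys_fmr/
ib_unmap_fmr/ib_dealloc_fmr interface and the struct ib_fmr_attr
layout from ib_verbs.h of the same kernel generation, and elides all
error handling:

	struct ib_fmr_attr attr = {
		.max_pages = 64,	/* pages per mapping */
		.max_maps  = 32,	/* remaps allowed before unmapping */
		.page_size = 12,	/* log2 of page size: 4KB pages */
	};
	u64 page_list[64];		/* physical addresses, page aligned */
	LIST_HEAD(fmr_list);
	struct ib_fmr *fmr;

	fmr = ib_alloc_fmr(pd, IB_ACCESS_LOCAL_WRITE, &attr);
	/* ... fill page_list ... */
	ib_map_phys_fmr(fmr, page_list, 64, iova);	/* fast remap path */
	/* ... post work requests using fmr->lkey / fmr->rkey ... */
	list_add_tail(&fmr->list, &fmr_list);
	ib_unmap_fmr(&fmr_list);	/* invalidate the mapping */
	ib_dealloc_fmr(fmr);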
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index ac3265d..a85b503 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -66,6 +66,9 @@
#define MTHCA_MTT_FLAG_PRESENT 1
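+/*
+ * MPT ownership lives in the status byte: 0xF0 marks the entry as
+ * software-owned (invalid to the HCA) while it is being modified,
+ * and 0x00 hands it back to the hardware.
+ */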
+#define MTHCA_MPT_STATUS_SW 0xF0
+#define MTHCA_MPT_STATUS_HW 0x00
+
/*
* Buddy allocator for MTT segments (currently not very efficient
* since it doesn't keep a free list and just searches linearly
@@ -442,6 +445,20 @@
return err;
}
+/* Free an MR or an FMR */
+static void mthca_free_region(struct mthca_dev *dev, u32 lkey, int order,
+ u32 first_seg, struct mthca_buddy *buddy)
+{
+ if (order >= 0)
+ mthca_free_mtt(dev, first_seg, order, buddy);
+
+ if (dev->hca_type == ARBEL_NATIVE)
+ mthca_table_put(dev, dev->mr_table.mpt_table,
+ arbel_key_to_hw_index(lkey));
+
+ mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
+}
+
void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
{
int err;
@@ -459,18 +476,288 @@
mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
status);
- if (mr->order >= 0)
- mthca_free_mtt(dev, mr->first_seg, mr->order, &dev->mr_table.mtt_buddy);
+ mthca_free_region(dev, mr->ibmr.lkey, mr->order, mr->first_seg,
+ &dev->mr_table.mtt_buddy);
+}
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_fmr *mr)
+{
+ struct mthca_mpt_entry *mpt_entry;
+ void *mailbox;
+ u64 mtt_seg;
+ u32 key, idx;
+ u8 status;
+ int list_len = mr->attr.max_pages;
+ int err = -ENOMEM;
+ int i;
+
+ might_sleep();
+
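+ /* attr.page_size is a shift: pages of 2^12 (4KB) up to 2^31 bytes */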
+ if (mr->attr.page_size < 12 || mr->attr.page_size >= 32)
+ return -EINVAL;
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ if (dev->hca_type == ARBEL_NATIVE &&
+ mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
+ return -EINVAL;
+
+ mr->maps = 0;
+
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+
+ idx = key & (dev->limits.num_mpts - 1);
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+ if (err)
+ goto err_out_mpt_free;
+
+ mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key);
+ BUG_ON(!mr->mem.arbel.mpt);
+ } else
+ mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
+ sizeof *(mr->mem.tavor.mpt) * idx;
+
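+ /*
+ * Each MTT segment holds MTHCA_MTT_SEG_SIZE / 8 entries; find the
+ * power-of-two order that covers list_len entries.
+ */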
+ for (i = MTHCA_MTT_SEG_SIZE / 8, mr->order = 0;
+ i < list_len;
+ i <<= 1, ++mr->order)
+ ; /* nothing */
+
+ mr->first_seg = mthca_alloc_mtt(dev, mr->order,
+ dev->mr_table.fmr_mtt_buddy);
+ if (mr->first_seg == -1) {
+ err = -ENOMEM;
+ goto err_out_table;
+ }
+
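+ /* byte offset of our first segment within the MTT table */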
+ mtt_seg = mr->first_seg * MTHCA_MTT_SEG_SIZE;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
+ mr->first_seg);
+ BUG_ON(!mr->mem.arbel.mtts);
+ } else
+ mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
+
+ mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox) {
+ err = -ENOMEM;
+ goto err_out_free_mtt;
+ }
+
+ mpt_entry = MAILBOX_ALIGN(mailbox);
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+
+ mpt_entry->page_size = cpu_to_be32(mr->attr.page_size - 12);
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ memset(&mpt_entry->start, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
+ mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+ for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_SW2HW_MPT(dev, mpt_entry,
+ key & (dev->limits.num_mpts - 1),
+ &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ goto err_out_mailbox_free;
+ }
+ if (status) {
+ mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_mailbox_free;
+ }
+
+ kfree(mailbox);
+ return 0;
+
+err_out_mailbox_free:
+ kfree(mailbox);
+
+err_out_free_mtt:
+ mthca_free_mtt(dev, mr->first_seg, mr->order,
+ dev->mr_table.fmr_mtt_buddy);
+
+err_out_table:
if (dev->hca_type == ARBEL_NATIVE)
- mthca_table_put(dev, dev->mr_table.mpt_table,
- key_to_hw_index(dev, mr->ibmr.lkey));
- mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
+ mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+ mthca_free(&dev->mr_table.mpt_alloc, key);
+ return err;
+}
+
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (fmr->maps)
+ return -EBUSY;
+
+ mthca_free_region(dev, fmr->ibmr.lkey, fmr->order, fmr->first_seg,
+ dev->mr_table.fmr_mtt_buddy);
+ return 0;
+}
+
+static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ int i, page_mask;
+
+ if (list_len > fmr->attr.max_pages)
+ return -EINVAL;
+
+ page_mask = (1 << fmr->attr.page_size) - 1;
+
+ /* We are getting page lists, so the iova must be page aligned. */
+ if (iova & page_mask)
+ return -EINVAL;
+
+ /* Trust the user not to pass misaligned data in page_list */
+ if (0)
+ for (i = 0; i < list_len; ++i) {
+ if (page_list[i] & page_mask)
+ return -EINVAL;
+ }
+
+ if (fmr->maps >= fmr->attr.max_maps)
+ return -EINVAL;
+
+ return 0;
+}
+
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct mthca_fmr *fmr = to_mfmr(ibfmr);
+ struct mthca_dev *dev = to_mdev(ibfmr->device);
+ struct mthca_mpt_entry mpt_entry;
+ u32 key;
+ int i, err;
+
+ err = mthca_check_fmr(fmr, page_list, list_len, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
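+ /*
+ * Bump the key past num_mpts: the MPT index in the low bits stays
+ * the same, but the key's tag changes, so lkeys/rkeys handed out
+ * by a previous mapping of this FMR stop matching.
+ */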
+ key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+ key += dev->limits.num_mpts;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
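+ /*
+ * Take the MPT into software ownership, rewrite the MTTs and the
+ * changed MPT fields through the ioremapped window, then hand the
+ * entry back to the HCA.
+ */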
+ writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+
+ for (i = 0; i < list_len; ++i) {
+ __be64 mtt_entry = cpu_to_be64(page_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+ mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
+ }
+
+ mpt_entry.lkey = cpu_to_be32(key);
+ mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_size));
+ mpt_entry.start = cpu_to_be64(iova);
+
+ writel(mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
+ memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start,
+ offsetof(struct mthca_mpt_entry, window_count) -
+ offsetof(struct mthca_mpt_entry, start));
+
+ writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
+
+ return 0;
+}
+
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct mthca_fmr *fmr = to_mfmr(ibfmr);
+ struct mthca_dev *dev = to_mdev(ibfmr->device);
+ u32 key;
+ int i, err;
+
+ err = mthca_check_fmr(fmr, page_list, list_len, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
+ key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+ key += dev->limits.num_mpts;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
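+ /*
+ * Mark the MPT software-owned before modifying it; the wmb()s
+ * order the status-byte updates against the MTT and MPT writes so
+ * the HCA never uses a half-updated entry.
+ */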
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+
+ wmb();
+
+ for (i = 0; i < list_len; ++i)
+ fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+
+ fmr->mem.arbel.mpt->key = cpu_to_be32(key);
+ fmr->mem.arbel.mpt->lkey = cpu_to_be32(key);
+ fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_size));
+ fmr->mem.arbel.mpt->start = cpu_to_be64(iova);
+
+ wmb();
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW;
+
+ wmb();
+
+ return 0;
+}
+
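+/*
+ * Unmapping (on both the Tavor and Arbel paths) just resets the key
+ * to its base value and flips the MPT back to software ownership;
+ * the entry is fully rewritten on the next map.
+ */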
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ u32 key;
+
+ if (!fmr->maps)
+ return;
+
+ key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+ key &= dev->limits.num_mpts - 1;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+ fmr->maps = 0;
+
+ writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+}
+
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ u32 key;
+
+ if (!fmr->maps)
+ return;
+
+ key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+ key &= dev->limits.num_mpts - 1;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
+ fmr->maps = 0;
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
}

int __devinit mthca_init_mr_table(struct mthca_dev *dev)
{
- int err;
+ int err, i;
err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
dev->limits.num_mpts,
@@ -478,23 +765,93 @@
if (err)
return err;
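+
+ /*
+ * Tavor FMRs need the driver to write the MPT and MTT tables in the
+ * HCA's attached memory directly, which is impossible when the DDR
+ * is hidden from the host, so leave FMR support off in that case.
+ */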
+ if (dev->hca_type != ARBEL_NATIVE &&
+ (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN))
+ dev->limits.fmr_reserved_mtts = 0;
+ else
+ dev->mthca_flags |= MTHCA_FLAG_FMR;
+
err = mthca_buddy_init(&dev->mr_table.mtt_buddy,
fls(dev->limits.num_mtt_segs - 1));
if (err)
goto err_mtt_buddy;
- if (dev->limits.reserved_mtts) {
- if (mthca_alloc_mtt(dev, fls(dev->limits.reserved_mtts - 1),
- &dev->mr_table.mtt_buddy) == -1) {
- mthca_warn(dev, "MTT table of order %d is too small.\n",
- dev->mr_table.mtt_buddy.max_order);
+ dev->mr_table.tavor_fmr.mpt_base = NULL;
+ dev->mr_table.tavor_fmr.mtt_base = NULL;
+
+ if (dev->limits.fmr_reserved_mtts) {
+ i = fls(dev->limits.fmr_reserved_mtts - 1);
+
+ if (i >= 31) {
+ mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n");
+ err = -EINVAL;
+ goto err_fmr_mpt;
+ }
+
+ dev->mr_table.tavor_fmr.mpt_base =
+ ioremap(dev->mr_table.mpt_base,
+ (1 << i) * sizeof (struct mthca_mpt_entry));
+
+ if (!dev->mr_table.tavor_fmr.mpt_base) {
+ mthca_warn(dev, "MPT ioremap for FMR failed.\n");
err = -ENOMEM;
- goto err_mtt_buddy;
+ goto err_fmr_mpt;
+ }
+
+ dev->mr_table.tavor_fmr.mtt_base =
+ ioremap(dev->mr_table.mtt_base,
+ (1 << i) * MTHCA_MTT_SEG_SIZE);
+ if (!dev->mr_table.tavor_fmr.mtt_base) {
+ mthca_warn(dev, "MTT ioremap for FMR failed.\n");
+ err = -ENOMEM;
+ goto err_fmr_mtt;
+ }
+
+ err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, i);
+ if (err)
+ goto err_fmr_mtt_buddy;
+
+ /* Prevent regular MRs from using MTT segments reserved for FMRs */
+ err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, i);
+ if (err)
+ goto err_reserve_fmr;
+
+ dev->mr_table.fmr_mtt_buddy =
+ &dev->mr_table.tavor_fmr.mtt_buddy;
+ } else
+ dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy;
+
+ /*
+ * The FMR region sits at the start of the MTT table, so take the
+ * firmware-reserved MTTs out of whichever buddy covers it.
+ */
+ if (dev->limits.reserved_mtts) {
+ i = fls(dev->limits.reserved_mtts - 1);
+
+ if (mthca_alloc_mtt(dev, i, dev->mr_table.fmr_mtt_buddy) == -1) {
+ mthca_warn(dev, "MTT table of order %d is too small.\n",
+ dev->mr_table.fmr_mtt_buddy->max_order);
+ err = -ENOMEM;
+ goto err_reserve_mtts;
}
}
return 0;
+err_reserve_mtts:
+err_reserve_fmr:
+ if (dev->limits.fmr_reserved_mtts)
+ mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+err_fmr_mtt_buddy:
+ if (dev->mr_table.tavor_fmr.mtt_base)
+ iounmap(dev->mr_table.tavor_fmr.mtt_base);
+
+err_fmr_mtt:
+ if (dev->mr_table.tavor_fmr.mpt_base)
+ iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+err_fmr_mpt:
+ mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
err_mtt_buddy:
mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
@@ -504,6 +861,15 @@
void __devexit mthca_cleanup_mr_table(struct mthca_dev *dev)
{
/* XXX check if any MRs are still allocated? */
+ if (dev->limits.fmr_reserved_mtts)
+ mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+ if (dev->mr_table.tavor_fmr.mtt_base)
+ iounmap(dev->mr_table.tavor_fmr.mtt_base);
+ if (dev->mr_table.tavor_fmr.mpt_base)
+ iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
}