/*
 * SN Platform GRU Driver
 *
 * MMUOPS callbacks + TLB flushing
 *
 * This file handles mmu notifier callbacks from the core kernel. The callbacks
 * are used to update the TLB in the GRU as a result of changes in the
 * state of a process address space. This file also handles TLB invalidates
 * from the GRU driver.
 *
 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/hugetlb.h>
#include <linux/delay.h>
#include <linux/timex.h>
#include <linux/srcu.h>
#include <asm/processor.h>
#include "gru.h"
#include "grutables.h"
#include <asm/uv/uv_hub.h>

#define gru_random()	get_cycles()

/* ---------------------------------- TLB Invalidation functions --------
 * get_tgh_handle
 *
 * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
 * local blade, use a fixed TGH that is a function of the blade-local cpu
 * number. Normally, this TGH is private to the cpu & no contention occurs for
 * the TGH. For offblade GRUs, select a random TGH in the range above the
 * private TGHs. A spinlock is required to access this TGH & the lock must be
 * released when the invalidate completes. This sucks, but it is the best we
 * can do.
 *
 * Note that the spinlock is IN the TGH handle so locking does not involve
 * additional cache lines.
 *
 */
static inline int get_off_blade_tgh(struct gru_state *gru)
{
	int n;

	n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
	n = gru_random() % n;
	n += gru->gs_tgh_first_remote;
	return n;
}

static inline int get_on_blade_tgh(struct gru_state *gru)
{
	return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
}

static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
							  *gru)
{
	struct gru_tlb_global_handle *tgh;
	int n;

	preempt_disable();
	if (uv_numa_blade_id() == gru->gs_blade_id)
		n = get_on_blade_tgh(gru);
	else
		n = get_off_blade_tgh(gru);
	tgh = get_tgh_by_index(gru, n);
	lock_tgh_handle(tgh);

	return tgh;
}

static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
{
	unlock_tgh_handle(tgh);
	preempt_enable();
}

/*
 * gru_flush_tlb_range
 *
 * General purpose TLB invalidation function. This function scans every GRU in
 * the ENTIRE system (partition) looking for GRUs where the specified MM has
 * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
 * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
 * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
 * cost of (possibly) a large number of future TLBmisses.
 *
 * The current algorithm is optimized based on the following (somewhat true)
 * assumptions:
 *	- GRU contexts are not loaded into a GRU unless a reference is made to
 *	  the data segment or control block (this is true, not an assumption).
 *	  If a DS/CB is referenced, the user will also issue instructions that
 *	  cause TLBmisses. It is not necessary to optimize for the case where
 *	  contexts are loaded but no instructions cause TLB misses. (I know
 *	  this will happen but I'm not optimizing for it).
 *	- GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
 *	  a few usec but in unusual cases, it could be longer. Avoid if
 *	  possible.
 *	- intrablade process migration between cpus is not frequent but is
 *	  common enough to matter.
 *	- a GRU context is not typically migrated to a different GRU on the
 *	  blade because of intrablade migration
 *	- interblade migration is rare. Processes migrate their GRU context to
 *	  the new blade.
 *	- if interblade migration occurs, migration back to the original blade
 *	  is very very rare (ie., no optimization for this case)
 *	- most GRU instructions operate on a subset of the user REGIONS. Code
 *	  & shared library regions are not likely targets of GRU instructions.
 *
 * To help improve the efficiency of TLB invalidation, the GMS data
 * structure is maintained for EACH address space (MM struct). The GMS is
 * also the structure that contains the pointer to the mmu callout
 * functions. This structure is linked to the mm_struct for the address space
 * using the mmu "register" function. The mmu interfaces are used to
 * provide the callbacks for TLB invalidation. The GMS contains:
 *
 *	- asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
 *	  loaded into the GRU.
 *	- asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
 *	  the above array
 *	- ctxbitmap[maxgrus]. Indicates the contexts that are currently active
 *	  in the GRU for the address space. This bitmap must be passed to the
 *	  GRU to do an invalidate.
 *
 * The current algorithm for invalidating TLBs is:
 *	- scan the asidmap for GRUs where the context has been loaded, ie,
 *	  asid is non-zero.
 *	- for each gru found:
 *		- if the ctxtmap is non-zero, there are active contexts in the
 *		  GRU. TLB invalidate instructions must be issued to the GRU.
 *		- if the ctxtmap is zero, no context is active. Set the ASID to
 *		  zero to force a full TLB invalidation. This is fast but will
 *		  cause a lot of TLB misses if the context is reloaded onto the
 *		  GRU
 *
 */

void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
			 unsigned long len)
{
	struct gru_state *gru;
	struct gru_mm_tracker *asids;
	struct gru_tlb_global_handle *tgh;
	unsigned long num;
	int grupagesize, pagesize, pageshift, gid, asid;

	/* ZZZ TODO - handle huge pages */
	pageshift = PAGE_SHIFT;
	pagesize = (1UL << pageshift);
	grupagesize = GRU_PAGESIZE(pageshift);
	num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);

	STAT(flush_tlb);
	gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
		start, len, gms->ms_asidmap[0]);

	spin_lock(&gms->ms_asid_lock);
	for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
		STAT(flush_tlb_gru);
		gru = GID_TO_GRU(gid);
		asids = gms->ms_asids + gid;
		asid = asids->mt_asid;
		if (asids->mt_ctxbitmap && asid) {
			STAT(flush_tlb_gru_tgh);
			asid = GRUASID(asid, start);
			gru_dbg(grudev,
				"  FLUSH gruid %d, asid 0x%x, num %ld, cbmap 0x%x\n",
				gid, asid, num, asids->mt_ctxbitmap);
			tgh = get_lock_tgh_handle(gru);
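			/*
			 * Invalidate the range: 'num' pages of 'grupagesize'
			 * starting at 'start', qualified by 'asid' and
			 * limited to the contexts in mt_ctxbitmap.
			 */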
			tgh_invalidate(tgh, start, ~0, asid, grupagesize, 0,
				       num - 1, asids->mt_ctxbitmap);
			get_unlock_tgh_handle(tgh);
		} else {
			STAT(flush_tlb_gru_zero_asid);
			asids->mt_asid = 0;
			__clear_bit(gru->gs_gid, gms->ms_asidmap);
			gru_dbg(grudev,
				"  CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
				gid, asid, asids->mt_ctxbitmap,
				gms->ms_asidmap[0]);
		}
	}
	spin_unlock(&gms->ms_asid_lock);
}

/*
 * Flush the entire TLB on a chiplet.
 */
void gru_flush_all_tlb(struct gru_state *gru)
{
	struct gru_tlb_global_handle *tgh;

	gru_dbg(grudev, "gid %d\n", gru->gs_gid);
	tgh = get_lock_tgh_handle(gru);
	tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0xffff);
	get_unlock_tgh_handle(tgh);
}

/*
 * MMUOPS notifier callout functions
 */
static void gru_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start, unsigned long end)
{
	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
						 ms_notifier);

	STAT(mmu_invalidate_range);
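	/*
	 * Mark an invalidation as in progress. gru_invalidate_range_end()
	 * decrements the count and wakes any waiters on ms_wait_queue.
	 */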
	atomic_inc(&gms->ms_range_active);
	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
		start, end, atomic_read(&gms->ms_range_active));
	gru_flush_tlb_range(gms, start, end - start);
}

static void gru_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm, unsigned long start,
				     unsigned long end)
{
	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
						 ms_notifier);

	/* ..._and_test() provides needed barrier */
	(void)atomic_dec_and_test(&gms->ms_range_active);

	wake_up_all(&gms->ms_wait_queue);
	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end);
}

static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
				unsigned long address)
{
	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
						 ms_notifier);

	STAT(mmu_invalidate_page);
	gru_flush_tlb_range(gms, address, PAGE_SIZE);
	gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address);
}

static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
						 ms_notifier);

	gms->ms_released = 1;
	gru_dbg(grudev, "gms %p\n", gms);
}


static const struct mmu_notifier_ops gru_mmuops = {
	.invalidate_page	= gru_invalidate_page,
	.invalidate_range_start	= gru_invalidate_range_start,
	.invalidate_range_end	= gru_invalidate_range_end,
	.release		= gru_release,
};

/* Move this to the basic mmu_notifier file. But for now... */
static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm,
					 const struct mmu_notifier_ops *ops)
{
	struct mmu_notifier *mn, *gru_mn = NULL;
	struct hlist_node *n;

	if (mm->mmu_notifier_mm) {
		rcu_read_lock();
		hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list,
					 hlist)
			if (mn->ops == ops) {
				gru_mn = mn;
				break;
			}
		rcu_read_unlock();
	}
	return gru_mn;
}

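/*
 * Register a GRU mmu notifier for the current mm. If a GMS already exists
 * for this address space (another GRU context shares the mm), reuse it and
 * bump its reference count; otherwise allocate and register a new one.
 */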
struct gru_mm_struct *gru_register_mmu_notifier(void)
{
	struct gru_mm_struct *gms;
	struct mmu_notifier *mn;

	mn = mmu_find_ops(current->mm, &gru_mmuops);
	if (mn) {
		gms = container_of(mn, struct gru_mm_struct, ms_notifier);
		atomic_inc(&gms->ms_refcnt);
	} else {
		gms = kzalloc(sizeof(*gms), GFP_KERNEL);
		if (gms) {
			spin_lock_init(&gms->ms_asid_lock);
			gms->ms_notifier.ops = &gru_mmuops;
			atomic_set(&gms->ms_refcnt, 1);
			init_waitqueue_head(&gms->ms_wait_queue);
			__mmu_notifier_register(&gms->ms_notifier, current->mm);
		}
	}
	gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
		atomic_read(&gms->ms_refcnt));
	return gms;
}

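/*
 * Drop a reference on the GMS. When the last reference is dropped, the
 * notifier is unregistered unless the mm has already been torn down
 * (gru_release() set ms_released), and the GMS is freed.
 */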
void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
{
	gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms,
		atomic_read(&gms->ms_refcnt), gms->ms_released);
	if (atomic_dec_return(&gms->ms_refcnt) == 0) {
		if (!gms->ms_released)
			mmu_notifier_unregister(&gms->ms_notifier, current->mm);
		kfree(gms);
	}
}

/*
 * Set up TGH parameters. There are:
 *	- 24 TGH handles per GRU chiplet
 *	- a portion (MAX_LOCAL_TGH) of the handles are reserved for
 *	  use by blade-local cpus
 *	- the rest are used by off-blade cpus. This usage is
 *	  less frequent than blade-local usage.
 *
 * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
 * has 16 or fewer cpus, each cpu has a unique handle that it can
 * use.
 */
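/*
 * Example (illustrative): a blade with 32 possible cpus gets
 * gs_tgh_local_shift = 1, so blade-local cpus 0 & 1 share TGH 0, cpus 2 & 3
 * share TGH 1, etc., and gs_tgh_first_remote = 16, leaving TGHs 16-23 for
 * off-blade flushes.
 */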
#define MAX_LOCAL_TGH	16

void gru_tgh_flush_init(struct gru_state *gru)
{
	int cpus, shift = 0, n;

	cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);

	/* n = cpus rounded up to next power of 2 */
	if (cpus) {
		n = 1 << fls(cpus - 1);

		/*
		 * shift count for converting local cpu# to TGH index
		 *	0 if cpus <= MAX_LOCAL_TGH,
		 *	1 if cpus <= 2*MAX_LOCAL_TGH,
		 *	etc
		 */
		shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
	}
	gru->gs_tgh_local_shift = shift;

	/* first starting TGH index to use for remote purges */
	gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;

}