| /* |
| * Copyright (C) 2007 Oracle. All rights reserved. |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public |
| * License v2 as published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public |
| * License along with this program; if not, write to the |
| * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
| * Boston, MA 021110-1307, USA. |
| */ |
| |
| #include <linux/kthread.h> |
| #include <linux/slab.h> |
| #include <linux/list.h> |
| #include <linux/spinlock.h> |
| #include <linux/freezer.h> |
| #include "async-thread.h" |
| |
| #define WORK_QUEUED_BIT 0 |
| #define WORK_DONE_BIT 1 |
| #define WORK_ORDER_DONE_BIT 2 |
| #define WORK_HIGH_PRIO_BIT 3 |
| |
| /* |
| * container for the kthread task pointer and the list of pending work |
| * One of these is allocated per thread. |
| */ |
| struct btrfs_worker_thread { |
| /* pool we belong to */ |
| struct btrfs_workers *workers; |
| |
| /* list of struct btrfs_work that are waiting for service */ |
| struct list_head pending; |
| struct list_head prio_pending; |
| |
| /* list of worker threads from struct btrfs_workers */ |
| struct list_head worker_list; |
| |
| /* kthread */ |
| struct task_struct *task; |
| |
| /* number of things on the pending list */ |
| atomic_t num_pending; |
| |
| /* reference counter for this struct */ |
| atomic_t refs; |
| |
| unsigned long sequence; |
| |
| /* protects the pending list. */ |
| spinlock_t lock; |
| |
| /* set to non-zero when this thread is already awake and kicking */ |
| int working; |
| |
| /* are we currently idle */ |
| int idle; |
| }; |
| |
| static int __btrfs_start_workers(struct btrfs_workers *workers); |
| |
| /* |
| * btrfs_start_workers uses kthread_run, which can block waiting for memory |
| * for a very long time. It will actually throttle on page writeback, |
| * and so it may not make progress until after our btrfs worker threads |
| * process all of the pending work structs in their queue |
| * |
| * This means we can't use btrfs_start_workers from inside a btrfs worker |
| * thread that is used as part of cleaning dirty memory, which pretty much |
| * involves all of the worker threads. |
| * |
| * Instead we have a helper queue who never has more than one thread |
| * where we scheduler thread start operations. This worker_start struct |
| * is used to contain the work and hold a pointer to the queue that needs |
| * another worker. |
| */ |
| struct worker_start { |
| struct btrfs_work work; |
| struct btrfs_workers *queue; |
| }; |
| |
| static void start_new_worker_func(struct btrfs_work *work) |
| { |
| struct worker_start *start; |
| start = container_of(work, struct worker_start, work); |
| __btrfs_start_workers(start->queue); |
| kfree(start); |
| } |
| |
| /* |
| * helper function to move a thread onto the idle list after it |
| * has finished some requests. |
| */ |
| static void check_idle_worker(struct btrfs_worker_thread *worker) |
| { |
| if (!worker->idle && atomic_read(&worker->num_pending) < |
| worker->workers->idle_thresh / 2) { |
| unsigned long flags; |
| spin_lock_irqsave(&worker->workers->lock, flags); |
| worker->idle = 1; |
| |
| /* the list may be empty if the worker is just starting */ |
| if (!list_empty(&worker->worker_list)) { |
| list_move(&worker->worker_list, |
| &worker->workers->idle_list); |
| } |
| spin_unlock_irqrestore(&worker->workers->lock, flags); |
| } |
| } |
| |
| /* |
| * helper function to move a thread off the idle list after new |
| * pending work is added. |
| */ |
| static void check_busy_worker(struct btrfs_worker_thread *worker) |
| { |
| if (worker->idle && atomic_read(&worker->num_pending) >= |
| worker->workers->idle_thresh) { |
| unsigned long flags; |
| spin_lock_irqsave(&worker->workers->lock, flags); |
| worker->idle = 0; |
| |
| if (!list_empty(&worker->worker_list)) { |
| list_move_tail(&worker->worker_list, |
| &worker->workers->worker_list); |
| } |
| spin_unlock_irqrestore(&worker->workers->lock, flags); |
| } |
| } |
| |
| static void check_pending_worker_creates(struct btrfs_worker_thread *worker) |
| { |
| struct btrfs_workers *workers = worker->workers; |
| struct worker_start *start; |
| unsigned long flags; |
| |
| rmb(); |
| if (!workers->atomic_start_pending) |
| return; |
| |
| start = kzalloc(sizeof(*start), GFP_NOFS); |
| if (!start) |
| return; |
| |
| start->work.func = start_new_worker_func; |
| start->queue = workers; |
| |
| spin_lock_irqsave(&workers->lock, flags); |
| if (!workers->atomic_start_pending) |
| goto out; |
| |
| workers->atomic_start_pending = 0; |
| if (workers->num_workers + workers->num_workers_starting >= |
| workers->max_workers) |
| goto out; |
| |
| workers->num_workers_starting += 1; |
| spin_unlock_irqrestore(&workers->lock, flags); |
| btrfs_queue_worker(workers->atomic_worker_start, &start->work); |
| return; |
| |
| out: |
| kfree(start); |
| spin_unlock_irqrestore(&workers->lock, flags); |
| } |
| |
| static noinline int run_ordered_completions(struct btrfs_workers *workers, |
| struct btrfs_work *work) |
| { |
| if (!workers->ordered) |
| return 0; |
| |
| set_bit(WORK_DONE_BIT, &work->flags); |
| |
| spin_lock(&workers->order_lock); |
| |
| while (1) { |
| if (!list_empty(&workers->prio_order_list)) { |
| work = list_entry(workers->prio_order_list.next, |
| struct btrfs_work, order_list); |
| } else if (!list_empty(&workers->order_list)) { |
| work = list_entry(workers->order_list.next, |
| struct btrfs_work, order_list); |
| } else { |
| break; |
| } |
| if (!test_bit(WORK_DONE_BIT, &work->flags)) |
| break; |
| |
| /* we are going to call the ordered done function, but |
| * we leave the work item on the list as a barrier so |
| * that later work items that are done don't have their |
| * functions called before this one returns |
| */ |
| if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) |
| break; |
| |
| spin_unlock(&workers->order_lock); |
| |
| work->ordered_func(work); |
| |
| /* now take the lock again and call the freeing code */ |
| spin_lock(&workers->order_lock); |
| list_del(&work->order_list); |
| work->ordered_free(work); |
| } |
| |
| spin_unlock(&workers->order_lock); |
| return 0; |
| } |
| |
| static void put_worker(struct btrfs_worker_thread *worker) |
| { |
| if (atomic_dec_and_test(&worker->refs)) |
| kfree(worker); |
| } |
| |
| static int try_worker_shutdown(struct btrfs_worker_thread *worker) |
| { |
| int freeit = 0; |
| |
| spin_lock_irq(&worker->lock); |
| spin_lock(&worker->workers->lock); |
| if (worker->workers->num_workers > 1 && |
| worker->idle && |
| !worker->working && |
| !list_empty(&worker->worker_list) && |
| list_empty(&worker->prio_pending) && |
| list_empty(&worker->pending) && |
| atomic_read(&worker->num_pending) == 0) { |
| freeit = 1; |
| list_del_init(&worker->worker_list); |
| worker->workers->num_workers--; |
| } |
| spin_unlock(&worker->workers->lock); |
| spin_unlock_irq(&worker->lock); |
| |
| if (freeit) |
| put_worker(worker); |
| return freeit; |
| } |
| |
| static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, |
| struct list_head *prio_head, |
| struct list_head *head) |
| { |
| struct btrfs_work *work = NULL; |
| struct list_head *cur = NULL; |
| |
| if(!list_empty(prio_head)) |
| cur = prio_head->next; |
| |
| smp_mb(); |
| if (!list_empty(&worker->prio_pending)) |
| goto refill; |
| |
| if (!list_empty(head)) |
| cur = head->next; |
| |
| if (cur) |
| goto out; |
| |
| refill: |
| spin_lock_irq(&worker->lock); |
| list_splice_tail_init(&worker->prio_pending, prio_head); |
| list_splice_tail_init(&worker->pending, head); |
| |
| if (!list_empty(prio_head)) |
| cur = prio_head->next; |
| else if (!list_empty(head)) |
| cur = head->next; |
| spin_unlock_irq(&worker->lock); |
| |
| if (!cur) |
| goto out_fail; |
| |
| out: |
| work = list_entry(cur, struct btrfs_work, list); |
| |
| out_fail: |
| return work; |
| } |
| |
| /* |
| * main loop for servicing work items |
| */ |
| static int worker_loop(void *arg) |
| { |
| struct btrfs_worker_thread *worker = arg; |
| struct list_head head; |
| struct list_head prio_head; |
| struct btrfs_work *work; |
| |
| INIT_LIST_HEAD(&head); |
| INIT_LIST_HEAD(&prio_head); |
| |
| do { |
| again: |
| while (1) { |
| |
| |
| work = get_next_work(worker, &prio_head, &head); |
| if (!work) |
| break; |
| |
| list_del(&work->list); |
| clear_bit(WORK_QUEUED_BIT, &work->flags); |
| |
| work->worker = worker; |
| |
| work->func(work); |
| |
| atomic_dec(&worker->num_pending); |
| /* |
| * unless this is an ordered work queue, |
| * 'work' was probably freed by func above. |
| */ |
| run_ordered_completions(worker->workers, work); |
| |
| check_pending_worker_creates(worker); |
| cond_resched(); |
| } |
| |
| spin_lock_irq(&worker->lock); |
| check_idle_worker(worker); |
| |
| if (freezing(current)) { |
| worker->working = 0; |
| spin_unlock_irq(&worker->lock); |
| try_to_freeze(); |
| } else { |
| spin_unlock_irq(&worker->lock); |
| if (!kthread_should_stop()) { |
| cpu_relax(); |
| /* |
| * we've dropped the lock, did someone else |
| * jump_in? |
| */ |
| smp_mb(); |
| if (!list_empty(&worker->pending) || |
| !list_empty(&worker->prio_pending)) |
| continue; |
| |
| /* |
| * this short schedule allows more work to |
| * come in without the queue functions |
| * needing to go through wake_up_process() |
| * |
| * worker->working is still 1, so nobody |
| * is going to try and wake us up |
| */ |
| schedule_timeout(1); |
| smp_mb(); |
| if (!list_empty(&worker->pending) || |
| !list_empty(&worker->prio_pending)) |
| continue; |
| |
| if (kthread_should_stop()) |
| break; |
| |
| /* still no more work?, sleep for real */ |
| spin_lock_irq(&worker->lock); |
| set_current_state(TASK_INTERRUPTIBLE); |
| if (!list_empty(&worker->pending) || |
| !list_empty(&worker->prio_pending)) { |
| spin_unlock_irq(&worker->lock); |
| set_current_state(TASK_RUNNING); |
| goto again; |
| } |
| |
| /* |
| * this makes sure we get a wakeup when someone |
| * adds something new to the queue |
| */ |
| worker->working = 0; |
| spin_unlock_irq(&worker->lock); |
| |
| if (!kthread_should_stop()) { |
| schedule_timeout(HZ * 120); |
| if (!worker->working && |
| try_worker_shutdown(worker)) { |
| return 0; |
| } |
| } |
| } |
| __set_current_state(TASK_RUNNING); |
| } |
| } while (!kthread_should_stop()); |
| return 0; |
| } |
| |
| /* |
| * this will wait for all the worker threads to shutdown |
| */ |
| int btrfs_stop_workers(struct btrfs_workers *workers) |
| { |
| struct list_head *cur; |
| struct btrfs_worker_thread *worker; |
| int can_stop; |
| |
| spin_lock_irq(&workers->lock); |
| list_splice_init(&workers->idle_list, &workers->worker_list); |
| while (!list_empty(&workers->worker_list)) { |
| cur = workers->worker_list.next; |
| worker = list_entry(cur, struct btrfs_worker_thread, |
| worker_list); |
| |
| atomic_inc(&worker->refs); |
| workers->num_workers -= 1; |
| if (!list_empty(&worker->worker_list)) { |
| list_del_init(&worker->worker_list); |
| put_worker(worker); |
| can_stop = 1; |
| } else |
| can_stop = 0; |
| spin_unlock_irq(&workers->lock); |
| if (can_stop) |
| kthread_stop(worker->task); |
| spin_lock_irq(&workers->lock); |
| put_worker(worker); |
| } |
| spin_unlock_irq(&workers->lock); |
| return 0; |
| } |
| |
| /* |
| * simple init on struct btrfs_workers |
| */ |
| void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, |
| struct btrfs_workers *async_helper) |
| { |
| workers->num_workers = 0; |
| workers->num_workers_starting = 0; |
| INIT_LIST_HEAD(&workers->worker_list); |
| INIT_LIST_HEAD(&workers->idle_list); |
| INIT_LIST_HEAD(&workers->order_list); |
| INIT_LIST_HEAD(&workers->prio_order_list); |
| spin_lock_init(&workers->lock); |
| spin_lock_init(&workers->order_lock); |
| workers->max_workers = max; |
| workers->idle_thresh = 32; |
| workers->name = name; |
| workers->ordered = 0; |
| workers->atomic_start_pending = 0; |
| workers->atomic_worker_start = async_helper; |
| } |
| |
| /* |
| * starts new worker threads. This does not enforce the max worker |
| * count in case you need to temporarily go past it. |
| */ |
| static int __btrfs_start_workers(struct btrfs_workers *workers) |
| { |
| struct btrfs_worker_thread *worker; |
| int ret = 0; |
| |
| worker = kzalloc(sizeof(*worker), GFP_NOFS); |
| if (!worker) { |
| ret = -ENOMEM; |
| goto fail; |
| } |
| |
| INIT_LIST_HEAD(&worker->pending); |
| INIT_LIST_HEAD(&worker->prio_pending); |
| INIT_LIST_HEAD(&worker->worker_list); |
| spin_lock_init(&worker->lock); |
| |
| atomic_set(&worker->num_pending, 0); |
| atomic_set(&worker->refs, 1); |
| worker->workers = workers; |
| worker->task = kthread_run(worker_loop, worker, |
| "btrfs-%s-%d", workers->name, |
| workers->num_workers + 1); |
| if (IS_ERR(worker->task)) { |
| ret = PTR_ERR(worker->task); |
| kfree(worker); |
| goto fail; |
| } |
| spin_lock_irq(&workers->lock); |
| list_add_tail(&worker->worker_list, &workers->idle_list); |
| worker->idle = 1; |
| workers->num_workers++; |
| workers->num_workers_starting--; |
| WARN_ON(workers->num_workers_starting < 0); |
| spin_unlock_irq(&workers->lock); |
| |
| return 0; |
| fail: |
| spin_lock_irq(&workers->lock); |
| workers->num_workers_starting--; |
| spin_unlock_irq(&workers->lock); |
| return ret; |
| } |
| |
| int btrfs_start_workers(struct btrfs_workers *workers) |
| { |
| spin_lock_irq(&workers->lock); |
| workers->num_workers_starting++; |
| spin_unlock_irq(&workers->lock); |
| return __btrfs_start_workers(workers); |
| } |
| |
| /* |
| * run through the list and find a worker thread that doesn't have a lot |
| * to do right now. This can return null if we aren't yet at the thread |
| * count limit and all of the threads are busy. |
| */ |
| static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) |
| { |
| struct btrfs_worker_thread *worker; |
| struct list_head *next; |
| int enforce_min; |
| |
| enforce_min = (workers->num_workers + workers->num_workers_starting) < |
| workers->max_workers; |
| |
| /* |
| * if we find an idle thread, don't move it to the end of the |
| * idle list. This improves the chance that the next submission |
| * will reuse the same thread, and maybe catch it while it is still |
| * working |
| */ |
| if (!list_empty(&workers->idle_list)) { |
| next = workers->idle_list.next; |
| worker = list_entry(next, struct btrfs_worker_thread, |
| worker_list); |
| return worker; |
| } |
| if (enforce_min || list_empty(&workers->worker_list)) |
| return NULL; |
| |
| /* |
| * if we pick a busy task, move the task to the end of the list. |
| * hopefully this will keep things somewhat evenly balanced. |
| * Do the move in batches based on the sequence number. This groups |
| * requests submitted at roughly the same time onto the same worker. |
| */ |
| next = workers->worker_list.next; |
| worker = list_entry(next, struct btrfs_worker_thread, worker_list); |
| worker->sequence++; |
| |
| if (worker->sequence % workers->idle_thresh == 0) |
| list_move_tail(next, &workers->worker_list); |
| return worker; |
| } |
| |
| /* |
| * selects a worker thread to take the next job. This will either find |
| * an idle worker, start a new worker up to the max count, or just return |
| * one of the existing busy workers. |
| */ |
| static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) |
| { |
| struct btrfs_worker_thread *worker; |
| unsigned long flags; |
| struct list_head *fallback; |
| int ret; |
| |
| spin_lock_irqsave(&workers->lock, flags); |
| again: |
| worker = next_worker(workers); |
| |
| if (!worker) { |
| if (workers->num_workers + workers->num_workers_starting >= |
| workers->max_workers) { |
| goto fallback; |
| } else if (workers->atomic_worker_start) { |
| workers->atomic_start_pending = 1; |
| goto fallback; |
| } else { |
| workers->num_workers_starting++; |
| spin_unlock_irqrestore(&workers->lock, flags); |
| /* we're below the limit, start another worker */ |
| ret = __btrfs_start_workers(workers); |
| spin_lock_irqsave(&workers->lock, flags); |
| if (ret) |
| goto fallback; |
| goto again; |
| } |
| } |
| goto found; |
| |
| fallback: |
| fallback = NULL; |
| /* |
| * we have failed to find any workers, just |
| * return the first one we can find. |
| */ |
| if (!list_empty(&workers->worker_list)) |
| fallback = workers->worker_list.next; |
| if (!list_empty(&workers->idle_list)) |
| fallback = workers->idle_list.next; |
| BUG_ON(!fallback); |
| worker = list_entry(fallback, |
| struct btrfs_worker_thread, worker_list); |
| found: |
| /* |
| * this makes sure the worker doesn't exit before it is placed |
| * onto a busy/idle list |
| */ |
| atomic_inc(&worker->num_pending); |
| spin_unlock_irqrestore(&workers->lock, flags); |
| return worker; |
| } |
| |
| /* |
| * btrfs_requeue_work just puts the work item back on the tail of the list |
| * it was taken from. It is intended for use with long running work functions |
| * that make some progress and want to give the cpu up for others. |
| */ |
| int btrfs_requeue_work(struct btrfs_work *work) |
| { |
| struct btrfs_worker_thread *worker = work->worker; |
| unsigned long flags; |
| int wake = 0; |
| |
| if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) |
| goto out; |
| |
| spin_lock_irqsave(&worker->lock, flags); |
| if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) |
| list_add_tail(&work->list, &worker->prio_pending); |
| else |
| list_add_tail(&work->list, &worker->pending); |
| atomic_inc(&worker->num_pending); |
| |
| /* by definition we're busy, take ourselves off the idle |
| * list |
| */ |
| if (worker->idle) { |
| spin_lock(&worker->workers->lock); |
| worker->idle = 0; |
| list_move_tail(&worker->worker_list, |
| &worker->workers->worker_list); |
| spin_unlock(&worker->workers->lock); |
| } |
| if (!worker->working) { |
| wake = 1; |
| worker->working = 1; |
| } |
| |
| if (wake) |
| wake_up_process(worker->task); |
| spin_unlock_irqrestore(&worker->lock, flags); |
| out: |
| |
| return 0; |
| } |
| |
| void btrfs_set_work_high_prio(struct btrfs_work *work) |
| { |
| set_bit(WORK_HIGH_PRIO_BIT, &work->flags); |
| } |
| |
| /* |
| * places a struct btrfs_work into the pending queue of one of the kthreads |
| */ |
| void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) |
| { |
| struct btrfs_worker_thread *worker; |
| unsigned long flags; |
| int wake = 0; |
| |
| /* don't requeue something already on a list */ |
| if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) |
| return; |
| |
| worker = find_worker(workers); |
| if (workers->ordered) { |
| /* |
| * you're not allowed to do ordered queues from an |
| * interrupt handler |
| */ |
| spin_lock(&workers->order_lock); |
| if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { |
| list_add_tail(&work->order_list, |
| &workers->prio_order_list); |
| } else { |
| list_add_tail(&work->order_list, &workers->order_list); |
| } |
| spin_unlock(&workers->order_lock); |
| } else { |
| INIT_LIST_HEAD(&work->order_list); |
| } |
| |
| spin_lock_irqsave(&worker->lock, flags); |
| |
| if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) |
| list_add_tail(&work->list, &worker->prio_pending); |
| else |
| list_add_tail(&work->list, &worker->pending); |
| check_busy_worker(worker); |
| |
| /* |
| * avoid calling into wake_up_process if this thread has already |
| * been kicked |
| */ |
| if (!worker->working) |
| wake = 1; |
| worker->working = 1; |
| |
| if (wake) |
| wake_up_process(worker->task); |
| spin_unlock_irqrestore(&worker->lock, flags); |
| } |