/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
	struct log	*log)
{
	struct xlog_ticket *tic;

	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
				KM_SLEEP|KM_NOFS);
	tic->t_trans_type = XFS_TRANS_CHECKPOINT;

	/*
	 * set the current reservation to zero so we know to steal the basic
	 * transaction overhead reservation from the first transaction commit.
	 */
	tic->t_curr_res = 0;
	return tic;
}

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push. This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
	struct log	*log)
{
	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
	log->l_cilp->xc_ctx->sequence = 1;
	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
								log->l_curr_block);
}

/*
 * Format log items into flat buffers
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and formats the vector for the item into the buffer.
 * The buffer is then attached to the log item, and the log item is then
 * inserted into the Committed Item List for tracking until the next
 * checkpoint is written out.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer. Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point to the
 * copied region inside the buffer we just allocated. This allows us to format
 * the regions into the iclog as though they are being formatted directly out
 * of the objects themselves.
 */
static struct xfs_log_vec *
xlog_cil_prepare_log_vecs(
	struct xfs_trans	*tp)
{
	struct xfs_log_item_desc *lidp;
	struct xfs_log_vec	*lv = NULL;
	struct xfs_log_vec	*ret_lv = NULL;

	/* Bail out if we didn't find a log item.  */
	if (list_empty(&tp->t_items)) {
		ASSERT(0);
		return NULL;
	}

	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
		struct xfs_log_vec *new_lv;
		void	*ptr;
		int	index;
		int	len = 0;
		uint	niovecs;

		/* Skip items which aren't dirty in this transaction. */
		if (!(lidp->lid_flags & XFS_LID_DIRTY))
			continue;

		/* Skip items that do not have any vectors for writing */
		niovecs = IOP_SIZE(lidp->lid_item);
		if (!niovecs)
			continue;

		new_lv = kmem_zalloc(sizeof(*new_lv) +
				niovecs * sizeof(struct xfs_log_iovec),
				KM_SLEEP);

		/* The allocated iovec region lies beyond the log vector. */
		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
		new_lv->lv_niovecs = niovecs;
		new_lv->lv_item = lidp->lid_item;

		/* build the vector array and calculate its length */
		IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
		for (index = 0; index < new_lv->lv_niovecs; index++)
			len += new_lv->lv_iovecp[index].i_len;

		new_lv->lv_buf_len = len;
		new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
				KM_SLEEP|KM_NOFS);
		ptr = new_lv->lv_buf;

		for (index = 0; index < new_lv->lv_niovecs; index++) {
			struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];

			memcpy(ptr, vec->i_addr, vec->i_len);
			vec->i_addr = ptr;
			ptr += vec->i_len;
		}
		ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);

		if (!ret_lv)
			ret_lv = new_lv;
		else
			lv->lv_next = new_lv;
		lv = new_lv;
	}

	return ret_lv;
}
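
/*
 * Sketch of the allocation made in xlog_cil_prepare_log_vecs() for an item
 * with N iovecs (added for illustration; the layout follows directly from
 * the code above):
 *
 *	+---------------------------+ <- new_lv
 *	| struct xfs_log_vec        |
 *	+---------------------------+ <- new_lv->lv_iovecp = &new_lv[1]
 *	| N * struct xfs_log_iovec  |
 *	+---------------------------+
 *
 * The formatted region data itself lives in a second, separately allocated
 * buffer (new_lv->lv_buf), and each iovec's i_addr is rewritten to point
 * into that buffer.
 */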

/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
	struct log		*log,
	struct xfs_log_vec	*lv,
	int			*len,
	int			*diff_iovecs)
{
	struct xfs_log_vec	*old = lv->lv_item->li_lv;

	if (old) {
		/* existing lv on log item, space used is a delta */
		ASSERT(!list_empty(&lv->lv_item->li_cil));
		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);

		*len += lv->lv_buf_len - old->lv_buf_len;
		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
		kmem_free(old->lv_buf);
		kmem_free(old);
	} else {
		/* new lv, must pin the log item */
		ASSERT(!lv->lv_item->li_lv);
		ASSERT(list_empty(&lv->lv_item->li_cil));

		*len += lv->lv_buf_len;
		*diff_iovecs += lv->lv_niovecs;
		IOP_PIN(lv->lv_item);
	}

	/* attach new log vector to log item */
	lv->lv_item->li_lv = lv;

	/*
	 * If this is the first time the item is being committed to the
	 * CIL, store the sequence number on the log item so we can
	 * tell in future commits whether this is the first checkpoint
	 * the item is being committed into.
	 */
	if (!lv->lv_item->li_seq)
		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}
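
/*
 * Example of the delta accounting in xfs_cil_prepare_item() (illustrative
 * numbers only): an item previously committed to the CIL with a 512 byte
 * buffer and 2 iovecs that is now relogged with a 768 byte buffer and 3
 * iovecs contributes +256 bytes and +1 iovec to the checkpoint totals. A
 * brand new item contributes its full buffer length and iovec count.
 */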

/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
	struct log		*log,
	struct xfs_log_vec	*log_vector,
	struct xlog_ticket	*ticket)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
	struct xfs_log_vec	*lv;
	int			len = 0;
	int			diff_iovecs = 0;
	int			iclog_space;

	ASSERT(log_vector);

	/*
	 * Do all the accounting aggregation and switching of log vectors
	 * around in a separate loop to the insertion of items into the CIL.
	 * Then we can do a separate loop to update the CIL within a single
	 * lock/unlock pair. This reduces the number of round trips on the CIL
	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
	 * hold time for the transaction commit.
	 *
	 * If this is the first time the item is being placed into the CIL in
	 * this context, pin it so it can't be written to disk until the CIL is
	 * flushed to the iclog and the iclog written to disk.
	 *
	 * We can do this safely because the context can't checkpoint until we
	 * are done so it doesn't matter exactly how we update the CIL.
	 */
	for (lv = log_vector; lv; lv = lv->lv_next)
		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);

	/* account for space used by new iovec headers  */
	len += diff_iovecs * sizeof(xlog_op_header_t);

	spin_lock(&cil->xc_cil_lock);

	/* move the items to the tail of the CIL */
	for (lv = log_vector; lv; lv = lv->lv_next)
		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);

	ctx->nvecs += diff_iovecs;

	/*
	 * Now transfer enough transaction reservation to the context ticket
	 * for the checkpoint. The context ticket is special - the unit
	 * reservation has to grow as well as the current reservation as we
	 * steal from tickets so we can correctly determine the space used
	 * during the transaction commit.
	 */
	if (ctx->ticket->t_curr_res == 0) {
		/* first commit in checkpoint, steal the header reservation */
		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
		ticket->t_curr_res -= ctx->ticket->t_unit_res;
	}

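	/*
	 * Worked example for the check below (illustrative numbers only):
	 * with a 32k iclog and a 512 byte iclog header, iclog_space is
	 * 32256 bytes. If the checkpoint has used 30000 bytes so far and
	 * this commit adds 5000 more, we cross an iclog boundary, so we
	 * steal reservation for ceil(5000 / 32256) = 1 extra record header
	 * plus one op header from the transaction ticket.
	 */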
	/* do we need space for more log record headers? */
	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
	if (len > 0 && (ctx->space_used / iclog_space !=
				(ctx->space_used + len) / iclog_space)) {
		int hdrs;

		hdrs = (len + iclog_space - 1) / iclog_space;
		/* need to take into account split region headers, too */
		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
		ctx->ticket->t_unit_res += hdrs;
		ctx->ticket->t_curr_res += hdrs;
		ticket->t_curr_res -= hdrs;
		ASSERT(ticket->t_curr_res >= len);
	}
	ticket->t_curr_res -= len;
	ctx->space_used += len;

	spin_unlock(&cil->xc_cil_lock);
}

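/*
 * Free a log vector chain: release each vector's formatted data buffer and
 * then the vector itself.
 */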
static void
xlog_cil_free_logvec(
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv; ) {
		struct xfs_log_vec *next = lv->lv_next;
		kmem_free(lv->lv_buf);
		kmem_free(lv);
		lv = next;
	}
}

/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
	void	*args,
	int	abort)
{
	struct xfs_cil_ctx	*ctx = args;
	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;

	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
					ctx->start_lsn, abort);

	xfs_extent_busy_sort(&ctx->busy_extents);
	xfs_extent_busy_clear(mp, &ctx->busy_extents,
			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

	spin_lock(&ctx->cil->xc_cil_lock);
	list_del(&ctx->committing);
	spin_unlock(&ctx->cil->xc_cil_lock);

	xlog_cil_free_logvec(ctx->lv_chain);

	if (!list_empty(&ctx->busy_extents)) {
		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

		xfs_discard_extents(mp, &ctx->busy_extents);
		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
	}

	kmem_free(ctx);
}

/*
 * Push the Committed Item List to the log. If @push_seq is zero, then it is
 * a background flush and so we can choose to ignore it. Otherwise, if the
 * current sequence is the same as @push_seq we need to do a flush. If
 * @push_seq is less than the current sequence, then it has already been
 * flushed and we don't need to do anything - the caller will wait for it to
 * complete if necessary.
 *
 * @push_seq is a value rather than a flag because that allows us to do an
 * unlocked check of the sequence number for a match. Hence we can allow log
 * forces to run racily and not issue pushes for the same sequence twice. If we
 * get a race between multiple pushes for the same sequence they will block on
 * the first one and then abort, hence avoiding needless pushes.
 */
STATIC int
xlog_cil_push(
	struct log		*log)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_log_vec	*lv;
	struct xfs_cil_ctx	*ctx;
	struct xfs_cil_ctx	*new_ctx;
	struct xlog_in_core	*commit_iclog;
	struct xlog_ticket	*tic;
	int			num_lv;
	int			num_iovecs;
	int			len;
	int			error = 0;
	struct xfs_trans_header thdr;
	struct xfs_log_iovec	lhdr;
	struct xfs_log_vec	lvhdr = { NULL };
	xfs_lsn_t		commit_lsn;
	xfs_lsn_t		push_seq;

	if (!cil)
		return 0;

	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
	new_ctx->ticket = xlog_cil_ticket_alloc(log);

	down_write(&cil->xc_ctx_lock);
	ctx = cil->xc_ctx;

	spin_lock(&cil->xc_cil_lock);
	push_seq = cil->xc_push_seq;
	ASSERT(push_seq <= ctx->sequence);

	/*
	 * Check if we've anything to push. If there is nothing, then we don't
	 * move on to a new sequence number and so we have to be able to push
	 * this sequence again later.
	 */
	if (list_empty(&cil->xc_cil)) {
		cil->xc_push_seq = 0;
		spin_unlock(&cil->xc_cil_lock);
		goto out_skip;
	}
	spin_unlock(&cil->xc_cil_lock);

	/* check for a previously pushed sequence */
	if (push_seq < cil->xc_ctx->sequence)
		goto out_skip;

	/*
	 * pull all the log vectors off the items in the CIL, and
	 * remove the items from the CIL. We don't need the CIL lock
	 * here because it's only needed on the transaction commit
	 * side which is currently locked out by the flush lock.
	 */
	lv = NULL;
	num_lv = 0;
	num_iovecs = 0;
	len = 0;
	while (!list_empty(&cil->xc_cil)) {
		struct xfs_log_item	*item;
		int			i;

		item = list_first_entry(&cil->xc_cil,
					struct xfs_log_item, li_cil);
		list_del_init(&item->li_cil);
		if (!ctx->lv_chain)
			ctx->lv_chain = item->li_lv;
		else
			lv->lv_next = item->li_lv;
		lv = item->li_lv;
		item->li_lv = NULL;

		num_lv++;
		num_iovecs += lv->lv_niovecs;
		for (i = 0; i < lv->lv_niovecs; i++)
			len += lv->lv_iovecp[i].i_len;
	}

	/*
	 * initialise the new context and attach it to the CIL. Then attach
	 * the current context to the CIL committing list so it can be found
	 * during log forces to extract the commit lsn of the sequence that
	 * needs to be forced.
	 */
	INIT_LIST_HEAD(&new_ctx->committing);
	INIT_LIST_HEAD(&new_ctx->busy_extents);
	new_ctx->sequence = ctx->sequence + 1;
	new_ctx->cil = cil;
	cil->xc_ctx = new_ctx;

	/*
	 * mirror the new sequence into the cil structure so that we can do
	 * unlocked checks against the current sequence in log forces without
	 * risking dereferencing a freed context pointer.
	 */
	cil->xc_current_sequence = new_ctx->sequence;

	/*
	 * The switch is now done, so we can drop the context lock and move out
	 * of a shared context. We can't just go straight to the commit record,
	 * though - we need to synchronise with previous and future commits so
	 * that the commit records are correctly ordered in the log to ensure
	 * that we process items during log IO completion in the correct order.
	 *
	 * For example, if we get an EFI in one checkpoint and the EFD in the
	 * next (e.g. due to log forces), we do not want the checkpoint with
	 * the EFD to be committed before the checkpoint with the EFI. Hence
	 * we must strictly order the commit records of the checkpoints so
	 * that: a) the checkpoint callbacks are attached to the iclogs in the
	 * correct order; and b) the checkpoints are replayed in correct order
	 * in log recovery.
	 *
	 * Hence we need to add this context to the committing context list so
	 * that higher sequences will wait for us to write out a commit record
	 * before they do.
	 */
	spin_lock(&cil->xc_cil_lock);
	list_add(&ctx->committing, &cil->xc_committing);
	spin_unlock(&cil->xc_cil_lock);
	up_write(&cil->xc_ctx_lock);

	/*
	 * Build a checkpoint transaction header and write it to the log to
	 * begin the transaction. We need to account for the space used by the
	 * transaction header here as it is not accounted for in xlog_write().
	 *
	 * The LSN we need to pass to the log items on transaction commit is
	 * the LSN reported by the first log vector write. If we use the commit
	 * record lsn then we can move the tail beyond the grant write head.
	 */
	tic = ctx->ticket;
	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
	thdr.th_type = XFS_TRANS_CHECKPOINT;
	thdr.th_tid = tic->t_tid;
	thdr.th_num_items = num_iovecs;
	lhdr.i_addr = &thdr;
	lhdr.i_len = sizeof(xfs_trans_header_t);
	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);

	lvhdr.lv_niovecs = 1;
	lvhdr.lv_iovecp = &lhdr;
	lvhdr.lv_next = ctx->lv_chain;

	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
	if (error)
		goto out_abort_free_ticket;

	/*
	 * now that we've written the checkpoint into the log, strictly
	 * order the commit records so replay will get them in the right order.
	 */
restart:
	spin_lock(&cil->xc_cil_lock);
	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
		/*
		 * Higher sequences will wait for this one so skip them.
		 * Don't wait for our own sequence, either.
		 */
		if (new_ctx->sequence >= ctx->sequence)
			continue;
		if (!new_ctx->commit_lsn) {
			/*
			 * It is still being pushed! Wait for the push to
			 * complete, then start again from the beginning.
			 */
			xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
			goto restart;
		}
	}
	spin_unlock(&cil->xc_cil_lock);

	/* xfs_log_done always frees the ticket on error. */
	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
	if (commit_lsn == -1)
		goto out_abort;

	/* attach all the transactions w/ busy extents to iclog */
	ctx->log_cb.cb_func = xlog_cil_committed;
	ctx->log_cb.cb_arg = ctx;
	error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
	if (error)
		goto out_abort;

	/*
	 * now the checkpoint commit is complete and we've attached the
	 * callbacks to the iclog we can assign the commit LSN to the context
	 * and wake up anyone who is waiting for the commit to complete.
	 */
	spin_lock(&cil->xc_cil_lock);
	ctx->commit_lsn = commit_lsn;
	wake_up_all(&cil->xc_commit_wait);
	spin_unlock(&cil->xc_cil_lock);

	/* release the hounds! */
	return xfs_log_release_iclog(log->l_mp, commit_iclog);

out_skip:
	up_write(&cil->xc_ctx_lock);
	xfs_log_ticket_put(new_ctx->ticket);
	kmem_free(new_ctx);
	return 0;

out_abort_free_ticket:
	xfs_log_ticket_put(tic);
out_abort:
	xlog_cil_committed(ctx, XFS_LI_ABORTED);
	return XFS_ERROR(EIO);
}

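/*
 * Background work function for pushing the CIL. Queued on the mount's CIL
 * workqueue by xlog_cil_push_background() when the CIL grows past its
 * space limit.
 */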
static void
xlog_cil_push_work(
	struct work_struct	*work)
{
	struct xfs_cil *cil = container_of(work, struct xfs_cil,
							xc_push_work);
	xlog_cil_push(cil->xc_log);
}

/*
 * We need to push the CIL every so often so we don't cache more than we can
 * fit in the log. The limit really is that a checkpoint can't be more than
 * half the log (the current checkpoint is not allowed to overwrite the
 * previous checkpoint), but commit latency and memory usage limit this to a
 * smaller size.
 */
static void
xlog_cil_push_background(
	struct log	*log)
{
	struct xfs_cil	*cil = log->l_cilp;

	/*
	 * The cil won't be empty because we are called while holding the
	 * context lock, so whatever we added to the CIL will still be there.
	 */
	ASSERT(!list_empty(&cil->xc_cil));

	/*
	 * don't do a background push if we haven't used up all the
	 * space available yet.
	 */
	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
		return;

	spin_lock(&cil->xc_cil_lock);
	if (cil->xc_push_seq < cil->xc_current_sequence) {
		cil->xc_push_seq = cil->xc_current_sequence;
		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
	}
	spin_unlock(&cil->xc_cil_lock);
}

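/*
 * Push the CIL synchronously on behalf of a log force. Wait for any queued
 * background push to complete first, then push the given sequence if the
 * CIL is non-empty and the sequence hasn't already been pushed.
 */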
static void
xlog_cil_push_foreground(
	struct log	*log,
	xfs_lsn_t	push_seq)
{
	struct xfs_cil	*cil = log->l_cilp;

	if (!cil)
		return;

	ASSERT(push_seq && push_seq <= cil->xc_current_sequence);

	/* start on any pending background push to minimise wait time on it */
	flush_work(&cil->xc_push_work);

	/*
	 * If the CIL is empty or we've already pushed the sequence then
	 * there's no work we need to do.
	 */
	spin_lock(&cil->xc_cil_lock);
	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
		spin_unlock(&cil->xc_cil_lock);
		return;
	}

	cil->xc_push_seq = push_seq;
	spin_unlock(&cil->xc_cil_lock);

	/* do the push now */
	xlog_cil_push(log);
}

/*
 * Commit a transaction with the given vector to the Committed Item List.
 *
 * To do this, we need to format the item, pin it in memory if required and
 * account for the space used by the transaction. Once we have done that we
 * need to release the unused reservation for the transaction, attach the
 * transaction to the checkpoint context so we carry the busy extents through
 * to checkpoint completion, and then unlock all the items in the transaction.
 *
 * For more specific information about the order of operations in
 * xfs_log_commit_cil() please refer to the comments in
 * xfs_trans_commit_iclog().
 *
 * Takes the context lock in read mode to lock out background commit and
 * releases it before returning, once background commits are allowed again.
 */
int
xfs_log_commit_cil(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_lsn_t		*commit_lsn,
	int			flags)
{
	struct log		*log = mp->m_log;
	int			log_flags = 0;
	struct xfs_log_vec	*log_vector;

	if (flags & XFS_TRANS_RELEASE_LOG_RES)
		log_flags = XFS_LOG_REL_PERM_RESERV;

	/*
	 * Do all the hard work of formatting items (including memory
	 * allocation) outside the CIL context lock. This prevents stalling CIL
	 * pushes when we are low on memory and a transaction commit spends a
	 * lot of time in memory reclaim.
	 */
	log_vector = xlog_cil_prepare_log_vecs(tp);
	if (!log_vector)
		return ENOMEM;

	/* lock out background commit */
	down_read(&log->l_cilp->xc_ctx_lock);
	if (commit_lsn)
		*commit_lsn = log->l_cilp->xc_ctx->sequence;

	xlog_cil_insert_items(log, log_vector, tp->t_ticket);

	/* check we didn't blow the reservation */
	if (tp->t_ticket->t_curr_res < 0)
		xlog_print_tic_res(log->l_mp, tp->t_ticket);

	/* attach the transaction to the CIL if it has any busy extents */
	if (!list_empty(&tp->t_busy)) {
		spin_lock(&log->l_cilp->xc_cil_lock);
		list_splice_init(&tp->t_busy,
					&log->l_cilp->xc_ctx->busy_extents);
		spin_unlock(&log->l_cilp->xc_cil_lock);
	}

	tp->t_commit_lsn = *commit_lsn;
	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
	xfs_trans_unreserve_and_mod_sb(tp);

	/*
	 * Once all the items of the transaction have been copied to the CIL,
	 * the items can be unlocked and freed.
	 *
	 * This needs to be done before we drop the CIL context lock because we
	 * have to update state in the log items and unlock them before they go
	 * to disk. If we don't, then the CIL checkpoint can race with us and
	 * we can run checkpoint completion before we've updated and unlocked
	 * the log items. This affects (at least) processing of stale buffers,
	 * inodes and EFIs.
	 */
	xfs_trans_free_items(tp, *commit_lsn, 0);

	xlog_cil_push_background(log);

	up_read(&log->l_cilp->xc_ctx_lock);
	return 0;
}

745
746/*
Dave Chinner71e330b2010-05-21 14:37:18 +1000747 * Conditionally push the CIL based on the sequence passed in.
748 *
749 * We only need to push if we haven't already pushed the sequence
750 * number given. Hence the only time we will trigger a push here is
751 * if the push sequence is the same as the current context.
752 *
753 * We return the current commit lsn to allow the callers to determine if a
754 * iclog flush is necessary following this call.
Dave Chinner71e330b2010-05-21 14:37:18 +1000755 */
xfs_lsn_t
xlog_cil_force_lsn(
	struct log	*log,
	xfs_lsn_t	sequence)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx;
	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;

	ASSERT(sequence <= cil->xc_current_sequence);

	/*
	 * check to see if we need to force out the current context.
	 * xlog_cil_push() handles racing pushes for the same sequence,
	 * so no need to deal with it here.
	 */
	xlog_cil_push_foreground(log, sequence);

	/*
	 * See if we can find a previous sequence still committing.
	 * We need to wait for all previous sequence commits to complete
	 * before allowing the force of push_seq to go ahead. Hence block
	 * on commits for those as well.
	 */
restart:
	spin_lock(&cil->xc_cil_lock);
	list_for_each_entry(ctx, &cil->xc_committing, committing) {
		if (ctx->sequence > sequence)
			continue;
		if (!ctx->commit_lsn) {
			/*
			 * It is still being pushed! Wait for the push to
			 * complete, then start again from the beginning.
			 */
			xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
			goto restart;
		}
		if (ctx->sequence != sequence)
			continue;
		/* found it! */
		commit_lsn = ctx->commit_lsn;
	}
	spin_unlock(&cil->xc_cil_lock);
	return commit_lsn;
}

/*
 * Check if the current log item was first committed in this sequence.
 * We can't rely on just the log item being in the CIL, we have to check
 * the recorded commit sequence number.
 *
 * Note: for this to be used in a non-racy manner, it has to be called with
 * CIL flushing locked out. As a result, it should only be used during the
 * transaction commit process when deciding what to format into the item.
 */
bool
xfs_log_item_in_current_chkpt(
	struct xfs_log_item	*lip)
{
	struct xfs_cil_ctx	*ctx;

	if (list_empty(&lip->li_cil))
		return false;

	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;

	/*
	 * li_seq is written on the first commit of a log item to record the
	 * first checkpoint it is written to. Hence if it is different to the
	 * current sequence, we're in a new checkpoint.
	 */
	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
		return false;
	return true;
}

/*
 * Perform initial CIL structure initialisation.
 */
int
xlog_cil_init(
	struct log	*log)
{
	struct xfs_cil	*cil;
	struct xfs_cil_ctx *ctx;

	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
	if (!cil)
		return ENOMEM;

	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
	if (!ctx) {
		kmem_free(cil);
		return ENOMEM;
	}

	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
	INIT_LIST_HEAD(&cil->xc_cil);
	INIT_LIST_HEAD(&cil->xc_committing);
	spin_lock_init(&cil->xc_cil_lock);
	init_rwsem(&cil->xc_ctx_lock);
	init_waitqueue_head(&cil->xc_commit_wait);

	INIT_LIST_HEAD(&ctx->committing);
	INIT_LIST_HEAD(&ctx->busy_extents);
	ctx->sequence = 1;
	ctx->cil = cil;
	cil->xc_ctx = ctx;
	cil->xc_current_sequence = ctx->sequence;

	cil->xc_log = log;
	log->l_cilp = cil;
	return 0;
}

void
xlog_cil_destroy(
	struct log	*log)
{
	if (log->l_cilp->xc_ctx) {
		if (log->l_cilp->xc_ctx->ticket)
			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
		kmem_free(log->l_cilp->xc_ctx);
	}

	ASSERT(list_empty(&log->l_cilp->xc_cil));
	kmem_free(log->l_cilp);
}