Blame - fs/xfs/xfs_log.c - kernel/msm

blob: 51814c32eddf91148cc1585d6802affeb2eb64ef [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32
				33	/*
				34	* High level interface routines for log manager
				35	*/
				36
				37	#include "xfs.h"
				38	#include "xfs_macros.h"
				39	#include "xfs_types.h"
				40	#include "xfs_inum.h"
				41	#include "xfs_ag.h"
				42	#include "xfs_sb.h"
				43	#include "xfs_log.h"
				44	#include "xfs_trans.h"
				45	#include "xfs_dir.h"
				46	#include "xfs_dmapi.h"
				47	#include "xfs_mount.h"
				48	#include "xfs_error.h"
				49	#include "xfs_log_priv.h"
				50	#include "xfs_buf_item.h"
				51	#include "xfs_alloc_btree.h"
				52	#include "xfs_log_recover.h"
				53	#include "xfs_bit.h"
				54	#include "xfs_rw.h"
				55	#include "xfs_trans_priv.h"
				56
				57
				/*
				 * Advance a write cursor by 'bytes': the pointer and the offset move
				 * forward and the remaining length shrinks.  The body is wrapped in
				 * do/while(0) so the macro behaves as one statement and can be used
				 * in an unbraced if/else; callers must terminate it with ';'.
				 */
				#define xlog_write_adv_cnt(ptr, len, off, bytes) \
					do { \
						(ptr) += (bytes); \
						(len) -= (bytes); \
						(off) += (bytes); \
					} while (0)
				62
				63	/* Local miscellaneous function prototypes */
				64	STATIC int xlog_bdstrat_cb(struct xfs_buf *);
				65	STATIC int xlog_commit_record(xfs_mount_t mp, xlog_ticket_t ticket,
				66	xlog_in_core_t *, xfs_lsn_t );
				67	STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
				68	xfs_buftarg_t *log_target,
				69	xfs_daddr_t blk_offset,
				70	int num_bblks);
				71	STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
				72	STATIC int xlog_sync(xlog_t log, xlog_in_core_t iclog);
				73	STATIC void xlog_unalloc_log(xlog_t *log);
				74	STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
				75	int nentries, xfs_log_ticket_t tic,
				76	xfs_lsn_t *start_lsn,
				77	xlog_in_core_t **commit_iclog,
				78	uint flags);
				79
				80	/* local state machine functions */
				81	STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
				82	STATIC void xlog_state_do_callback(xlog_t log,int aborted, xlog_in_core_t iclog);
				83	STATIC int xlog_state_get_iclog_space(xlog_t *log,
				84	int len,
				85	xlog_in_core_t **iclog,
				86	xlog_ticket_t *ticket,
				87	int *continued_write,
				88	int *logoffsetp);
				89	STATIC void xlog_state_put_ticket(xlog_t *log,
				90	xlog_ticket_t *tic);
				91	STATIC int xlog_state_release_iclog(xlog_t *log,
				92	xlog_in_core_t *iclog);
				93	STATIC void xlog_state_switch_iclogs(xlog_t *log,
				94	xlog_in_core_t *iclog,
				95	int eventual_size);
				96	STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
				97	STATIC int xlog_state_sync_all(xlog_t *log, uint flags);
				98	STATIC void xlog_state_want_sync(xlog_t log, xlog_in_core_t iclog);
				99
				100	/* local functions to manipulate grant head */
				101	STATIC int xlog_grant_log_space(xlog_t *log,
				102	xlog_ticket_t *xtic);
				103	STATIC void xlog_grant_push_ail(xfs_mount_t *mp,
				104	int need_bytes);
				105	STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
				106	xlog_ticket_t *ticket);
				107	STATIC int xlog_regrant_write_log_space(xlog_t *log,
				108	xlog_ticket_t *ticket);
				109	STATIC void xlog_ungrant_log_space(xlog_t *log,
				110	xlog_ticket_t *ticket);
				111
				112
				113	/* local ticket functions */
				114	STATIC void xlog_state_ticket_alloc(xlog_t *log);
				115	STATIC xlog_ticket_t xlog_ticket_get(xlog_t log,
				116	int unit_bytes,
				117	int count,
				118	char clientid,
				119	uint flags);
				120	STATIC void xlog_ticket_put(xlog_t log, xlog_ticket_t ticket);
				121
				122	/* local debug functions */
				123	#if defined(DEBUG) && !defined(XLOG_NOLOG)
				124	STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
				125	STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
				126	STATIC void xlog_verify_iclog(xlog_t log, xlog_in_core_t iclog,
				127	int count, boolean_t syncing);
				128	STATIC void xlog_verify_tail_lsn(xlog_t log, xlog_in_core_t iclog,
				129	xfs_lsn_t tail_lsn);
				130	#else
				131	#define xlog_verify_dest_ptr(a,b)
				132	#define xlog_verify_grant_head(a,b)
				133	#define xlog_verify_iclog(a,b,c,d)
				134	#define xlog_verify_tail_lsn(a,b,c)
				135	#endif
				136
Christoph Hellwig	ba0f32d	2005-06-21 15:36:52 +1000	[diff] [blame]	137	STATIC int xlog_iclogs_empty(xlog_t *log);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	138
				#ifdef DEBUG
				int	xlog_do_error = 0;
				int	xlog_req_num  = 0;
				int	xlog_error_mod = 33;
				#endif

				/*
				 * Non-zero when XLOG_IO_ERROR is set in the log's l_flags.  The
				 * argument is parenthesized so any pointer expression may be passed.
				 */
				#define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)

				/*
				 * 0 => disable log manager
				 * 1 => enable log manager
				 * 2 => enable log manager and log debugging
				 */
				#if defined(XLOG_NOLOG) || defined(DEBUG)
				int   xlog_debug = 1;
				xfs_buftarg_t *xlog_target;
				#endif
				156
				#if defined(XFS_LOG_TRACE)

				/*
				 * Record one grant-head trace entry for ticket 'tic', tagged with
				 * 'string'.  The trace buffer is allocated on first use with
				 * KM_NOSLEEP; if that allocation fails the entry is silently dropped.
				 */
				void
				xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
				{
					unsigned long cnts;

					if (!log->l_grant_trace) {
						log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
						if (!log->l_grant_trace)
							return;
					}
					/* ticket counts are 1 byte each */
					cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;

					ktrace_enter(log->l_grant_trace,
						     (void *)tic,
						     (void *)log->l_reserve_headq,
						     (void *)log->l_write_headq,
						     (void *)((unsigned long)log->l_grant_reserve_cycle),
						     (void *)((unsigned long)log->l_grant_reserve_bytes),
						     (void *)((unsigned long)log->l_grant_write_cycle),
						     (void *)((unsigned long)log->l_grant_write_bytes),
						     (void *)((unsigned long)log->l_curr_cycle),
						     (void *)((unsigned long)log->l_curr_block),
						     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
						     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
						     (void *)string,
						     (void *)((unsigned long)tic->t_trans_type),
						     (void *)cnts,
						     (void *)((unsigned long)tic->t_curr_res),
						     (void *)((unsigned long)tic->t_unit_res));
				}

				/*
				 * Record an iclog state transition together with the current pid.
				 * The per-iclog trace buffer is allocated on first use (KM_SLEEP).
				 * Only the first two trace slots carry data; the rest are zero.
				 */
				void
				xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
				{
					pid_t pid;

					pid = current_pid();

					if (!iclog->ic_trace)
						iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
					ktrace_enter(iclog->ic_trace,
						     (void *)((unsigned long)state),
						     (void *)((unsigned long)pid),
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0,
						     (void *)0);
				}

				#else
				/* Tracing compiled out: the hooks expand to nothing. */
				#define	xlog_trace_loggrant(log,tic,string)
				#define	xlog_trace_iclog(iclog,state)
				#endif /* XFS_LOG_TRACE */
				223
				224	/*
				225	* NOTES:
				226	*
				227	* 1. currblock field gets updated at startup and after in-core logs
				228	* marked as with WANT_SYNC.
				229	*/
				230
				231	/*
				232	* This routine is called when a user of a log manager ticket is done with
				233	* the reservation. If the ticket was ever used, then a commit record for
				234	* the associated transaction is written out as a log operation header with
				235	* no data. The flag XLOG_TIC_INITED is set when the first write occurs with
				236	* a given ticket. If the ticket was one with a permanent reservation, then
				237	* a few operations are done differently. Permanent reservation tickets by
				238	* default don't release the reservation. They just commit the current
				239	* transaction with the belief that the reservation is still needed. A flag
				240	* must be passed in before permanent reservations are actually released.
				241	* When these type of tickets are not released, they need to be set into
				242	* the inited state again. By doing this, a start record will be written
				243	* out when the next write occurs.
				244	*/
				245	xfs_lsn_t
				246	xfs_log_done(xfs_mount_t *mp,
				247	xfs_log_ticket_t xtic,
				248	void **iclog,
				249	uint flags)
				250	{
				251	xlog_t *log = mp->m_log;
				252	xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic;
				253	xfs_lsn_t lsn = 0;
				254
				255	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				256	if (!xlog_debug && xlog_target == log->l_targ)
				257	return 0;
				258	#endif
				259
				260	if (XLOG_FORCED_SHUTDOWN(log) \|\|
				261	/*
				262	* If nothing was ever written, don't write out commit record.
				263	* If we get an error, just continue and give back the log ticket.
				264	*/
				265	(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
				266	(xlog_commit_record(mp, ticket,
				267	(xlog_in_core_t **)iclog, &lsn)))) {
				268	lsn = (xfs_lsn_t) -1;
				269	if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
				270	flags \|= XFS_LOG_REL_PERM_RESERV;
				271	}
				272	}
				273
				274
				275	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 \|\|
				276	(flags & XFS_LOG_REL_PERM_RESERV)) {
				277	/*
				278	* Release ticket if not permanent reservation or a specifc
				279	* request has been made to release a permanent reservation.
				280	*/
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	281	xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	282	xlog_ungrant_log_space(log, ticket);
				283	xlog_state_put_ticket(log, ticket);
				284	} else {
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	285	xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	286	xlog_regrant_reserve_log_space(log, ticket);
				287	}
				288
				289	/* If this ticket was a permanent reservation and we aren't
				290	* trying to release it, reset the inited flags; so next time
				291	* we write, a start record will be written out.
				292	*/
				293	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
				294	(flags & XFS_LOG_REL_PERM_RESERV) == 0)
				295	ticket->t_flags \|= XLOG_TIC_INITED;
				296
				297	return lsn;
				298	} /* xfs_log_done */
				299
				300
				301	/*
				302	* Force the in-core log to disk. If flags == XFS_LOG_SYNC,
				303	* the force is done synchronously.
				304	*
				305	* Asynchronous forces are implemented by setting the WANT_SYNC
				306	* bit in the appropriate in-core log and then returning.
				307	*
				308	* Synchronous forces are implemented with a semaphore. All callers
				309	* to force a given lsn to disk will wait on a semaphore attached to the
				310	* specific in-core log. When given in-core log finally completes its
				311	* write to disk, that thread will wake up all threads waiting on the
				312	* semaphore.
				313	*/
				314	int
				315	xfs_log_force(xfs_mount_t *mp,
				316	xfs_lsn_t lsn,
				317	uint flags)
				318	{
				319	int rval;
				320	xlog_t *log = mp->m_log;
				321
				322	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				323	if (!xlog_debug && xlog_target == log->l_targ)
				324	return 0;
				325	#endif
				326
				327	ASSERT(flags & XFS_LOG_FORCE);
				328
				329	XFS_STATS_INC(xs_log_force);
				330
				331	if ((log->l_flags & XLOG_IO_ERROR) == 0) {
				332	if (lsn == 0)
				333	rval = xlog_state_sync_all(log, flags);
				334	else
				335	rval = xlog_state_sync(log, lsn, flags);
				336	} else {
				337	rval = XFS_ERROR(EIO);
				338	}
				339
				340	return rval;
				341
				342	} /* xfs_log_force */
				343
				344	/*
				345	* Attaches a new iclog I/O completion callback routine during
				346	* transaction commit. If the log is in error state, a non-zero
				347	* return code is handed back and the caller is responsible for
				348	* executing the callback at an appropriate time.
				349	*/
				350	int
				351	xfs_log_notify(xfs_mount_t mp, / mount of partition */
				352	void iclog_hndl, / iclog to hang callback off */
				353	xfs_log_callback_t *cb)
				354	{
				355	xlog_t *log = mp->m_log;
				356	xlog_in_core_t iclog = (xlog_in_core_t )iclog_hndl;
				357	int abortflg, spl;
				358
				359	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				360	if (!xlog_debug && xlog_target == log->l_targ)
				361	return 0;
				362	#endif
				363	cb->cb_next = NULL;
				364	spl = LOG_LOCK(log);
				365	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
				366	if (!abortflg) {
				367	ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) \|\|
				368	(iclog->ic_state == XLOG_STATE_WANT_SYNC));
				369	cb->cb_next = NULL;
				370	*(iclog->ic_callback_tail) = cb;
				371	iclog->ic_callback_tail = &(cb->cb_next);
				372	}
				373	LOG_UNLOCK(log, spl);
				374	return abortflg;
				375	} /* xfs_log_notify */
				376
				377	int
				378	xfs_log_release_iclog(xfs_mount_t *mp,
				379	void *iclog_hndl)
				380	{
				381	xlog_t *log = mp->m_log;
				382	xlog_in_core_t iclog = (xlog_in_core_t )iclog_hndl;
				383
				384	if (xlog_state_release_iclog(log, iclog)) {
				385	xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
				386	return(EIO);
				387	}
				388
				389	return 0;
				390	}
				391
				392	/*
				393	* 1. Reserve an amount of on-disk log space and return a ticket corresponding
				394	* to the reservation.
				395	* 2. Potentially, push buffers at tail of log to disk.
				396	*
				397	* Each reservation is going to reserve extra space for a log record header.
				398	* When writes happen to the on-disk log, we don't subtract the length of the
				399	* log record header from any reservation. By wasting space in each
				400	* reservation, we prevent over allocation problems.
				401	*/
				402	int
				403	xfs_log_reserve(xfs_mount_t *mp,
				404	int unit_bytes,
				405	int cnt,
				406	xfs_log_ticket_t *ticket,
				407	__uint8_t client,
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	408	uint flags,
				409	uint t_type)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	410	{
				411	xlog_t *log = mp->m_log;
				412	xlog_ticket_t *internal_ticket;
				413	int retval;
				414
				415	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				416	if (!xlog_debug && xlog_target == log->l_targ)
				417	return 0;
				418	#endif
				419	retval = 0;
				420	ASSERT(client == XFS_TRANSACTION \|\| client == XFS_LOG);
				421	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
				422
				423	if (XLOG_FORCED_SHUTDOWN(log))
				424	return XFS_ERROR(EIO);
				425
				426	XFS_STATS_INC(xs_try_logspace);
				427
				428	if (*ticket != NULL) {
				429	ASSERT(flags & XFS_LOG_PERM_RESERV);
				430	internal_ticket = (xlog_ticket_t )ticket;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	431	xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432	xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
				433	retval = xlog_regrant_write_log_space(log, internal_ticket);
				434	} else {
				435	/* may sleep if need to allocate more tickets */
				436	internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
				437	client, flags);
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	438	internal_ticket->t_trans_type = t_type;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	439	*ticket = internal_ticket;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	440	xlog_trace_loggrant(log, internal_ticket,
				441	(internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ?
				442	"xfs_log_reserve: create new ticket (permanent trans)" :
				443	"xfs_log_reserve: create new ticket");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	444	xlog_grant_push_ail(mp,
				445	(internal_ticket->t_unit_res *
				446	internal_ticket->t_cnt));
				447	retval = xlog_grant_log_space(log, internal_ticket);
				448	}
				449
				450	return retval;
				451	} /* xfs_log_reserve */
				452
				453
				454	/*
				455	* Mount a log filesystem
				456	*
				457	* mp - ubiquitous xfs mount point structure
				458	* log_target - buftarg of on-disk log device
				459	* blk_offset - Start block # where block size is 512 bytes (BBSIZE)
				460	* num_bblocks - Number of BBSIZE blocks in on-disk log
				461	*
				462	* Return error or zero.
				463	*/
				464	int
				465	xfs_log_mount(xfs_mount_t *mp,
				466	xfs_buftarg_t *log_target,
				467	xfs_daddr_t blk_offset,
				468	int num_bblks)
				469	{
				470	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
				471	cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
				472	else {
				473	cmn_err(CE_NOTE,
				474	"!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
				475	mp->m_fsname);
				476	ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
				477	}
				478
				479	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
				480
				481	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				482	if (!xlog_debug) {
				483	cmn_err(CE_NOTE, "log dev: %s", XFS_BUFTARG_NAME(log_target));
				484	return 0;
				485	}
				486	#endif
				487	/*
				488	* skip log recovery on a norecovery mount. pretend it all
				489	* just worked.
				490	*/
				491	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
				492	int error;
				493	vfs_t *vfsp = XFS_MTOVFS(mp);
				494	int readonly = (vfsp->vfs_flag & VFS_RDONLY);
				495
				496	if (readonly)
				497	vfsp->vfs_flag &= ~VFS_RDONLY;
				498
				499	error = xlog_recover(mp->m_log, readonly);
				500
				501	if (readonly)
				502	vfsp->vfs_flag \|= VFS_RDONLY;
				503	if (error) {
				504	cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
				505	xlog_unalloc_log(mp->m_log);
				506	return error;
				507	}
				508	}
				509
				510	/* Normal transactions can now occur */
				511	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
				512
				513	/* End mounting message in xfs_log_mount_finish */
				514	return 0;
				515	} /* xfs_log_mount */
				516
				517	/*
				518	* Finish the recovery of the file system. This is separate from
				519	* the xfs_log_mount() call, because it depends on the code in
				520	* xfs_mountfs() to read in the root and real-time bitmap inodes
				521	* between calling xfs_log_mount() and here.
				522	*
				523	* mp - ubiquitous xfs mount point structure
				524	*/
				525	int
				526	xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
				527	{
				528	int error;
				529
				530	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
				531	error = xlog_recover_finish(mp->m_log, mfsi_flags);
				532	else {
				533	error = 0;
				534	ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
				535	}
				536
				537	return error;
				538	}
				539
				540	/*
				541	* Unmount processing for the log.
				542	*/
				543	int
				544	xfs_log_unmount(xfs_mount_t *mp)
				545	{
				546	int error;
				547
				548	error = xfs_log_unmount_write(mp);
				549	xfs_log_unmount_dealloc(mp);
				550	return (error);
				551	}
				552
				553	/*
				554	* Final log writes as part of unmount.
				555	*
				556	* Mark the filesystem clean as unmount happens. Note that during relocation
				557	* this routine needs to be executed as part of source-bag while the
				558	* deallocation must not be done until source-end.
				559	*/
				560
				561	/*
				562	* Unmount record used to have a string "Unmount filesystem--" in the
				563	* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
				564	* We just write the magic number now since that particular field isn't
				565	* currently architecture converted and "nUmount" is a bit foo.
				566	* As far as I know, there weren't any dependencies on the old behaviour.
				567	*/
				568
				569	int
				570	xfs_log_unmount_write(xfs_mount_t *mp)
				571	{
				572	xlog_t *log = mp->m_log;
				573	xlog_in_core_t *iclog;
				574	#ifdef DEBUG
				575	xlog_in_core_t *first_iclog;
				576	#endif
				577	xfs_log_iovec_t reg[1];
				578	xfs_log_ticket_t tic = NULL;
				579	xfs_lsn_t lsn;
				580	int error;
				581	SPLDECL(s);
				582
				583	/* the data section must be 32 bit size aligned */
				584	struct {
				585	__uint16_t magic;
				586	__uint16_t pad1;
				587	__uint32_t pad2; /* may as well make it 64 bits */
				588	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
				589
				590	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				591	if (!xlog_debug && xlog_target == log->l_targ)
				592	return 0;
				593	#endif
				594
				595	/*
				596	* Don't write out unmount record on read-only mounts.
				597	* Or, if we are doing a forced umount (typically because of IO errors).
				598	*/
				599	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
				600	return 0;
				601
				602	xfs_log_force(mp, 0, XFS_LOG_FORCE\|XFS_LOG_SYNC);
				603
				604	#ifdef DEBUG
				605	first_iclog = iclog = log->l_iclog;
				606	do {
				607	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
				608	ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
				609	ASSERT(iclog->ic_offset == 0);
				610	}
				611	iclog = iclog->ic_next;
				612	} while (iclog != first_iclog);
				613	#endif
				614	if (! (XLOG_FORCED_SHUTDOWN(log))) {
				615	reg[0].i_addr = (void*)&magic;
				616	reg[0].i_len = sizeof(magic);
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	617	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	618
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	619	error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	620	if (!error) {
				621	/* remove inited flag */
				622	((xlog_ticket_t *)tic)->t_flags = 0;
				623	error = xlog_write(mp, reg, 1, tic, &lsn,
				624	NULL, XLOG_UNMOUNT_TRANS);
				625	/*
				626	* At this point, we're umounting anyway,
				627	* so there's no point in transitioning log state
				628	* to IOERROR. Just continue...
				629	*/
				630	}
				631
				632	if (error) {
				633	xfs_fs_cmn_err(CE_ALERT, mp,
				634	"xfs_log_unmount: unmount record failed");
				635	}
				636
				637
				638	s = LOG_LOCK(log);
				639	iclog = log->l_iclog;
				640	iclog->ic_refcnt++;
				641	LOG_UNLOCK(log, s);
				642	xlog_state_want_sync(log, iclog);
				643	(void) xlog_state_release_iclog(log, iclog);
				644
				645	s = LOG_LOCK(log);
				646	if (!(iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				647	iclog->ic_state == XLOG_STATE_DIRTY)) {
				648	if (!XLOG_FORCED_SHUTDOWN(log)) {
				649	sv_wait(&iclog->ic_forcesema, PMEM,
				650	&log->l_icloglock, s);
				651	} else {
				652	LOG_UNLOCK(log, s);
				653	}
				654	} else {
				655	LOG_UNLOCK(log, s);
				656	}
				657	if (tic)
				658	xlog_state_put_ticket(log, tic);
				659	} else {
				660	/*
				661	* We're already in forced_shutdown mode, couldn't
				662	* even attempt to write out the unmount transaction.
				663	*
				664	* Go through the motions of sync'ing and releasing
				665	* the iclog, even though no I/O will actually happen,
				666	* we need to wait for other log I/O's that may already
				667	* be in progress. Do this as a separate section of
				668	* code so we'll know if we ever get stuck here that
				669	* we're in this odd situation of trying to unmount
				670	* a file system that went into forced_shutdown as
				671	* the result of an unmount..
				672	*/
				673	s = LOG_LOCK(log);
				674	iclog = log->l_iclog;
				675	iclog->ic_refcnt++;
				676	LOG_UNLOCK(log, s);
				677
				678	xlog_state_want_sync(log, iclog);
				679	(void) xlog_state_release_iclog(log, iclog);
				680
				681	s = LOG_LOCK(log);
				682
				683	if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
				684	\|\| iclog->ic_state == XLOG_STATE_DIRTY
				685	\|\| iclog->ic_state == XLOG_STATE_IOERROR) ) {
				686
				687	sv_wait(&iclog->ic_forcesema, PMEM,
				688	&log->l_icloglock, s);
				689	} else {
				690	LOG_UNLOCK(log, s);
				691	}
				692	}
				693
				694	return 0;
				695	} /* xfs_log_unmount_write */
				696
				697	/*
				698	* Deallocate log structures for unmount/relocation.
				699	*/
				700	void
				701	xfs_log_unmount_dealloc(xfs_mount_t *mp)
				702	{
				703	xlog_unalloc_log(mp->m_log);
				704	}
				705
				706	/*
				707	* Write region vectors to log. The write happens using the space reservation
				708	* of the ticket (tic). It is not a requirement that all writes for a given
				709	* transaction occur with one call to xfs_log_write().
				710	*/
				711	int
				712	xfs_log_write(xfs_mount_t * mp,
				713	xfs_log_iovec_t reg[],
				714	int nentries,
				715	xfs_log_ticket_t tic,
				716	xfs_lsn_t *start_lsn)
				717	{
				718	int error;
				719	xlog_t *log = mp->m_log;
				720
				721	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				722	if (!xlog_debug && xlog_target == log->l_targ) {
				723	*start_lsn = 0;
				724	return 0;
				725	}
				726	#endif
				727	if (XLOG_FORCED_SHUTDOWN(log))
				728	return XFS_ERROR(EIO);
				729
				730	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
				731	xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
				732	}
				733	return (error);
				734	} /* xfs_log_write */
				735
				736
				737	void
				738	xfs_log_move_tail(xfs_mount_t *mp,
				739	xfs_lsn_t tail_lsn)
				740	{
				741	xlog_ticket_t *tic;
				742	xlog_t *log = mp->m_log;
				743	int need_bytes, free_bytes, cycle, bytes;
				744	SPLDECL(s);
				745
				746	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				747	if (!xlog_debug && xlog_target == log->l_targ)
				748	return;
				749	#endif
				750	/* XXXsup tmp */
				751	if (XLOG_FORCED_SHUTDOWN(log))
				752	return;
				753	ASSERT(!XFS_FORCED_SHUTDOWN(mp));
				754
				755	if (tail_lsn == 0) {
				756	/* needed since sync_lsn is 64 bits */
				757	s = LOG_LOCK(log);
				758	tail_lsn = log->l_last_sync_lsn;
				759	LOG_UNLOCK(log, s);
				760	}
				761
				762	s = GRANT_LOCK(log);
				763
				764	/* Also an invalid lsn. 1 implies that we aren't passing in a valid
				765	* tail_lsn.
				766	*/
				767	if (tail_lsn != 1) {
				768	log->l_tail_lsn = tail_lsn;
				769	}
				770
				771	if ((tic = log->l_write_headq)) {
				772	#ifdef DEBUG
				773	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				774	panic("Recovery problem");
				775	#endif
				776	cycle = log->l_grant_write_cycle;
				777	bytes = log->l_grant_write_bytes;
				778	free_bytes = xlog_space_left(log, cycle, bytes);
				779	do {
				780	ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
				781
				782	if (free_bytes < tic->t_unit_res && tail_lsn != 1)
				783	break;
				784	tail_lsn = 0;
				785	free_bytes -= tic->t_unit_res;
				786	sv_signal(&tic->t_sema);
				787	tic = tic->t_next;
				788	} while (tic != log->l_write_headq);
				789	}
				790	if ((tic = log->l_reserve_headq)) {
				791	#ifdef DEBUG
				792	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				793	panic("Recovery problem");
				794	#endif
				795	cycle = log->l_grant_reserve_cycle;
				796	bytes = log->l_grant_reserve_bytes;
				797	free_bytes = xlog_space_left(log, cycle, bytes);
				798	do {
				799	if (tic->t_flags & XLOG_TIC_PERM_RESERV)
				800	need_bytes = tic->t_unit_res*tic->t_cnt;
				801	else
				802	need_bytes = tic->t_unit_res;
				803	if (free_bytes < need_bytes && tail_lsn != 1)
				804	break;
				805	tail_lsn = 0;
				806	free_bytes -= need_bytes;
				807	sv_signal(&tic->t_sema);
				808	tic = tic->t_next;
				809	} while (tic != log->l_reserve_headq);
				810	}
				811	GRANT_UNLOCK(log, s);
				812	} /* xfs_log_move_tail */
				813
				814	/*
				815	* Determine if we have a transaction that has gone to disk
				816	* that needs to be covered. Log activity needs to be idle (no AIL and
				817	* nothing in the iclogs). And, we need to be in the right state indicating
				818	* something has gone out.
				819	*/
				820	int
				821	xfs_log_need_covered(xfs_mount_t *mp)
				822	{
				823	SPLDECL(s);
				824	int needed = 0, gen;
				825	xlog_t *log = mp->m_log;
				826	vfs_t *vfsp = XFS_MTOVFS(mp);
				827
				828	if (fs_frozen(vfsp) \|\| XFS_FORCED_SHUTDOWN(mp) \|\|
				829	(vfsp->vfs_flag & VFS_RDONLY))
				830	return 0;
				831
				832	s = LOG_LOCK(log);
				833	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) \|\|
				834	(log->l_covered_state == XLOG_STATE_COVER_NEED2))
				835	&& !xfs_trans_first_ail(mp, &gen)
				836	&& xlog_iclogs_empty(log)) {
				837	if (log->l_covered_state == XLOG_STATE_COVER_NEED)
				838	log->l_covered_state = XLOG_STATE_COVER_DONE;
				839	else {
				840	ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
				841	log->l_covered_state = XLOG_STATE_COVER_DONE2;
				842	}
				843	needed = 1;
				844	}
				845	LOG_UNLOCK(log, s);
				846	return(needed);
				847	}
				848
				849	/******************************************************************************
				850	*
				851	* local routines
				852	*
				853	******************************************************************************
				854	*/
				855
				856	/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
				857	* The log manager must keep track of the last LR which was committed
				858	* to disk. The lsn of this LR will become the new tail_lsn whenever
				859	* xfs_trans_tail_ail returns 0. If we don't do this, we run into
				860	* the situation where stuff could be written into the log but nothing
				861	* was ever in the AIL when asked. Eventually, we panic since the
				862	* tail hits the head.
				863	*
				864	* We may be holding the log iclog lock upon entering this routine.
				865	*/
				866	xfs_lsn_t
				867	xlog_assign_tail_lsn(xfs_mount_t *mp)
				868	{
				869	xfs_lsn_t tail_lsn;
				870	SPLDECL(s);
				871	xlog_t *log = mp->m_log;
				872
				873	tail_lsn = xfs_trans_tail_ail(mp);
				874	s = GRANT_LOCK(log);
				875	if (tail_lsn != 0) {
				876	log->l_tail_lsn = tail_lsn;
				877	} else {
				878	tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
				879	}
				880	GRANT_UNLOCK(log, s);
				881
				882	return tail_lsn;
				883	} /* xlog_assign_tail_lsn */
				884
				885
				886	/*
				887	* Return the space in the log between the tail and the head. The head
				888	* is passed in the cycle/bytes formal parms. In the special case where
				889	* the reserve head has wrapped passed the tail, this calculation is no
				890	* longer valid. In this case, just return 0 which means there is no space
				891	* in the log. This works for all places where this function is called
				892	* with the reserve head. Of course, if the write head were to ever
				893	* wrap the tail, we should blow up. Rather than catch this case here,
				894	* we depend on other ASSERTions in other parts of the code. XXXmiken
				895	*
				896	* This code also handles the case where the reservation head is behind
				897	* the tail. The details of this case are described below, but the end
				898	* result is that we return the size of the log as the amount of space left.
				899	*/
				900	int
				901	xlog_space_left(xlog_t *log, int cycle, int bytes)
				902	{
				903	int free_bytes;
				904	int tail_bytes;
				905	int tail_cycle;
				906
				907	tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
				908	tail_cycle = CYCLE_LSN(log->l_tail_lsn);
				909	if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
				910	free_bytes = log->l_logsize - (bytes - tail_bytes);
				911	} else if ((tail_cycle + 1) < cycle) {
				912	return 0;
				913	} else if (tail_cycle < cycle) {
				914	ASSERT(tail_cycle == (cycle - 1));
				915	free_bytes = tail_bytes - bytes;
				916	} else {
				917	/*
				918	* The reservation head is behind the tail.
				919	* In this case we just want to return the size of the
				920	* log as the amount of space left.
				921	*/
				922	xfs_fs_cmn_err(CE_ALERT, log->l_mp,
				923	"xlog_space_left: head behind tail\n"
				924	" tail_cycle = %d, tail_bytes = %d\n"
				925	" GH cycle = %d, GH bytes = %d",
				926	tail_cycle, tail_bytes, cycle, bytes);
				927	ASSERT(0);
				928	free_bytes = log->l_logsize;
				929	}
				930	return free_bytes;
				931	} /* xlog_space_left */
				932
				933
				934	/*
				935	* Log function which is called when an io completes.
				936	*
				937	* The log manager needs its own routine, in order to control what
				938	* happens with the buffer after the write completes.
				939	*/
				940	void
				941	xlog_iodone(xfs_buf_t *bp)
				942	{
				943	xlog_in_core_t *iclog;
				944	xlog_t *l;
				945	int aborted;
				946
				947	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
				948	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
				949	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
				950	aborted = 0;
				951
				952	/*
				953	* Some versions of cpp barf on the recursive definition of
				954	* ic_log -> hic_fields.ic_log and expand ic_log twice when
				955	* it is passed through two macros. Workaround broken cpp.
				956	*/
				957	l = iclog->ic_log;
				958
				959	/*
				960	* Race to shutdown the filesystem if we see an error.
				961	*/
				962	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
				963	XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
				964	xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
				965	XFS_BUF_STALE(bp);
				966	xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
				967	/*
				968	* This flag will be propagated to the trans-committed
				969	* callback routines to let them know that the log-commit
				970	* didn't succeed.
				971	*/
				972	aborted = XFS_LI_ABORTED;
				973	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
				974	aborted = XFS_LI_ABORTED;
				975	}
				976	xlog_state_done_syncing(iclog, aborted);
				977	if (!(XFS_BUF_ISASYNC(bp))) {
				978	/*
				979	* Corresponding psema() will be done in bwrite(). If we don't
				980	* vsema() here, panic.
				981	*/
				982	XFS_BUF_V_IODONESEMA(bp);
				983	}
				984	} /* xlog_iodone */
				985
				986	/*
				987	* The bdstrat callback function for log bufs. This gives us a central
				988	* place to trap bufs in case we get hit by a log I/O error and need to
				989	* shutdown. Actually, in practice, even when we didn't get a log error,
				990	* we transition the iclogs to IOERROR state after flushing all existing
				991	* iclogs to disk. This is because we don't want anymore new transactions to be
				992	* started or completed afterwards.
				993	*/
				994	STATIC int
				995	xlog_bdstrat_cb(struct xfs_buf *bp)
				996	{
				997	xlog_in_core_t *iclog;
				998
				999	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
				1000
				1001	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
				1002	/* note for irix bstrat will need struct bdevsw passed
				1003	* Fix the following macro if the code ever is merged
				1004	*/
				1005	XFS_bdstrat(bp);
				1006	return 0;
				1007	}
				1008
				1009	xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
				1010	XFS_BUF_ERROR(bp, EIO);
				1011	XFS_BUF_STALE(bp);
				1012	xfs_biodone(bp);
				1013	return (XFS_ERROR(EIO));
				1014
				1015
				1016	}
				1017
				1018	/*
				1019	* Return size of each in-core log record buffer.
				1020	*
				1021	* Low memory machines only get 2 16KB buffers. We don't want to waste
				1022	* memory here. However, all other machines get at least 2 32KB buffers.
				1023	* The number is hard coded because we don't care about the minimum
				1024	* memory size, just 32MB systems.
				1025	*
				1026	* If the filesystem blocksize is too large, we may need to choose a
				1027	* larger size since the directory code currently logs entire blocks.
				1028	*/
				1029
				1030	STATIC void
				1031	xlog_get_iclog_buffer_size(xfs_mount_t *mp,
				1032	xlog_t *log)
				1033	{
				1034	int size;
				1035	int xhdrs;
				1036
				1037	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				1038	/*
				1039	* When logbufs == 0, someone has disabled the log from the FSTAB
				1040	* file. This is not a documented feature. We need to set xlog_debug
				1041	* to zero (this deactivates the log) and set xlog_target to the
				1042	* appropriate device. Only one filesystem may be affected as such
				1043	* since this is just a performance hack to test what we might be able
				1044	* to get if the log were not present.
				1045	*/
				1046	if (mp->m_logbufs == 0) {
				1047	xlog_debug = 0;
				1048	xlog_target = log->l_targ;
				1049	log->l_iclog_bufs = XLOG_MIN_ICLOGS;
				1050	} else
				1051	#endif
				1052	{
				1053	/*
				1054	* This is the normal path. If m_logbufs == -1, then the
				1055	* admin has chosen to use the system defaults for logbuffers.
				1056	*/
				1057	if (mp->m_logbufs == -1) {
				1058	if (xfs_physmem <= btoc(12810241024)) {
				1059	log->l_iclog_bufs = XLOG_MIN_ICLOGS;
				1060	} else if (xfs_physmem <= btoc(40010241024)) {
				1061	log->l_iclog_bufs = XLOG_MED_ICLOGS;
				1062	} else {
				1063	/* 256K with 32K bufs */
				1064	log->l_iclog_bufs = XLOG_MAX_ICLOGS;
				1065	}
				1066	} else
				1067	log->l_iclog_bufs = mp->m_logbufs;
				1068
				1069	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				1070	/* We are reactivating a filesystem after it was inactive */
				1071	if (log->l_targ == xlog_target) {
				1072	xlog_target = NULL;
				1073	xlog_debug = 1;
				1074	}
				1075	#endif
				1076	}
				1077
				1078	/*
				1079	* Buffer size passed in from mount system call.
				1080	*/
				1081	if (mp->m_logbsize != -1) {
				1082	size = log->l_iclog_size = mp->m_logbsize;
				1083	log->l_iclog_size_log = 0;
				1084	while (size != 1) {
				1085	log->l_iclog_size_log++;
				1086	size >>= 1;
				1087	}
				1088
				1089	if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
				1090	/* # headers = size / 32K
				1091	* one header holds cycles from 32K of data
				1092	*/
				1093
				1094	xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
				1095	if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
				1096	xhdrs++;
				1097	log->l_iclog_hsize = xhdrs << BBSHIFT;
				1098	log->l_iclog_heads = xhdrs;
				1099	} else {
				1100	ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
				1101	log->l_iclog_hsize = BBSIZE;
				1102	log->l_iclog_heads = 1;
				1103	}
				1104	return;
				1105	}
				1106
				1107	/*
				1108	* Special case machines that have less than 32MB of memory.
				1109	* All machines with more memory use 32KB buffers.
				1110	*/
				1111	if (xfs_physmem <= btoc(3210241024)) {
				1112	/* Don't change; min configuration */
				1113	log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */
				1114	log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
				1115	} else {
				1116	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */
				1117	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
				1118	}
				1119
				1120	/* the default log size is 16k or 32k which is one header sector */
				1121	log->l_iclog_hsize = BBSIZE;
				1122	log->l_iclog_heads = 1;
				1123
				1124	/*
				1125	* For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use
				1126	* 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
				1127	*/
				1128	if (mp->m_sb.sb_blocksize >= 16*1024) {
				1129	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
				1130	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
				1131	if (mp->m_logbufs == -1) {
				1132	switch (mp->m_sb.sb_blocksize) {
				1133	case 161024: / 16 KB */
				1134	log->l_iclog_bufs = 3;
				1135	break;
				1136	case 321024: / 32 KB */
				1137	log->l_iclog_bufs = 4;
				1138	break;
				1139	case 641024: / 64 KB */
				1140	log->l_iclog_bufs = 8;
				1141	break;
				1142	default:
				1143	xlog_panic("XFS: Invalid blocksize");
				1144	break;
				1145	}
				1146	}
				1147	}
				1148	} /* xlog_get_iclog_buffer_size */
				1149
				1150
				1151	/*
				1152	* This routine initializes some of the log structure for a given mount point.
				1153	* Its primary purpose is to fill in enough, so recovery can occur. However,
				1154	* some other stuff may be filled in too.
				1155	*/
				1156	STATIC xlog_t *
				1157	xlog_alloc_log(xfs_mount_t *mp,
				1158	xfs_buftarg_t *log_target,
				1159	xfs_daddr_t blk_offset,
				1160	int num_bblks)
				1161	{
				1162	xlog_t *log;
				1163	xlog_rec_header_t *head;
				1164	xlog_in_core_t **iclogp;
				1165	xlog_in_core_t iclog, prev_iclog=NULL;
				1166	xfs_buf_t *bp;
				1167	int i;
				1168	int iclogsize;
				1169
				1170	log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
				1171
				1172	log->l_mp = mp;
				1173	log->l_targ = log_target;
				1174	log->l_logsize = BBTOB(num_bblks);
				1175	log->l_logBBstart = blk_offset;
				1176	log->l_logBBsize = num_bblks;
				1177	log->l_covered_state = XLOG_STATE_COVER_IDLE;
				1178	log->l_flags \|= XLOG_ACTIVE_RECOVERY;
				1179
				1180	log->l_prev_block = -1;
				1181	ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0);
				1182	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
				1183	log->l_last_sync_lsn = log->l_tail_lsn;
				1184	log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
				1185	log->l_grant_reserve_cycle = 1;
				1186	log->l_grant_write_cycle = 1;
				1187
				1188	if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
				1189	log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
				1190	ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
				1191	/* for larger sector sizes, must have v2 or external log */
				1192	ASSERT(log->l_sectbb_log == 0 \|\|
				1193	log->l_logBBstart == 0 \|\|
				1194	XFS_SB_VERSION_HASLOGV2(&mp->m_sb));
				1195	ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
				1196	}
				1197	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
				1198
				1199	xlog_get_iclog_buffer_size(mp, log);
				1200
				1201	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
				1202	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
				1203	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
				1204	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
				1205	ASSERT(XFS_BUF_ISBUSY(bp));
				1206	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
				1207	log->l_xbuf = bp;
				1208
				1209	spinlock_init(&log->l_icloglock, "iclog");
				1210	spinlock_init(&log->l_grant_lock, "grhead_iclog");
				1211	initnsema(&log->l_flushsema, 0, "ic-flush");
				1212	xlog_state_ticket_alloc(log); /* wait until after icloglock inited */
				1213
				1214	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
				1215	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
				1216
				1217	iclogp = &log->l_iclog;
				1218	/*
				1219	* The amount of memory to allocate for the iclog structure is
				1220	* rather funky due to the way the structure is defined. It is
				1221	* done this way so that we can use different sizes for machines
				1222	* with different amounts of memory. See the definition of
				1223	* xlog_in_core_t in xfs_log_priv.h for details.
				1224	*/
				1225	iclogsize = log->l_iclog_size;
				1226	ASSERT(log->l_iclog_size >= 4096);
				1227	for (i=0; i < log->l_iclog_bufs; i++) {
				1228	iclogp = (xlog_in_core_t )
				1229	kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
				1230	iclog = *iclogp;
				1231	iclog->hic_data = (xlog_in_core_2_t *)
				1232	kmem_zalloc(iclogsize, KM_SLEEP);
				1233
				1234	iclog->ic_prev = prev_iclog;
				1235	prev_iclog = iclog;
				1236	log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
				1237
				1238	head = &iclog->ic_header;
				1239	memset(head, 0, sizeof(xlog_rec_header_t));
				1240	INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
				1241	INT_SET(head->h_version, ARCH_CONVERT,
				1242	XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
				1243	INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
				1244	/* new fields */
				1245	INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
				1246	memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
				1247
				1248	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
				1249	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
				1250	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
				1251	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
				1252	iclog->ic_bp = bp;
				1253
				1254	iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
				1255	iclog->ic_state = XLOG_STATE_ACTIVE;
				1256	iclog->ic_log = log;
				1257	iclog->ic_callback_tail = &(iclog->ic_callback);
				1258	iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
				1259
				1260	ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
				1261	ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
				1262	sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
				1263	sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
				1264
				1265	iclogp = &iclog->ic_next;
				1266	}
				1267	iclogp = log->l_iclog; / complete ring */
				1268	log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
				1269
				1270	return log;
				1271	} /* xlog_alloc_log */
				1272
				1273
				1274	/*
				1275	* Write out the commit record of a transaction associated with the given
				1276	* ticket. Return the lsn of the commit record.
				1277	*/
				1278	STATIC int
				1279	xlog_commit_record(xfs_mount_t *mp,
				1280	xlog_ticket_t *ticket,
				1281	xlog_in_core_t **iclog,
				1282	xfs_lsn_t *commitlsnp)
				1283	{
				1284	int error;
				1285	xfs_log_iovec_t reg[1];
				1286
				1287	reg[0].i_addr = NULL;
				1288	reg[0].i_len = 0;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1289	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1290
				1291	ASSERT_ALWAYS(iclog);
				1292	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
				1293	iclog, XLOG_COMMIT_TRANS))) {
				1294	xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
				1295	}
				1296	return (error);
				1297	} /* xlog_commit_record */
				1298
				1299
				1300	/*
				1301	* Push on the buffer cache code if we ever use more than 75% of the on-disk
				1302	* log space. This code pushes on the lsn which would supposedly free up
				1303	* the 25% which we want to leave free. We may need to adopt a policy which
				1304	* pushes on an lsn which is further along in the log once we reach the high
				1305	* water mark. In this manner, we would be creating a low water mark.
				1306	*/
				1307	void
				1308	xlog_grant_push_ail(xfs_mount_t *mp,
				1309	int need_bytes)
				1310	{
				1311	xlog_t log = mp->m_log; / pointer to the log */
				1312	xfs_lsn_t tail_lsn; /* lsn of the log tail */
				1313	xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */
				1314	int free_blocks; /* free blocks left to write to */
				1315	int free_bytes; /* free bytes left to write to */
				1316	int threshold_block; /* block in lsn we'd like to be at */
				1317	int threshold_cycle; /* lsn cycle we'd like to be at */
				1318	int free_threshold;
				1319	SPLDECL(s);
				1320
				1321	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
				1322
				1323	s = GRANT_LOCK(log);
				1324	free_bytes = xlog_space_left(log,
				1325	log->l_grant_reserve_cycle,
				1326	log->l_grant_reserve_bytes);
				1327	tail_lsn = log->l_tail_lsn;
				1328	free_blocks = BTOBBT(free_bytes);
				1329
				1330	/*
				1331	* Set the threshold for the minimum number of free blocks in the
				1332	* log to the maximum of what the caller needs, one quarter of the
				1333	* log, and 256 blocks.
				1334	*/
				1335	free_threshold = BTOBB(need_bytes);
				1336	free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
				1337	free_threshold = MAX(free_threshold, 256);
				1338	if (free_blocks < free_threshold) {
				1339	threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
				1340	threshold_cycle = CYCLE_LSN(tail_lsn);
				1341	if (threshold_block >= log->l_logBBsize) {
				1342	threshold_block -= log->l_logBBsize;
				1343	threshold_cycle += 1;
				1344	}
				1345	ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle,
				1346	threshold_block);
				1347
				1348	/* Don't pass in an lsn greater than the lsn of the last
				1349	* log record known to be on disk.
				1350	*/
				1351	if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
				1352	threshold_lsn = log->l_last_sync_lsn;
				1353	}
				1354	GRANT_UNLOCK(log, s);
				1355
				1356	/*
				1357	* Get the transaction layer to kick the dirty buffers out to
				1358	* disk asynchronously. No point in trying to do this if
				1359	* the filesystem is shutting down.
				1360	*/
				1361	if (threshold_lsn &&
				1362	!XLOG_FORCED_SHUTDOWN(log))
				1363	xfs_trans_push_ail(mp, threshold_lsn);
				1364	} /* xlog_grant_push_ail */
				1365
				1366
				1367	/*
				1368	* Flush out the in-core log (iclog) to the on-disk log in an asynchronous
				1369	* fashion. Previously, we should have moved the current iclog
				1370	* ptr in the log to point to the next available iclog. This allows further
				1371	* write to continue while this code syncs out an iclog ready to go.
				1372	* Before an in-core log can be written out, the data section must be scanned
				1373	* to save away the 1st word of each BBSIZE block into the header. We replace
				1374	* it with the current cycle count. Each BBSIZE block is tagged with the
				1375	* cycle count because there in an implicit assumption that drives will
				1376	* guarantee that entire 512 byte blocks get written at once. In other words,
				1377	* we can't have part of a 512 byte block written and part not written. By
				1378	* tagging each block, we will know which blocks are valid when recovering
				1379	* after an unclean shutdown.
				1380	*
				1381	* This routine is single threaded on the iclog. No other thread can be in
				1382	* this routine with the same iclog. Changing contents of iclog can there-
				1383	* fore be done without grabbing the state machine lock. Updating the global
				1384	* log will require grabbing the lock though.
				1385	*
				1386	* The entire log manager uses a logical block numbering scheme. Only
				1387	* log_sync (and then only bwrite()) know about the fact that the log may
				1388	* not start with block zero on a given device. The log block start offset
				1389	* is added immediately before calling bwrite().
				1390	*/
				1391
				1392	int
				1393	xlog_sync(xlog_t *log,
				1394	xlog_in_core_t *iclog)
				1395	{
				1396	xfs_caddr_t dptr; /* pointer to byte sized element */
				1397	xfs_buf_t *bp;
				1398	int i, ops;
				1399	uint count; /* byte count of bwrite */
				1400	uint count_init; /* initial count before roundup */
				1401	int roundoff; /* roundoff to BB or stripe */
				1402	int split = 0; /* split write into two regions */
				1403	int error;
				1404	SPLDECL(s);
				1405	int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb);
				1406
				1407	XFS_STATS_INC(xs_log_writes);
				1408	ASSERT(iclog->ic_refcnt == 0);
				1409
				1410	/* Add for LR header */
				1411	count_init = log->l_iclog_hsize + iclog->ic_offset;
				1412
				1413	/* Round out the log write size */
				1414	if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
				1415	/* we have a v2 stripe unit to use */
				1416	count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
				1417	} else {
				1418	count = BBTOB(BTOBB(count_init));
				1419	}
				1420	roundoff = count - count_init;
				1421	ASSERT(roundoff >= 0);
				1422	ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
				1423	roundoff < log->l_mp->m_sb.sb_logsunit)
				1424	\|\|
				1425	(log->l_mp->m_sb.sb_logsunit <= 1 &&
				1426	roundoff < BBTOB(1)));
				1427
				1428	/* move grant heads by roundoff in sync */
				1429	s = GRANT_LOCK(log);
				1430	XLOG_GRANT_ADD_SPACE(log, roundoff, 'w');
				1431	XLOG_GRANT_ADD_SPACE(log, roundoff, 'r');
				1432	GRANT_UNLOCK(log, s);
				1433
				1434	/* put cycle number in every block */
				1435	xlog_pack_data(log, iclog, roundoff);
				1436
				1437	/* real byte length */
				1438	if (v2) {
				1439	INT_SET(iclog->ic_header.h_len,
				1440	ARCH_CONVERT,
				1441	iclog->ic_offset + roundoff);
				1442	} else {
				1443	INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
				1444	}
				1445
				1446	/* put ops count in correct order */
				1447	ops = iclog->ic_header.h_num_logops;
				1448	INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);
				1449
				1450	bp = iclog->ic_bp;
				1451	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
				1452	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
				1453	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)));
				1454
				1455	XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
				1456
				1457	/* Do we need to split this write into 2 parts? */
				1458	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
				1459	split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
				1460	count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
				1461	iclog->ic_bwritecnt = 2; /* split into 2 writes */
				1462	} else {
				1463	iclog->ic_bwritecnt = 1;
				1464	}
				1465	XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
				1466	XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
				1467	XFS_BUF_BUSY(bp);
				1468	XFS_BUF_ASYNC(bp);
				1469	/*
				1470	* Do a disk write cache flush for the log block.
				1471	* This is a bit of a sledgehammer, it would be better
				1472	* to use a tag barrier here that just prevents reordering.
				1473	* It may not be needed to flush the first split block in the log wrap
				1474	* case, but do it anyways to be safe -AK
				1475	*/
				1476	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
				1477	XFS_BUF_FLUSH(bp);
				1478
				1479	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
				1480	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
				1481
				1482	xlog_verify_iclog(log, iclog, count, B_TRUE);
				1483
				1484	/* account for log which doesn't start at block #0 */
				1485	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
				1486	/*
				1487	* Don't call xfs_bwrite here. We do log-syncs even when the filesystem
				1488	* is shutting down.
				1489	*/
				1490	XFS_BUF_WRITE(bp);
				1491
				1492	if ((error = XFS_bwrite(bp))) {
				1493	xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
				1494	XFS_BUF_ADDR(bp));
				1495	return (error);
				1496	}
				1497	if (split) {
				1498	bp = iclog->ic_log->l_xbuf;
				1499	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
				1500	(unsigned long)1);
				1501	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
				1502	XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
				1503	XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
				1504	(__psint_t)count), split);
				1505	XFS_BUF_SET_FSPRIVATE(bp, iclog);
				1506	XFS_BUF_BUSY(bp);
				1507	XFS_BUF_ASYNC(bp);
				1508	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
				1509	XFS_BUF_FLUSH(bp);
				1510	dptr = XFS_BUF_PTR(bp);
				1511	/*
				1512	* Bump the cycle numbers at the start of each block
				1513	* since this part of the buffer is at the start of
				1514	* a new cycle. Watch out for the header magic number
				1515	* case, though.
				1516	*/
				1517	for (i=0; i<split; i += BBSIZE) {
				1518	INT_MOD((uint )dptr, ARCH_CONVERT, +1);
				1519	if (INT_GET((uint )dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
				1520	INT_MOD((uint )dptr, ARCH_CONVERT, +1);
				1521	dptr += BBSIZE;
				1522	}
				1523
				1524	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
				1525	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
				1526
				1527	/* account for internal log which does't start at block #0 */
				1528	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
				1529	XFS_BUF_WRITE(bp);
				1530	if ((error = XFS_bwrite(bp))) {
				1531	xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
				1532	bp, XFS_BUF_ADDR(bp));
				1533	return (error);
				1534	}
				1535	}
				1536	return (0);
				1537	} /* xlog_sync */
				1538
				1539
				1540	/*
				1541	* Unallocate a log structure
				1542	*/
				1543	void
				1544	xlog_unalloc_log(xlog_t *log)
				1545	{
				1546	xlog_in_core_t iclog, next_iclog;
				1547	xlog_ticket_t tic, next_tic;
				1548	int i;
				1549
				1550
				1551	iclog = log->l_iclog;
				1552	for (i=0; i<log->l_iclog_bufs; i++) {
				1553	sv_destroy(&iclog->ic_forcesema);
				1554	sv_destroy(&iclog->ic_writesema);
				1555	xfs_buf_free(iclog->ic_bp);
				1556	#ifdef XFS_LOG_TRACE
				1557	if (iclog->ic_trace != NULL) {
				1558	ktrace_free(iclog->ic_trace);
				1559	}
				1560	#endif
				1561	next_iclog = iclog->ic_next;
				1562	kmem_free(iclog->hic_data, log->l_iclog_size);
				1563	kmem_free(iclog, sizeof(xlog_in_core_t));
				1564	iclog = next_iclog;
				1565	}
				1566	freesema(&log->l_flushsema);
				1567	spinlock_destroy(&log->l_icloglock);
				1568	spinlock_destroy(&log->l_grant_lock);
				1569
				1570	/* XXXsup take a look at this again. */
				1571	if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
				1572	!XLOG_FORCED_SHUTDOWN(log)) {
				1573	xfs_fs_cmn_err(CE_WARN, log->l_mp,
				1574	"xlog_unalloc_log: (cnt: %d, total: %d)",
				1575	log->l_ticket_cnt, log->l_ticket_tcnt);
				1576	/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
				1577
				1578	} else {
				1579	tic = log->l_unmount_free;
				1580	while (tic) {
				1581	next_tic = tic->t_next;
				1582	kmem_free(tic, NBPP);
				1583	tic = next_tic;
				1584	}
				1585	}
				1586	xfs_buf_free(log->l_xbuf);
				1587	#ifdef XFS_LOG_TRACE
				1588	if (log->l_trace != NULL) {
				1589	ktrace_free(log->l_trace);
				1590	}
				1591	if (log->l_grant_trace != NULL) {
				1592	ktrace_free(log->l_grant_trace);
				1593	}
				1594	#endif
				1595	log->l_mp->m_log = NULL;
				1596	kmem_free(log, sizeof(xlog_t));
				1597	} /* xlog_unalloc_log */
				1598
				1599	/*
				1600	* Update counters atomically now that memcpy is done.
				1601	*/
				1602	/* ARGSUSED */
				1603	static inline void
				1604	xlog_state_finish_copy(xlog_t *log,
				1605	xlog_in_core_t *iclog,
				1606	int record_cnt,
				1607	int copy_bytes)
				1608	{
				1609	SPLDECL(s);
				1610
				1611	s = LOG_LOCK(log);
				1612
				1613	iclog->ic_header.h_num_logops += record_cnt;
				1614	iclog->ic_offset += copy_bytes;
				1615
				1616	LOG_UNLOCK(log, s);
				1617	} /* xlog_state_finish_copy */
				1618
				1619
				1620
				1621
				1622	/*
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1623	* print out info relating to regions written which consume
				1624	* the reservation
				1625	*/
				1626	#if defined(XFS_LOG_RES_DEBUG)
				1627	STATIC void
				1628	xlog_print_tic_res(xfs_mount_t mp, xlog_ticket_t ticket)
				1629	{
				1630	uint i;
				1631	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
				1632
				1633	/* match with XLOG_REG_TYPE_* in xfs_log.h */
				1634	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
				1635	"bformat",
				1636	"bchunk",
				1637	"efi_format",
				1638	"efd_format",
				1639	"iformat",
				1640	"icore",
				1641	"iext",
				1642	"ibroot",
				1643	"ilocal",
				1644	"iattr_ext",
				1645	"iattr_broot",
				1646	"iattr_local",
				1647	"qformat",
				1648	"dquot",
				1649	"quotaoff",
				1650	"LR header",
				1651	"unmount",
				1652	"commit",
				1653	"trans header"
				1654	};
				1655	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
				1656	"SETATTR_NOT_SIZE",
				1657	"SETATTR_SIZE",
				1658	"INACTIVE",
				1659	"CREATE",
				1660	"CREATE_TRUNC",
				1661	"TRUNCATE_FILE",
				1662	"REMOVE",
				1663	"LINK",
				1664	"RENAME",
				1665	"MKDIR",
				1666	"RMDIR",
				1667	"SYMLINK",
				1668	"SET_DMATTRS",
				1669	"GROWFS",
				1670	"STRAT_WRITE",
				1671	"DIOSTRAT",
				1672	"WRITE_SYNC",
				1673	"WRITEID",
				1674	"ADDAFORK",
				1675	"ATTRINVAL",
				1676	"ATRUNCATE",
				1677	"ATTR_SET",
				1678	"ATTR_RM",
				1679	"ATTR_FLAG",
				1680	"CLEAR_AGI_BUCKET",
				1681	"QM_SBCHANGE",
				1682	"DUMMY1",
				1683	"DUMMY2",
				1684	"QM_QUOTAOFF",
				1685	"QM_DQALLOC",
				1686	"QM_SETQLIM",
				1687	"QM_DQCLUSTER",
				1688	"QM_QINOCREATE",
				1689	"QM_QUOTAOFF_END",
				1690	"SB_UNIT",
				1691	"FSYNC_TS",
				1692	"GROWFSRT_ALLOC",
				1693	"GROWFSRT_ZERO",
				1694	"GROWFSRT_FREE",
				1695	"SWAPEXT"
				1696	};
				1697
				1698	xfs_fs_cmn_err(CE_WARN, mp,
				1699	"xfs_log_write: reservation summary:\n"
				1700	" trans type = %s (%u)\n"
				1701	" unit res = %d bytes\n"
				1702	" current res = %d bytes\n"
				1703	" total reg = %u bytes (o/flow = %u bytes)\n"
				1704	" ophdrs = %u (ophdr space = %u bytes)\n"
				1705	" ophdr + reg = %u bytes\n"
				1706	" num regions = %u\n",
				1707	((ticket->t_trans_type <= 0 \|\|
				1708	ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
				1709	"bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
				1710	ticket->t_trans_type,
				1711	ticket->t_unit_res,
				1712	ticket->t_curr_res,
				1713	ticket->t_res_arr_sum, ticket->t_res_o_flow,
				1714	ticket->t_res_num_ophdrs, ophdr_spc,
				1715	ticket->t_res_arr_sum +
				1716	ticket->t_res_o_flow + ophdr_spc,
				1717	ticket->t_res_num);
				1718
				1719	for (i = 0; i < ticket->t_res_num; i++) {
				1720	uint r_type = ticket->t_res_arr[i].r_type;
				1721	cmn_err(CE_WARN,
				1722	"region[%u]: %s - %u bytes\n",
				1723	i,
				1724	((r_type <= 0 \|\| r_type > XLOG_REG_TYPE_MAX) ?
				1725	"bad-rtype" : res_type_str[r_type-1]),
				1726	ticket->t_res_arr[i].r_len);
				1727	}
				1728	}
				1729	#else
				1730	#define xlog_print_tic_res(mp, ticket)
				1731	#endif
				1732
				1733	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1734	* Write some region out to in-core log
				1735	*
				1736	* This will be called when writing externally provided regions or when
				1737	* writing out a commit record for a given transaction.
				1738	*
				1739	* General algorithm:
				1740	* 1. Find total length of this write. This may include adding to the
				1741	* lengths passed in.
				1742	* 2. Check whether we violate the tickets reservation.
				1743	* 3. While writing to this iclog
				1744	* A. Reserve as much space in this iclog as can get
				1745	* B. If this is first write, save away start lsn
				1746	* C. While writing this region:
				1747	* 1. If first write of transaction, write start record
				1748	* 2. Write log operation header (header per region)
				1749	* 3. Find out if we can fit entire region into this iclog
				1750	* 4. Potentially, verify destination memcpy ptr
				1751	* 5. Memcpy (partial) region
				1752	* 6. If partial copy, release iclog; otherwise, continue
				1753	* copying more regions into current iclog
				1754	* 4. Mark want sync bit (in simulation mode)
				1755	* 5. Release iclog for potential flush to on-disk log.
				1756	*
				1757	* ERRORS:
				1758	* 1. Panic if reservation is overrun. This should never happen since
				1759	* reservation amounts are generated internal to the filesystem.
				1760	* NOTES:
				1761	* 1. Tickets are single threaded data structures.
				1762	* 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
				1763	* syncing routine. When a single log_write region needs to span
				1764	* multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
				1765	* on all log operation writes which don't contain the end of the
				1766	* region. The XLOG_END_TRANS bit is used for the in-core log
				1767	* operation which contains the end of the continued log_write region.
				1768	* 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
				1769	* we don't really know exactly how much space will be used. As a result,
				1770	* we don't update ic_offset until the end when we know exactly how many
				1771	* bytes have been written out.
				1772	*/
				1773	int
				1774	xlog_write(xfs_mount_t * mp,
				1775	xfs_log_iovec_t reg[],
				1776	int nentries,
				1777	xfs_log_ticket_t tic,
				1778	xfs_lsn_t *start_lsn,
				1779	xlog_in_core_t **commit_iclog,
				1780	uint flags)
				1781	{
				1782	xlog_t *log = mp->m_log;
				1783	xlog_ticket_t ticket = (xlog_ticket_t )tic;
				1784	xlog_op_header_t logop_head; / ptr to log operation header */
				1785	xlog_in_core_t iclog; / ptr to current in-core log */
				1786	__psint_t ptr; /* copy address into data region */
				1787	int len; /* # xlog_write() bytes 2 still copy */
				1788	int index; /* region index currently copying */
				1789	int log_offset; /* offset (from 0) into data region */
				1790	int start_rec_copy; /* # bytes to copy for start record */
				1791	int partial_copy; /* did we split a region? */
				1792	int partial_copy_len;/* # bytes copied if split region */
				1793	int need_copy; /* # bytes need to memcpy this region */
				1794	int copy_len; /* # bytes actually memcpy'ing */
				1795	int copy_off; /* # bytes from entry start */
				1796	int contwr; /* continued write of in-core log? */
				1797	int error;
				1798	int record_cnt = 0, data_cnt = 0;
				1799
				1800	partial_copy_len = partial_copy = 0;
				1801
				1802	/* Calculate potential maximum space. Each region gets its own
				1803	* xlog_op_header_t and may need to be double word aligned.
				1804	*/
				1805	len = 0;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1806	if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1807	len += sizeof(xlog_op_header_t);
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1808	XLOG_TIC_ADD_OPHDR(ticket);
				1809	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1810
				1811	for (index = 0; index < nentries; index++) {
				1812	len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1813	XLOG_TIC_ADD_OPHDR(ticket);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1814	len += reg[index].i_len;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1815	XLOG_TIC_ADD_REGION(ticket, reg[index].i_len, reg[index].i_type);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1816	}
				1817	contwr = *start_lsn = 0;
				1818
				1819	if (ticket->t_curr_res < len) {
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1820	xlog_print_tic_res(mp, ticket);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1821	#ifdef DEBUG
				1822	xlog_panic(
				1823	"xfs_log_write: reservation ran out. Need to up reservation");
				1824	#else
				1825	/* Customer configurable panic */
				1826	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
				1827	"xfs_log_write: reservation ran out. Need to up reservation");
				1828	/* If we did not panic, shutdown the filesystem */
				1829	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
				1830	#endif
				1831	} else
				1832	ticket->t_curr_res -= len;
				1833
				1834	for (index = 0; index < nentries; ) {
				1835	if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
				1836	&contwr, &log_offset)))
				1837	return (error);
				1838
				1839	ASSERT(log_offset <= iclog->ic_size - 1);
				1840	ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
				1841
				1842	/* start_lsn is the first lsn written to. That's all we need. */
				1843	if (! *start_lsn)
				1844	*start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
				1845
				1846	/* This loop writes out as many regions as can fit in the amount
				1847	* of space which was allocated by xlog_state_get_iclog_space().
				1848	*/
				1849	while (index < nentries) {
				1850	ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
				1851	ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
				1852	start_rec_copy = 0;
				1853
				1854	/* If first write for transaction, insert start record.
				1855	* We can't be trying to commit if we are inited. We can't
				1856	* have any "partial_copy" if we are inited.
				1857	*/
				1858	if (ticket->t_flags & XLOG_TIC_INITED) {
				1859	logop_head = (xlog_op_header_t *)ptr;
				1860	INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
				1861	logop_head->oh_clientid = ticket->t_clientid;
				1862	logop_head->oh_len = 0;
				1863	logop_head->oh_flags = XLOG_START_TRANS;
				1864	logop_head->oh_res2 = 0;
				1865	ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
				1866	record_cnt++;
				1867
				1868	start_rec_copy = sizeof(xlog_op_header_t);
				1869	xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
				1870	}
				1871
				1872	/* Copy log operation header directly into data section */
				1873	logop_head = (xlog_op_header_t *)ptr;
				1874	INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
				1875	logop_head->oh_clientid = ticket->t_clientid;
				1876	logop_head->oh_res2 = 0;
				1877
				1878	/* header copied directly */
				1879	xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
				1880
				1881	/* are we copying a commit or unmount record? */
				1882	logop_head->oh_flags = flags;
				1883
				1884	/*
				1885	* We've seen logs corrupted with bad transaction client
				1886	* ids. This makes sure that XFS doesn't generate them on.
				1887	* Turn this into an EIO and shut down the filesystem.
				1888	*/
				1889	switch (logop_head->oh_clientid) {
				1890	case XFS_TRANSACTION:
				1891	case XFS_VOLUME:
				1892	case XFS_LOG:
				1893	break;
				1894	default:
				1895	xfs_fs_cmn_err(CE_WARN, mp,
				1896	"Bad XFS transaction clientid 0x%x in ticket 0x%p",
				1897	logop_head->oh_clientid, tic);
				1898	return XFS_ERROR(EIO);
				1899	}
				1900
				1901	/* Partial write last time? => (partial_copy != 0)
				1902	* need_copy is the amount we'd like to copy if everything could
				1903	* fit in the current memcpy.
				1904	*/
				1905	need_copy = reg[index].i_len - partial_copy_len;
				1906
				1907	copy_off = partial_copy_len;
				1908	if (need_copy <= iclog->ic_size - log_offset) { /complete write /
				1909	INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy);
				1910	if (partial_copy)
				1911	logop_head->oh_flags\|= (XLOG_END_TRANS\|XLOG_WAS_CONT_TRANS);
				1912	partial_copy_len = partial_copy = 0;
				1913	} else { /* partial write */
				1914	copy_len = iclog->ic_size - log_offset;
				1915	INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len);
				1916	logop_head->oh_flags \|= XLOG_CONTINUE_TRANS;
				1917	if (partial_copy)
				1918	logop_head->oh_flags \|= XLOG_WAS_CONT_TRANS;
				1919	partial_copy_len += copy_len;
				1920	partial_copy++;
				1921	len += sizeof(xlog_op_header_t); /* from splitting of region */
				1922	/* account for new log op header */
				1923	ticket->t_curr_res -= sizeof(xlog_op_header_t);
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	1924	XLOG_TIC_ADD_OPHDR(ticket);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1925	}
				1926	xlog_verify_dest_ptr(log, ptr);
				1927
				1928	/* copy region */
				1929	ASSERT(copy_len >= 0);
				1930	memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
				1931	xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
				1932
				1933	/* make copy_len total bytes copied, including headers */
				1934	copy_len += start_rec_copy + sizeof(xlog_op_header_t);
				1935	record_cnt++;
				1936	data_cnt += contwr ? copy_len : 0;
				1937	if (partial_copy) { /* copied partial region */
				1938	/* already marked WANT_SYNC by xlog_state_get_iclog_space */
				1939	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
				1940	record_cnt = data_cnt = 0;
				1941	if ((error = xlog_state_release_iclog(log, iclog)))
				1942	return (error);
				1943	break; /* don't increment index */
				1944	} else { /* copied entire region */
				1945	index++;
				1946	partial_copy_len = partial_copy = 0;
				1947
				1948	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
				1949	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
				1950	record_cnt = data_cnt = 0;
				1951	xlog_state_want_sync(log, iclog);
				1952	if (commit_iclog) {
				1953	ASSERT(flags & XLOG_COMMIT_TRANS);
				1954	*commit_iclog = iclog;
				1955	} else if ((error = xlog_state_release_iclog(log, iclog)))
				1956	return (error);
				1957	if (index == nentries)
				1958	return 0; /* we are done */
				1959	else
				1960	break;
				1961	}
				1962	} /* if (partial_copy) */
				1963	} /* while (index < nentries) */
				1964	} /* for (index = 0; index < nentries; ) */
				1965	ASSERT(len == 0);
				1966
				1967	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
				1968	if (commit_iclog) {
				1969	ASSERT(flags & XLOG_COMMIT_TRANS);
				1970	*commit_iclog = iclog;
				1971	return 0;
				1972	}
				1973	return (xlog_state_release_iclog(log, iclog));
				1974	} /* xlog_write */
				1975
				1976
				1977	/*****************************************************************************
				1978	*
				1979	* State Machine functions
				1980	*
				1981	*****************************************************************************
				1982	*/
				1983
				1984	/* Clean iclogs starting from the head. This ordering must be
				1985	* maintained, so an iclog doesn't become ACTIVE beyond one that
				1986	* is SYNCING. This is also required to maintain the notion that we use
				1987	* a counting semaphore to hold off would be writers to the log when every
				1988	* iclog is trying to sync to disk.
				1989	*
				1990	* State Change: DIRTY -> ACTIVE
				1991	*/
Christoph Hellwig	ba0f32d	2005-06-21 15:36:52 +1000	[diff] [blame]	1992	STATIC void
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1993	xlog_state_clean_log(xlog_t *log)
				1994	{
				1995	xlog_in_core_t *iclog;
				1996	int changed = 0;
				1997
				1998	iclog = log->l_iclog;
				1999	do {
				2000	if (iclog->ic_state == XLOG_STATE_DIRTY) {
				2001	iclog->ic_state = XLOG_STATE_ACTIVE;
				2002	iclog->ic_offset = 0;
				2003	iclog->ic_callback = NULL; /* don't need to free */
				2004	/*
				2005	* If the number of ops in this iclog indicate it just
				2006	* contains the dummy transaction, we can
				2007	* change state into IDLE (the second time around).
				2008	* Otherwise we should change the state into
				2009	* NEED a dummy.
				2010	* We don't need to cover the dummy.
				2011	*/
				2012	if (!changed &&
				2013	(INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) {
				2014	changed = 1;
				2015	} else {
				2016	/*
				2017	* We have two dirty iclogs so start over
				2018	* This could also be num of ops indicates
				2019	* this is not the dummy going out.
				2020	*/
				2021	changed = 2;
				2022	}
				2023	iclog->ic_header.h_num_logops = 0;
				2024	memset(iclog->ic_header.h_cycle_data, 0,
				2025	sizeof(iclog->ic_header.h_cycle_data));
				2026	iclog->ic_header.h_lsn = 0;
				2027	} else if (iclog->ic_state == XLOG_STATE_ACTIVE)
				2028	/* do nothing */;
				2029	else
				2030	break; /* stop cleaning */
				2031	iclog = iclog->ic_next;
				2032	} while (iclog != log->l_iclog);
				2033
				2034	/* log is locked when we are called */
				2035	/*
				2036	* Change state for the dummy log recording.
				2037	* We usually go to NEED. But we go to NEED2 if the changed indicates
				2038	* we are done writing the dummy record.
				2039	* If we are done with the second dummy recored (DONE2), then
				2040	* we go to IDLE.
				2041	*/
				2042	if (changed) {
				2043	switch (log->l_covered_state) {
				2044	case XLOG_STATE_COVER_IDLE:
				2045	case XLOG_STATE_COVER_NEED:
				2046	case XLOG_STATE_COVER_NEED2:
				2047	log->l_covered_state = XLOG_STATE_COVER_NEED;
				2048	break;
				2049
				2050	case XLOG_STATE_COVER_DONE:
				2051	if (changed == 1)
				2052	log->l_covered_state = XLOG_STATE_COVER_NEED2;
				2053	else
				2054	log->l_covered_state = XLOG_STATE_COVER_NEED;
				2055	break;
				2056
				2057	case XLOG_STATE_COVER_DONE2:
				2058	if (changed == 1)
				2059	log->l_covered_state = XLOG_STATE_COVER_IDLE;
				2060	else
				2061	log->l_covered_state = XLOG_STATE_COVER_NEED;
				2062	break;
				2063
				2064	default:
				2065	ASSERT(0);
				2066	}
				2067	}
				2068	} /* xlog_state_clean_log */
				2069
				2070	STATIC xfs_lsn_t
				2071	xlog_get_lowest_lsn(
				2072	xlog_t *log)
				2073	{
				2074	xlog_in_core_t *lsn_log;
				2075	xfs_lsn_t lowest_lsn, lsn;
				2076
				2077	lsn_log = log->l_iclog;
				2078	lowest_lsn = 0;
				2079	do {
				2080	if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE\|XLOG_STATE_DIRTY))) {
				2081	lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT);
				2082	if ((lsn && !lowest_lsn) \|\|
				2083	(XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
				2084	lowest_lsn = lsn;
				2085	}
				2086	}
				2087	lsn_log = lsn_log->ic_next;
				2088	} while (lsn_log != log->l_iclog);
				2089	return(lowest_lsn);
				2090	}
				2091
				2092
				2093	STATIC void
				2094	xlog_state_do_callback(
				2095	xlog_t *log,
				2096	int aborted,
				2097	xlog_in_core_t *ciclog)
				2098	{
				2099	xlog_in_core_t *iclog;
				2100	xlog_in_core_t first_iclog; / used to know when we've
				2101	* processed all iclogs once */
				2102	xfs_log_callback_t cb, cb_next;
				2103	int flushcnt = 0;
				2104	xfs_lsn_t lowest_lsn;
				2105	int ioerrors; /* counter: iclogs with errors */
				2106	int loopdidcallbacks; /* flag: inner loop did callbacks*/
				2107	int funcdidcallbacks; /* flag: function did callbacks */
				2108	int repeats; /* for issuing console warnings if
				2109	* looping too many times */
				2110	SPLDECL(s);
				2111
				2112	s = LOG_LOCK(log);
				2113	first_iclog = iclog = log->l_iclog;
				2114	ioerrors = 0;
				2115	funcdidcallbacks = 0;
				2116	repeats = 0;
				2117
				2118	do {
				2119	/*
				2120	* Scan all iclogs starting with the one pointed to by the
				2121	* log. Reset this starting point each time the log is
				2122	* unlocked (during callbacks).
				2123	*
				2124	* Keep looping through iclogs until one full pass is made
				2125	* without running any callbacks.
				2126	*/
				2127	first_iclog = log->l_iclog;
				2128	iclog = log->l_iclog;
				2129	loopdidcallbacks = 0;
				2130	repeats++;
				2131
				2132	do {
				2133
				2134	/* skip all iclogs in the ACTIVE & DIRTY states */
				2135	if (iclog->ic_state &
				2136	(XLOG_STATE_ACTIVE\|XLOG_STATE_DIRTY)) {
				2137	iclog = iclog->ic_next;
				2138	continue;
				2139	}
				2140
				2141	/*
				2142	* Between marking a filesystem SHUTDOWN and stopping
				2143	* the log, we do flush all iclogs to disk (if there
				2144	* wasn't a log I/O error). So, we do want things to
				2145	* go smoothly in case of just a SHUTDOWN w/o a
				2146	* LOG_IO_ERROR.
				2147	*/
				2148	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
				2149	/*
				2150	* Can only perform callbacks in order. Since
				2151	* this iclog is not in the DONE_SYNC/
				2152	* DO_CALLBACK state, we skip the rest and
				2153	* just try to clean up. If we set our iclog
				2154	* to DO_CALLBACK, we will not process it when
				2155	* we retry since a previous iclog is in the
				2156	* CALLBACK and the state cannot change since
				2157	* we are holding the LOG_LOCK.
				2158	*/
				2159	if (!(iclog->ic_state &
				2160	(XLOG_STATE_DONE_SYNC \|
				2161	XLOG_STATE_DO_CALLBACK))) {
				2162	if (ciclog && (ciclog->ic_state ==
				2163	XLOG_STATE_DONE_SYNC)) {
				2164	ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
				2165	}
				2166	break;
				2167	}
				2168	/*
				2169	* We now have an iclog that is in either the
				2170	* DO_CALLBACK or DONE_SYNC states. The other
				2171	* states (WANT_SYNC, SYNCING, or CALLBACK were
				2172	* caught by the above if and are going to
				2173	* clean (i.e. we aren't doing their callbacks)
				2174	* see the above if.
				2175	*/
				2176
				2177	/*
				2178	* We will do one more check here to see if we
				2179	* have chased our tail around.
				2180	*/
				2181
				2182	lowest_lsn = xlog_get_lowest_lsn(log);
				2183	if (lowest_lsn && (
				2184	XFS_LSN_CMP(
				2185	lowest_lsn,
				2186	INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
				2187	)<0)) {
				2188	iclog = iclog->ic_next;
				2189	continue; /* Leave this iclog for
				2190	* another thread */
				2191	}
				2192
				2193	iclog->ic_state = XLOG_STATE_CALLBACK;
				2194
				2195	LOG_UNLOCK(log, s);
				2196
				2197	/* l_last_sync_lsn field protected by
				2198	* GRANT_LOCK. Don't worry about iclog's lsn.
				2199	* No one else can be here except us.
				2200	*/
				2201	s = GRANT_LOCK(log);
				2202	ASSERT(XFS_LSN_CMP(
				2203	log->l_last_sync_lsn,
				2204	INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
				2205	)<=0);
				2206	log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
				2207	GRANT_UNLOCK(log, s);
				2208
				2209	/*
				2210	* Keep processing entries in the callback list
				2211	* until we come around and it is empty. We
				2212	* need to atomically see that the list is
				2213	* empty and change the state to DIRTY so that
				2214	* we don't miss any more callbacks being added.
				2215	*/
				2216	s = LOG_LOCK(log);
				2217	} else {
				2218	ioerrors++;
				2219	}
				2220	cb = iclog->ic_callback;
				2221
				2222	while (cb != 0) {
				2223	iclog->ic_callback_tail = &(iclog->ic_callback);
				2224	iclog->ic_callback = NULL;
				2225	LOG_UNLOCK(log, s);
				2226
				2227	/* perform callbacks in the order given */
				2228	for (; cb != 0; cb = cb_next) {
				2229	cb_next = cb->cb_next;
				2230	cb->cb_func(cb->cb_arg, aborted);
				2231	}
				2232	s = LOG_LOCK(log);
				2233	cb = iclog->ic_callback;
				2234	}
				2235
				2236	loopdidcallbacks++;
				2237	funcdidcallbacks++;
				2238
				2239	ASSERT(iclog->ic_callback == 0);
				2240	if (!(iclog->ic_state & XLOG_STATE_IOERROR))
				2241	iclog->ic_state = XLOG_STATE_DIRTY;
				2242
				2243	/*
				2244	* Transition from DIRTY to ACTIVE if applicable.
				2245	* NOP if STATE_IOERROR.
				2246	*/
				2247	xlog_state_clean_log(log);
				2248
				2249	/* wake up threads waiting in xfs_log_force() */
				2250	sv_broadcast(&iclog->ic_forcesema);
				2251
				2252	iclog = iclog->ic_next;
				2253	} while (first_iclog != iclog);
				2254	if (repeats && (repeats % 10) == 0) {
				2255	xfs_fs_cmn_err(CE_WARN, log->l_mp,
				2256	"xlog_state_do_callback: looping %d", repeats);
				2257	}
				2258	} while (!ioerrors && loopdidcallbacks);
				2259
				2260	/*
				2261	* make one last gasp attempt to see if iclogs are being left in
				2262	* limbo..
				2263	*/
				2264	#ifdef DEBUG
				2265	if (funcdidcallbacks) {
				2266	first_iclog = iclog = log->l_iclog;
				2267	do {
				2268	ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
				2269	/*
				2270	* Terminate the loop if iclogs are found in states
				2271	* which will cause other threads to clean up iclogs.
				2272	*
				2273	* SYNCING - i/o completion will go through logs
				2274	* DONE_SYNC - interrupt thread should be waiting for
				2275	* LOG_LOCK
				2276	* IOERROR - give up hope all ye who enter here
				2277	*/
				2278	if (iclog->ic_state == XLOG_STATE_WANT_SYNC \|\|
				2279	iclog->ic_state == XLOG_STATE_SYNCING \|\|
				2280	iclog->ic_state == XLOG_STATE_DONE_SYNC \|\|
				2281	iclog->ic_state == XLOG_STATE_IOERROR )
				2282	break;
				2283	iclog = iclog->ic_next;
				2284	} while (first_iclog != iclog);
				2285	}
				2286	#endif
				2287
				2288	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE\|XLOG_STATE_IOERROR)) {
				2289	flushcnt = log->l_flushcnt;
				2290	log->l_flushcnt = 0;
				2291	}
				2292	LOG_UNLOCK(log, s);
				2293	while (flushcnt--)
				2294	vsema(&log->l_flushsema);
				2295	} /* xlog_state_do_callback */
				2296
				2297
				2298	/*
				2299	* Finish transitioning this iclog to the dirty state.
				2300	*
				2301	* Make sure that we completely execute this routine only when this is
				2302	* the last call to the iclog. There is a good chance that iclog flushes,
				2303	* when we reach the end of the physical log, get turned into 2 separate
				2304	* calls to bwrite. Hence, one iclog flush could generate two calls to this
				2305	* routine. By using the reference count bwritecnt, we guarantee that only
				2306	* the second completion goes through.
				2307	*
				2308	* Callbacks could take time, so they are done outside the scope of the
				2309	* global state machine log lock. Assume that the calls to cvsema won't
				2310	* take a long time. At least we know it won't sleep.
				2311	*/
				2312	void
				2313	xlog_state_done_syncing(
				2314	xlog_in_core_t *iclog,
				2315	int aborted)
				2316	{
				2317	xlog_t *log = iclog->ic_log;
				2318	SPLDECL(s);
				2319
				2320	s = LOG_LOCK(log);
				2321
				2322	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING \|\|
				2323	iclog->ic_state == XLOG_STATE_IOERROR);
				2324	ASSERT(iclog->ic_refcnt == 0);
				2325	ASSERT(iclog->ic_bwritecnt == 1 \|\| iclog->ic_bwritecnt == 2);
				2326
				2327
				2328	/*
				2329	* If we got an error, either on the first buffer, or in the case of
				2330	* split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
				2331	* and none should ever be attempted to be written to disk
				2332	* again.
				2333	*/
				2334	if (iclog->ic_state != XLOG_STATE_IOERROR) {
				2335	if (--iclog->ic_bwritecnt == 1) {
				2336	LOG_UNLOCK(log, s);
				2337	return;
				2338	}
				2339	iclog->ic_state = XLOG_STATE_DONE_SYNC;
				2340	}
				2341
				2342	/*
				2343	* Someone could be sleeping prior to writing out the next
				2344	* iclog buffer, we wake them all, one will get to do the
				2345	* I/O, the others get to wait for the result.
				2346	*/
				2347	sv_broadcast(&iclog->ic_writesema);
				2348	LOG_UNLOCK(log, s);
				2349	xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
				2350	} /* xlog_state_done_syncing */
				2351
				2352
				2353	/*
				2354	* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
				2355	* sleep. The flush semaphore is set to the number of in-core buffers and
				2356	* decremented around disk syncing. Therefore, if all buffers are syncing,
				2357	* this semaphore will cause new writes to sleep until a sync completes.
				2358	* Otherwise, this code just does p() followed by v(). This approximates
				2359	* a sleep/wakeup except we can't race.
				2360	*
				2361	* The in-core logs are used in a circular fashion. They are not used
				2362	* out-of-order even when an iclog past the head is free.
				2363	*
				2364	* return:
				2365	* * log_offset where xlog_write() can start writing into the in-core
				2366	* log's data space.
				2367	* * in-core log pointer to which xlog_write() should write.
				2368	* * boolean indicating this is a continued write to an in-core log.
				2369	* If this is the last write, then the in-core log's offset field
				2370	* needs to be incremented, depending on the amount of data which
				2371	* is copied.
				2372	*/
				2373	int
				2374	xlog_state_get_iclog_space(xlog_t *log,
				2375	int len,
				2376	xlog_in_core_t **iclogp,
				2377	xlog_ticket_t *ticket,
				2378	int *continued_write,
				2379	int *logoffsetp)
				2380	{
				2381	SPLDECL(s);
				2382	int log_offset;
				2383	xlog_rec_header_t *head;
				2384	xlog_in_core_t *iclog;
				2385	int error;
				2386
				2387	restart:
				2388	s = LOG_LOCK(log);
				2389	if (XLOG_FORCED_SHUTDOWN(log)) {
				2390	LOG_UNLOCK(log, s);
				2391	return XFS_ERROR(EIO);
				2392	}
				2393
				2394	iclog = log->l_iclog;
				2395	if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
				2396	log->l_flushcnt++;
				2397	LOG_UNLOCK(log, s);
				2398	xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
				2399	XFS_STATS_INC(xs_log_noiclogs);
				2400	/* Ensure that log writes happen */
				2401	psema(&log->l_flushsema, PINOD);
				2402	goto restart;
				2403	}
				2404	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
				2405	head = &iclog->ic_header;
				2406
				2407	iclog->ic_refcnt++; /* prevents sync */
				2408	log_offset = iclog->ic_offset;
				2409
				2410	/* On the 1st write to an iclog, figure out lsn. This works
				2411	* if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
				2412	* committing to. If the offset is set, that's how many blocks
				2413	* must be written.
				2414	*/
				2415	if (log_offset == 0) {
				2416	ticket->t_curr_res -= log->l_iclog_hsize;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	2417	XLOG_TIC_ADD_REGION(ticket,
				2418	log->l_iclog_hsize,
				2419	XLOG_REG_TYPE_LRHEADER);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2420	INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
				2421	ASSIGN_LSN(head->h_lsn, log);
				2422	ASSERT(log->l_curr_block >= 0);
				2423	}
				2424
				2425	/* If there is enough room to write everything, then do it. Otherwise,
				2426	* claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
				2427	* bit is on, so this will get flushed out. Don't update ic_offset
				2428	* until you know exactly how many bytes get copied. Therefore, wait
				2429	* until later to update ic_offset.
				2430	*
				2431	* xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
				2432	* can fit into remaining data section.
				2433	*/
				2434	if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
				2435	xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
				2436
				2437	/* If I'm the only one writing to this iclog, sync it to disk */
				2438	if (iclog->ic_refcnt == 1) {
				2439	LOG_UNLOCK(log, s);
				2440	if ((error = xlog_state_release_iclog(log, iclog)))
				2441	return (error);
				2442	} else {
				2443	iclog->ic_refcnt--;
				2444	LOG_UNLOCK(log, s);
				2445	}
				2446	goto restart;
				2447	}
				2448
				2449	/* Do we have enough room to write the full amount in the remainder
				2450	* of this iclog? Or must we continue a write on the next iclog and
				2451	* mark this iclog as completely taken? In the case where we switch
				2452	* iclogs (to mark it taken), this particular iclog will release/sync
				2453	* to disk in xlog_write().
				2454	*/
				2455	if (len <= iclog->ic_size - iclog->ic_offset) {
				2456	*continued_write = 0;
				2457	iclog->ic_offset += len;
				2458	} else {
				2459	*continued_write = 1;
				2460	xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
				2461	}
				2462	*iclogp = iclog;
				2463
				2464	ASSERT(iclog->ic_offset <= iclog->ic_size);
				2465	LOG_UNLOCK(log, s);
				2466
				2467	*logoffsetp = log_offset;
				2468	return 0;
				2469	} /* xlog_state_get_iclog_space */
				2470
				2471	/*
				2472	* Atomically get the log space required for a log ticket.
				2473	*
				2474	* Once a ticket gets put onto the reserveq, it will only return after
				2475	* the needed reservation is satisfied.
				2476	*/
				2477	STATIC int
				2478	xlog_grant_log_space(xlog_t *log,
				2479	xlog_ticket_t *tic)
				2480	{
				2481	int free_bytes;
				2482	int need_bytes;
				2483	SPLDECL(s);
				2484	#ifdef DEBUG
				2485	xfs_lsn_t tail_lsn;
				2486	#endif
				2487
				2488
				2489	#ifdef DEBUG
				2490	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				2491	panic("grant Recovery problem");
				2492	#endif
				2493
				2494	/* Is there space or do we need to sleep? */
				2495	s = GRANT_LOCK(log);
				2496	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
				2497
				2498	/* something is already sleeping; insert new transaction at end */
				2499	if (log->l_reserve_headq) {
				2500	XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
				2501	xlog_trace_loggrant(log, tic,
				2502	"xlog_grant_log_space: sleep 1");
				2503	/*
				2504	* Gotta check this before going to sleep, while we're
				2505	* holding the grant lock.
				2506	*/
				2507	if (XLOG_FORCED_SHUTDOWN(log))
				2508	goto error_return;
				2509
				2510	XFS_STATS_INC(xs_sleep_logspace);
				2511	sv_wait(&tic->t_sema, PINOD\|PLTWAIT, &log->l_grant_lock, s);
				2512	/*
				2513	* If we got an error, and the filesystem is shutting down,
				2514	* we'll catch it down below. So just continue...
				2515	*/
				2516	xlog_trace_loggrant(log, tic,
				2517	"xlog_grant_log_space: wake 1");
				2518	s = GRANT_LOCK(log);
				2519	}
				2520	if (tic->t_flags & XFS_LOG_PERM_RESERV)
				2521	need_bytes = tic->t_unit_res*tic->t_ocnt;
				2522	else
				2523	need_bytes = tic->t_unit_res;
				2524
				2525	redo:
				2526	if (XLOG_FORCED_SHUTDOWN(log))
				2527	goto error_return;
				2528
				2529	free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
				2530	log->l_grant_reserve_bytes);
				2531	if (free_bytes < need_bytes) {
				2532	if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
				2533	XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
				2534	xlog_trace_loggrant(log, tic,
				2535	"xlog_grant_log_space: sleep 2");
				2536	XFS_STATS_INC(xs_sleep_logspace);
				2537	sv_wait(&tic->t_sema, PINOD\|PLTWAIT, &log->l_grant_lock, s);
				2538
				2539	if (XLOG_FORCED_SHUTDOWN(log)) {
				2540	s = GRANT_LOCK(log);
				2541	goto error_return;
				2542	}
				2543
				2544	xlog_trace_loggrant(log, tic,
				2545	"xlog_grant_log_space: wake 2");
				2546	xlog_grant_push_ail(log->l_mp, need_bytes);
				2547	s = GRANT_LOCK(log);
				2548	goto redo;
				2549	} else if (tic->t_flags & XLOG_TIC_IN_Q)
				2550	XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
				2551
				2552	/* we've got enough space */
				2553	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w');
				2554	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
				2555	#ifdef DEBUG
				2556	tail_lsn = log->l_tail_lsn;
				2557	/*
				2558	* Check to make sure the grant write head didn't just over lap the
				2559	* tail. If the cycles are the same, we can't be overlapping.
				2560	* Otherwise, make sure that the cycles differ by exactly one and
				2561	* check the byte count.
				2562	*/
				2563	if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
				2564	ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
				2565	ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
				2566	}
				2567	#endif
				2568	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
				2569	xlog_verify_grant_head(log, 1);
				2570	GRANT_UNLOCK(log, s);
				2571	return 0;
				2572
				2573	error_return:
				2574	if (tic->t_flags & XLOG_TIC_IN_Q)
				2575	XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
				2576	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
				2577	/*
				2578	* If we are failing, make sure the ticket doesn't have any
				2579	* current reservations. We don't want to add this back when
				2580	* the ticket/transaction gets cancelled.
				2581	*/
				2582	tic->t_curr_res = 0;
				2583	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
				2584	GRANT_UNLOCK(log, s);
				2585	return XFS_ERROR(EIO);
				2586	} /* xlog_grant_log_space */
				2587
				2588
				2589	/*
				2590	* Replenish the byte reservation required by moving the grant write head.
				2591	*
				2592	*
				2593	*/
				2594	STATIC int
				2595	xlog_regrant_write_log_space(xlog_t *log,
				2596	xlog_ticket_t *tic)
				2597	{
				2598	SPLDECL(s);
				2599	int free_bytes, need_bytes;
				2600	xlog_ticket_t *ntic;
				2601	#ifdef DEBUG
				2602	xfs_lsn_t tail_lsn;
				2603	#endif
				2604
				2605	tic->t_curr_res = tic->t_unit_res;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	2606	XLOG_TIC_RESET_RES(tic);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2607
				2608	if (tic->t_cnt > 0)
				2609	return (0);
				2610
				2611	#ifdef DEBUG
				2612	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				2613	panic("regrant Recovery problem");
				2614	#endif
				2615
				2616	s = GRANT_LOCK(log);
				2617	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
				2618
				2619	if (XLOG_FORCED_SHUTDOWN(log))
				2620	goto error_return;
				2621
				2622	/* If there are other waiters on the queue then give them a
				2623	* chance at logspace before us. Wake up the first waiters,
				2624	* if we do not wake up all the waiters then go to sleep waiting
				2625	* for more free space, otherwise try to get some space for
				2626	* this transaction.
				2627	*/
				2628
				2629	if ((ntic = log->l_write_headq)) {
				2630	free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
				2631	log->l_grant_write_bytes);
				2632	do {
				2633	ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
				2634
				2635	if (free_bytes < ntic->t_unit_res)
				2636	break;
				2637	free_bytes -= ntic->t_unit_res;
				2638	sv_signal(&ntic->t_sema);
				2639	ntic = ntic->t_next;
				2640	} while (ntic != log->l_write_headq);
				2641
				2642	if (ntic != log->l_write_headq) {
				2643	if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
				2644	XLOG_INS_TICKETQ(log->l_write_headq, tic);
				2645
				2646	xlog_trace_loggrant(log, tic,
				2647	"xlog_regrant_write_log_space: sleep 1");
				2648	XFS_STATS_INC(xs_sleep_logspace);
				2649	sv_wait(&tic->t_sema, PINOD\|PLTWAIT,
				2650	&log->l_grant_lock, s);
				2651
				2652	/* If we're shutting down, this tic is already
				2653	* off the queue */
				2654	if (XLOG_FORCED_SHUTDOWN(log)) {
				2655	s = GRANT_LOCK(log);
				2656	goto error_return;
				2657	}
				2658
				2659	xlog_trace_loggrant(log, tic,
				2660	"xlog_regrant_write_log_space: wake 1");
				2661	xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
				2662	s = GRANT_LOCK(log);
				2663	}
				2664	}
				2665
				2666	need_bytes = tic->t_unit_res;
				2667
				2668	redo:
				2669	if (XLOG_FORCED_SHUTDOWN(log))
				2670	goto error_return;
				2671
				2672	free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
				2673	log->l_grant_write_bytes);
				2674	if (free_bytes < need_bytes) {
				2675	if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
				2676	XLOG_INS_TICKETQ(log->l_write_headq, tic);
				2677	XFS_STATS_INC(xs_sleep_logspace);
				2678	sv_wait(&tic->t_sema, PINOD\|PLTWAIT, &log->l_grant_lock, s);
				2679
				2680	/* If we're shutting down, this tic is already off the queue */
				2681	if (XLOG_FORCED_SHUTDOWN(log)) {
				2682	s = GRANT_LOCK(log);
				2683	goto error_return;
				2684	}
				2685
				2686	xlog_trace_loggrant(log, tic,
				2687	"xlog_regrant_write_log_space: wake 2");
				2688	xlog_grant_push_ail(log->l_mp, need_bytes);
				2689	s = GRANT_LOCK(log);
				2690	goto redo;
				2691	} else if (tic->t_flags & XLOG_TIC_IN_Q)
				2692	XLOG_DEL_TICKETQ(log->l_write_headq, tic);
				2693
				2694	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */
				2695	#ifdef DEBUG
				2696	tail_lsn = log->l_tail_lsn;
				2697	if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
				2698	ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
				2699	ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
				2700	}
				2701	#endif
				2702
				2703	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
				2704	xlog_verify_grant_head(log, 1);
				2705	GRANT_UNLOCK(log, s);
				2706	return (0);
				2707
				2708
				2709	error_return:
				2710	if (tic->t_flags & XLOG_TIC_IN_Q)
				2711	XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
				2712	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
				2713	/*
				2714	* If we are failing, make sure the ticket doesn't have any
				2715	* current reservations. We don't want to add this back when
				2716	* the ticket/transaction gets cancelled.
				2717	*/
				2718	tic->t_curr_res = 0;
				2719	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
				2720	GRANT_UNLOCK(log, s);
				2721	return XFS_ERROR(EIO);
				2722	} /* xlog_regrant_write_log_space */
				2723
				2724
				2725	/* The first cnt-1 times through here we don't need to
				2726	* move the grant write head because the permanent
				2727	* reservation has reserved cnt times the unit amount.
				2728	* Release part of current permanent unit reservation and
				2729	* reset current reservation to be one units worth. Also
				2730	* move grant reservation head forward.
				2731	*/
				2732	STATIC void
				2733	xlog_regrant_reserve_log_space(xlog_t *log,
				2734	xlog_ticket_t *ticket)
				2735	{
				2736	SPLDECL(s);
				2737
				2738	xlog_trace_loggrant(log, ticket,
				2739	"xlog_regrant_reserve_log_space: enter");
				2740	if (ticket->t_cnt > 0)
				2741	ticket->t_cnt--;
				2742
				2743	s = GRANT_LOCK(log);
				2744	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
				2745	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
				2746	ticket->t_curr_res = ticket->t_unit_res;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	2747	XLOG_TIC_RESET_RES(ticket);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2748	xlog_trace_loggrant(log, ticket,
				2749	"xlog_regrant_reserve_log_space: sub current res");
				2750	xlog_verify_grant_head(log, 1);
				2751
				2752	/* just return if we still have some of the pre-reserved space */
				2753	if (ticket->t_cnt > 0) {
				2754	GRANT_UNLOCK(log, s);
				2755	return;
				2756	}
				2757
				2758	XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r');
				2759	xlog_trace_loggrant(log, ticket,
				2760	"xlog_regrant_reserve_log_space: exit");
				2761	xlog_verify_grant_head(log, 0);
				2762	GRANT_UNLOCK(log, s);
				2763	ticket->t_curr_res = ticket->t_unit_res;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	2764	XLOG_TIC_RESET_RES(ticket);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2765	} /* xlog_regrant_reserve_log_space */
				2766
				2767
				2768	/*
				2769	* Give back the space left from a reservation.
				2770	*
				2771	* All the information we need to make a correct determination of space left
				2772	* is present. For non-permanent reservations, things are quite easy. The
				2773	* count should have been decremented to zero. We only need to deal with the
				2774	* space remaining in the current reservation part of the ticket. If the
				2775	* ticket contains a permanent reservation, there may be left over space which
				2776	* needs to be released. A count of N means that N-1 refills of the current
				2777	* reservation can be done before we need to ask for more space. The first
				2778	* one goes to fill up the first current reservation. Once we run out of
				2779	* space, the count will stay at zero and the only space remaining will be
				2780	* in the current reservation field.
				2781	*/
				2782	STATIC void
				2783	xlog_ungrant_log_space(xlog_t *log,
				2784	xlog_ticket_t *ticket)
				2785	{
				2786	SPLDECL(s);
				2787
				2788	if (ticket->t_cnt > 0)
				2789	ticket->t_cnt--;
				2790
				2791	s = GRANT_LOCK(log);
				2792	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
				2793
				2794	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
				2795	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
				2796
				2797	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
				2798
				2799	/* If this is a permanent reservation ticket, we may be able to free
				2800	* up more space based on the remaining count.
				2801	*/
				2802	if (ticket->t_cnt > 0) {
				2803	ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
				2804	XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w');
				2805	XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
				2806	}
				2807
				2808	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
				2809	xlog_verify_grant_head(log, 1);
				2810	GRANT_UNLOCK(log, s);
				2811	xfs_log_move_tail(log->l_mp, 1);
				2812	} /* xlog_ungrant_log_space */
				2813
				2814
				2815	/*
				2816	* Atomically put back used ticket.
				2817	*/
				2818	void
				2819	xlog_state_put_ticket(xlog_t *log,
				2820	xlog_ticket_t *tic)
				2821	{
				2822	unsigned long s;
				2823
				2824	s = LOG_LOCK(log);
				2825	xlog_ticket_put(log, tic);
				2826	LOG_UNLOCK(log, s);
				2827	} /* xlog_state_put_ticket */
				2828
				2829	/*
				2830	* Flush iclog to disk if this is the last reference to the given iclog and
				2831	* the WANT_SYNC bit is set.
				2832	*
				2833	* When this function is entered, the iclog is not necessarily in the
				2834	* WANT_SYNC state. It may be sitting around waiting to get filled.
				2835	*
				2836	*
				2837	*/
				2838	int
				2839	xlog_state_release_iclog(xlog_t *log,
				2840	xlog_in_core_t *iclog)
				2841	{
				2842	SPLDECL(s);
				2843	int sync = 0; /* do we sync? */
				2844
				2845	xlog_assign_tail_lsn(log->l_mp);
				2846
				2847	s = LOG_LOCK(log);
				2848
				2849	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				2850	LOG_UNLOCK(log, s);
				2851	return XFS_ERROR(EIO);
				2852	}
				2853
				2854	ASSERT(iclog->ic_refcnt > 0);
				2855	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				2856	iclog->ic_state == XLOG_STATE_WANT_SYNC);
				2857
				2858	if (--iclog->ic_refcnt == 0 &&
				2859	iclog->ic_state == XLOG_STATE_WANT_SYNC) {
				2860	sync++;
				2861	iclog->ic_state = XLOG_STATE_SYNCING;
				2862	INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn);
				2863	xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
				2864	/* cycle incremented when incrementing curr_block */
				2865	}
				2866
				2867	LOG_UNLOCK(log, s);
				2868
				2869	/*
				2870	* We let the log lock go, so it's possible that we hit a log I/O
				2871	* error or someother SHUTDOWN condition that marks the iclog
				2872	* as XLOG_STATE_IOERROR before the bwrite. However, we know that
				2873	* this iclog has consistent data, so we ignore IOERROR
				2874	* flags after this point.
				2875	*/
				2876	if (sync) {
				2877	return xlog_sync(log, iclog);
				2878	}
				2879	return (0);
				2880
				2881	} /* xlog_state_release_iclog */
				2882
				2883
				2884	/*
				2885	* This routine will mark the current iclog in the ring as WANT_SYNC
				2886	* and move the current iclog pointer to the next iclog in the ring.
				2887	* When this routine is called from xlog_state_get_iclog_space(), the
				2888	* exact size of the iclog has not yet been determined. All we know is
				2889	* that every data block. We have run out of space in this log record.
				2890	*/
				2891	STATIC void
				2892	xlog_state_switch_iclogs(xlog_t *log,
				2893	xlog_in_core_t *iclog,
				2894	int eventual_size)
				2895	{
				2896	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
				2897	if (!eventual_size)
				2898	eventual_size = iclog->ic_offset;
				2899	iclog->ic_state = XLOG_STATE_WANT_SYNC;
				2900	INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block);
				2901	log->l_prev_block = log->l_curr_block;
				2902	log->l_prev_cycle = log->l_curr_cycle;
				2903
				2904	/* roll log?: ic_offset changed later */
				2905	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
				2906
				2907	/* Round up to next log-sunit */
				2908	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
				2909	log->l_mp->m_sb.sb_logsunit > 1) {
				2910	__uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
				2911	log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
				2912	}
				2913
				2914	if (log->l_curr_block >= log->l_logBBsize) {
				2915	log->l_curr_cycle++;
				2916	if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
				2917	log->l_curr_cycle++;
				2918	log->l_curr_block -= log->l_logBBsize;
				2919	ASSERT(log->l_curr_block >= 0);
				2920	}
				2921	ASSERT(iclog == log->l_iclog);
				2922	log->l_iclog = iclog->ic_next;
				2923	} /* xlog_state_switch_iclogs */
				2924
				2925
				2926	/*
				2927	* Write out all data in the in-core log as of this exact moment in time.
				2928	*
				2929	* Data may be written to the in-core log during this call. However,
				2930	* we don't guarantee this data will be written out. A change from past
				2931	* implementation means this routine will not write out zero length LRs.
				2932	*
				2933	* Basically, we try and perform an intelligent scan of the in-core logs.
				2934	* If we determine there is no flushable data, we just return. There is no
				2935	* flushable data if:
				2936	*
				2937	* 1. the current iclog is active and has no data; the previous iclog
				2938	* is in the active or dirty state.
				2939	* 2. the current iclog is dirty, and the previous iclog is in the
				2940	* active or dirty state.
				2941	*
				2942	* We may sleep (call psema) if:
				2943	*
				2944	* 1. the current iclog is not in the active nor dirty state.
				2945	* 2. the current iclog dirty, and the previous iclog is not in the
				2946	* active nor dirty state.
				2947	* 3. the current iclog is active, and there is another thread writing
				2948	* to this particular iclog.
				2949	* 4. a) the current iclog is active and has no other writers
				2950	* b) when we return from flushing out this iclog, it is still
				2951	* not in the active nor dirty state.
				2952	*/
				2953	STATIC int
				2954	xlog_state_sync_all(xlog_t *log, uint flags)
				2955	{
				2956	xlog_in_core_t *iclog;
				2957	xfs_lsn_t lsn;
				2958	SPLDECL(s);
				2959
				2960	s = LOG_LOCK(log);
				2961
				2962	iclog = log->l_iclog;
				2963	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				2964	LOG_UNLOCK(log, s);
				2965	return XFS_ERROR(EIO);
				2966	}
				2967
				2968	/* If the head iclog is not active nor dirty, we just attach
				2969	* ourselves to the head and go to sleep.
				2970	*/
				2971	if (iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				2972	iclog->ic_state == XLOG_STATE_DIRTY) {
				2973	/*
				2974	* If the head is dirty or (active and empty), then
				2975	* we need to look at the previous iclog. If the previous
				2976	* iclog is active or dirty we are done. There is nothing
				2977	* to sync out. Otherwise, we attach ourselves to the
				2978	* previous iclog and go to sleep.
				2979	*/
				2980	if (iclog->ic_state == XLOG_STATE_DIRTY \|\|
				2981	(iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
				2982	iclog = iclog->ic_prev;
				2983	if (iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				2984	iclog->ic_state == XLOG_STATE_DIRTY)
				2985	goto no_sleep;
				2986	else
				2987	goto maybe_sleep;
				2988	} else {
				2989	if (iclog->ic_refcnt == 0) {
				2990	/* We are the only one with access to this
				2991	* iclog. Flush it out now. There should
				2992	* be a roundoff of zero to show that someone
				2993	* has already taken care of the roundoff from
				2994	* the previous sync.
				2995	*/
				2996	iclog->ic_refcnt++;
				2997	lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
				2998	xlog_state_switch_iclogs(log, iclog, 0);
				2999	LOG_UNLOCK(log, s);
				3000
				3001	if (xlog_state_release_iclog(log, iclog))
				3002	return XFS_ERROR(EIO);
				3003	s = LOG_LOCK(log);
				3004	if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
				3005	iclog->ic_state != XLOG_STATE_DIRTY)
				3006	goto maybe_sleep;
				3007	else
				3008	goto no_sleep;
				3009	} else {
				3010	/* Someone else is writing to this iclog.
				3011	* Use its call to flush out the data. However,
				3012	* the other thread may not force out this LR,
				3013	* so we mark it WANT_SYNC.
				3014	*/
				3015	xlog_state_switch_iclogs(log, iclog, 0);
				3016	goto maybe_sleep;
				3017	}
				3018	}
				3019	}
				3020
				3021	/* By the time we come around again, the iclog could've been filled
				3022	* which would give it another lsn. If we have a new lsn, just
				3023	* return because the relevant data has been flushed.
				3024	*/
				3025	maybe_sleep:
				3026	if (flags & XFS_LOG_SYNC) {
				3027	/*
				3028	* We must check if we're shutting down here, before
				3029	* we wait, while we're holding the LOG_LOCK.
				3030	* Then we check again after waking up, in case our
				3031	* sleep was disturbed by a bad news.
				3032	*/
				3033	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				3034	LOG_UNLOCK(log, s);
				3035	return XFS_ERROR(EIO);
				3036	}
				3037	XFS_STATS_INC(xs_log_force_sleep);
				3038	sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
				3039	/*
				3040	* No need to grab the log lock here since we're
				3041	* only deciding whether or not to return EIO
				3042	* and the memory read should be atomic.
				3043	*/
				3044	if (iclog->ic_state & XLOG_STATE_IOERROR)
				3045	return XFS_ERROR(EIO);
				3046
				3047	} else {
				3048
				3049	no_sleep:
				3050	LOG_UNLOCK(log, s);
				3051	}
				3052	return 0;
				3053	} /* xlog_state_sync_all */
				3054
				3055
				3056	/*
				3057	* Used by code which implements synchronous log forces.
				3058	*
				3059	* Find in-core log with lsn.
				3060	* If it is in the DIRTY state, just return.
				3061	* If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
				3062	* state and go to sleep or return.
				3063	* If it is in any other state, go to sleep or return.
				3064	*
				3065	* If filesystem activity goes to zero, the iclog will get flushed only by
				3066	* bdflush().
				3067	*/
				3068	int
				3069	xlog_state_sync(xlog_t *log,
				3070	xfs_lsn_t lsn,
				3071	uint flags)
				3072	{
				3073	xlog_in_core_t *iclog;
				3074	int already_slept = 0;
				3075	SPLDECL(s);
				3076
				3077
				3078	try_again:
				3079	s = LOG_LOCK(log);
				3080	iclog = log->l_iclog;
				3081
				3082	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				3083	LOG_UNLOCK(log, s);
				3084	return XFS_ERROR(EIO);
				3085	}
				3086
				3087	do {
				3088	if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
				3089	iclog = iclog->ic_next;
				3090	continue;
				3091	}
				3092
				3093	if (iclog->ic_state == XLOG_STATE_DIRTY) {
				3094	LOG_UNLOCK(log, s);
				3095	return 0;
				3096	}
				3097
				3098	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
				3099	/*
				3100	* We sleep here if we haven't already slept (e.g.
				3101	* this is the first time we've looked at the correct
				3102	* iclog buf) and the buffer before us is going to
				3103	* be sync'ed. The reason for this is that if we
				3104	* are doing sync transactions here, by waiting for
				3105	* the previous I/O to complete, we can allow a few
				3106	* more transactions into this iclog before we close
				3107	* it down.
				3108	*
				3109	* Otherwise, we mark the buffer WANT_SYNC, and bump
				3110	* up the refcnt so we can release the log (which drops
				3111	* the ref count). The state switch keeps new transaction
				3112	* commits from using this buffer. When the current commits
				3113	* finish writing into the buffer, the refcount will drop to
				3114	* zero and the buffer will go out then.
				3115	*/
				3116	if (!already_slept &&
				3117	(iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC \|
				3118	XLOG_STATE_SYNCING))) {
				3119	ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
				3120	XFS_STATS_INC(xs_log_force_sleep);
				3121	sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
				3122	&log->l_icloglock, s);
				3123	already_slept = 1;
				3124	goto try_again;
				3125	} else {
				3126	iclog->ic_refcnt++;
				3127	xlog_state_switch_iclogs(log, iclog, 0);
				3128	LOG_UNLOCK(log, s);
				3129	if (xlog_state_release_iclog(log, iclog))
				3130	return XFS_ERROR(EIO);
				3131	s = LOG_LOCK(log);
				3132	}
				3133	}
				3134
				3135	if ((flags & XFS_LOG_SYNC) && /* sleep */
				3136	!(iclog->ic_state & (XLOG_STATE_ACTIVE \| XLOG_STATE_DIRTY))) {
				3137
				3138	/*
				3139	* Don't wait on the forcesema if we know that we've
				3140	* gotten a log write error.
				3141	*/
				3142	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				3143	LOG_UNLOCK(log, s);
				3144	return XFS_ERROR(EIO);
				3145	}
				3146	XFS_STATS_INC(xs_log_force_sleep);
				3147	sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
				3148	/*
				3149	* No need to grab the log lock here since we're
				3150	* only deciding whether or not to return EIO
				3151	* and the memory read should be atomic.
				3152	*/
				3153	if (iclog->ic_state & XLOG_STATE_IOERROR)
				3154	return XFS_ERROR(EIO);
				3155	} else { /* just return */
				3156	LOG_UNLOCK(log, s);
				3157	}
				3158	return 0;
				3159
				3160	} while (iclog != log->l_iclog);
				3161
				3162	LOG_UNLOCK(log, s);
				3163	return (0);
				3164	} /* xlog_state_sync */
				3165
				3166
				3167	/*
				3168	* Called when we want to mark the current iclog as being ready to sync to
				3169	* disk.
				3170	*/
				3171	void
				3172	xlog_state_want_sync(xlog_t log, xlog_in_core_t iclog)
				3173	{
				3174	SPLDECL(s);
				3175
				3176	s = LOG_LOCK(log);
				3177
				3178	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
				3179	xlog_state_switch_iclogs(log, iclog, 0);
				3180	} else {
				3181	ASSERT(iclog->ic_state &
				3182	(XLOG_STATE_WANT_SYNC\|XLOG_STATE_IOERROR));
				3183	}
				3184
				3185	LOG_UNLOCK(log, s);
				3186	} /* xlog_state_want_sync */
				3187
				3188
				3189
				3190	/*****************************************************************************
				3191	*
				3192	* TICKET functions
				3193	*
				3194	*****************************************************************************
				3195	*/
				3196
				3197	/*
				3198	* Algorithm doesn't take into account page size. ;-(
				3199	*/
				3200	STATIC void
				3201	xlog_state_ticket_alloc(xlog_t *log)
				3202	{
				3203	xlog_ticket_t *t_list;
				3204	xlog_ticket_t *next;
				3205	xfs_caddr_t buf;
				3206	uint i = (NBPP / sizeof(xlog_ticket_t)) - 2;
				3207	SPLDECL(s);
				3208
				3209	/*
				3210	* The kmem_zalloc may sleep, so we shouldn't be holding the
				3211	* global lock. XXXmiken: may want to use zone allocator.
				3212	*/
				3213	buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP);
				3214
				3215	s = LOG_LOCK(log);
				3216
				3217	/* Attach 1st ticket to Q, so we can keep track of allocated memory */
				3218	t_list = (xlog_ticket_t *)buf;
				3219	t_list->t_next = log->l_unmount_free;
				3220	log->l_unmount_free = t_list++;
				3221	log->l_ticket_cnt++;
				3222	log->l_ticket_tcnt++;
				3223
				3224	/* Next ticket becomes first ticket attached to ticket free list */
				3225	if (log->l_freelist != NULL) {
				3226	ASSERT(log->l_tail != NULL);
				3227	log->l_tail->t_next = t_list;
				3228	} else {
				3229	log->l_freelist = t_list;
				3230	}
				3231	log->l_ticket_cnt++;
				3232	log->l_ticket_tcnt++;
				3233
				3234	/* Cycle through rest of alloc'ed memory, building up free Q */
				3235	for ( ; i > 0; i--) {
				3236	next = t_list + 1;
				3237	t_list->t_next = next;
				3238	t_list = next;
				3239	log->l_ticket_cnt++;
				3240	log->l_ticket_tcnt++;
				3241	}
				3242	t_list->t_next = NULL;
				3243	log->l_tail = t_list;
				3244	LOG_UNLOCK(log, s);
				3245	} /* xlog_state_ticket_alloc */
				3246
				3247
				3248	/*
				3249	* Put ticket into free list
				3250	*
				3251	* Assumption: log lock is held around this call.
				3252	*/
				3253	STATIC void
				3254	xlog_ticket_put(xlog_t *log,
				3255	xlog_ticket_t *ticket)
				3256	{
				3257	sv_destroy(&ticket->t_sema);
				3258
				3259	/*
				3260	* Don't think caching will make that much difference. It's
				3261	* more important to make debug easier.
				3262	*/
				3263	#if 0
				3264	/* real code will want to use LIFO for caching */
				3265	ticket->t_next = log->l_freelist;
				3266	log->l_freelist = ticket;
				3267	/* no need to clear fields */
				3268	#else
				3269	/* When we debug, it is easier if tickets are cycled */
				3270	ticket->t_next = NULL;
				3271	if (log->l_tail != 0) {
				3272	log->l_tail->t_next = ticket;
				3273	} else {
				3274	ASSERT(log->l_freelist == 0);
				3275	log->l_freelist = ticket;
				3276	}
				3277	log->l_tail = ticket;
				3278	#endif /* DEBUG */
				3279	log->l_ticket_cnt++;
				3280	} /* xlog_ticket_put */
				3281
				3282
				3283	/*
				3284	* Grab ticket off freelist or allocation some more
				3285	*/
				3286	xlog_ticket_t *
				3287	xlog_ticket_get(xlog_t *log,
				3288	int unit_bytes,
				3289	int cnt,
				3290	char client,
				3291	uint xflags)
				3292	{
				3293	xlog_ticket_t *tic;
				3294	uint num_headers;
				3295	SPLDECL(s);
				3296
				3297	alloc:
				3298	if (log->l_freelist == NULL)
				3299	xlog_state_ticket_alloc(log); /* potentially sleep */
				3300
				3301	s = LOG_LOCK(log);
				3302	if (log->l_freelist == NULL) {
				3303	LOG_UNLOCK(log, s);
				3304	goto alloc;
				3305	}
				3306	tic = log->l_freelist;
				3307	log->l_freelist = tic->t_next;
				3308	if (log->l_freelist == NULL)
				3309	log->l_tail = NULL;
				3310	log->l_ticket_cnt--;
				3311	LOG_UNLOCK(log, s);
				3312
				3313	/*
				3314	* Permanent reservations have up to 'cnt'-1 active log operations
				3315	* in the log. A unit in this case is the amount of space for one
				3316	* of these log operations. Normal reservations have a cnt of 1
				3317	* and their unit amount is the total amount of space required.
				3318	*
				3319	* The following lines of code account for non-transaction data
Tim Shimmin	32fb9b5	2005-09-02 16:41:43 +1000	[diff] [blame]	3320	* which occupy space in the on-disk log.
				3321	*
				3322	* Normal form of a transaction is:
				3323	* <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
				3324	* and then there are LR hdrs, split-recs and roundoff at end of syncs.
				3325	*
				3326	* We need to account for all the leadup data and trailer data
				3327	* around the transaction data.
				3328	* And then we need to account for the worst case in terms of using
				3329	* more space.
				3330	* The worst case will happen if:
				3331	* - the placement of the transaction happens to be such that the
				3332	* roundoff is at its maximum
				3333	* - the transaction data is synced before the commit record is synced
				3334	* i.e. <transaction-data><roundoff> \| <commit-rec><roundoff>
				3335	* Therefore the commit record is in its own Log Record.
				3336	* This can happen as the commit record is called with its
				3337	* own region to xlog_write().
				3338	* This then means that in the worst case, roundoff can happen for
				3339	* the commit-rec as well.
				3340	* The commit-rec is smaller than padding in this scenario and so it is
				3341	* not added separately.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3342	*/
				3343
Tim Shimmin	32fb9b5	2005-09-02 16:41:43 +1000	[diff] [blame]	3344	/* for trans header */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3345	unit_bytes += sizeof(xlog_op_header_t);
Tim Shimmin	32fb9b5	2005-09-02 16:41:43 +1000	[diff] [blame]	3346	unit_bytes += sizeof(xfs_trans_header_t);
				3347
				3348	/* for start-rec */
				3349	unit_bytes += sizeof(xlog_op_header_t);
				3350
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3351	/* for LR headers */
				3352	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
				3353	unit_bytes += log->l_iclog_hsize * num_headers;
				3354
Tim Shimmin	32fb9b5	2005-09-02 16:41:43 +1000	[diff] [blame]	3355	/* for commit-rec LR header - note: padding will subsume the ophdr */
				3356	unit_bytes += log->l_iclog_hsize;
				3357
				3358	/* for split-recs - ophdrs added when data split over LRs */
				3359	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
				3360
				3361	/* for roundoff padding for transaction data and one for commit record */
				3362	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
				3363	log->l_mp->m_sb.sb_logsunit > 1) {
				3364	/* log su roundoff */
				3365	unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
				3366	} else {
				3367	/* BB roundoff */
				3368	unit_bytes += 2*BBSIZE;
				3369	}
				3370
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3371	tic->t_unit_res = unit_bytes;
				3372	tic->t_curr_res = unit_bytes;
				3373	tic->t_cnt = cnt;
				3374	tic->t_ocnt = cnt;
				3375	tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff);
				3376	tic->t_clientid = client;
				3377	tic->t_flags = XLOG_TIC_INITED;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	3378	tic->t_trans_type = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3379	if (xflags & XFS_LOG_PERM_RESERV)
				3380	tic->t_flags \|= XLOG_TIC_PERM_RESERV;
				3381	sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
				3382
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	3383	XLOG_TIC_RESET_RES(tic);
				3384
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3385	return tic;
				3386	} /* xlog_ticket_get */
				3387
				3388
				3389	/******************************************************************************
				3390	*
				3391	* Log debug routines
				3392	*
				3393	******************************************************************************
				3394	*/
				3395	#if defined(DEBUG) && !defined(XLOG_NOLOG)
				3396	/*
				3397	* Make sure that the destination ptr is within the valid data region of
				3398	* one of the iclogs. This uses backup pointers stored in a different
				3399	* part of the log in case we trash the log structure.
				3400	*/
				3401	void
				3402	xlog_verify_dest_ptr(xlog_t *log,
				3403	__psint_t ptr)
				3404	{
				3405	int i;
				3406	int good_ptr = 0;
				3407
				3408	for (i=0; i < log->l_iclog_bufs; i++) {
				3409	if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
				3410	ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
				3411	good_ptr++;
				3412	}
				3413	if (! good_ptr)
				3414	xlog_panic("xlog_verify_dest_ptr: invalid ptr");
				3415	} /* xlog_verify_dest_ptr */
				3416
				3417	STATIC void
				3418	xlog_verify_grant_head(xlog_t *log, int equals)
				3419	{
				3420	if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
				3421	if (equals)
				3422	ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
				3423	else
				3424	ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
				3425	} else {
				3426	ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
				3427	ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
				3428	}
				3429	} /* xlog_verify_grant_head */
				3430
				3431	/* check if it will fit */
				3432	STATIC void
				3433	xlog_verify_tail_lsn(xlog_t *log,
				3434	xlog_in_core_t *iclog,
				3435	xfs_lsn_t tail_lsn)
				3436	{
				3437	int blocks;
				3438
				3439	if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
				3440	blocks =
				3441	log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
				3442	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
				3443	xlog_panic("xlog_verify_tail_lsn: ran out of log space");
				3444	} else {
				3445	ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
				3446
				3447	if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
				3448	xlog_panic("xlog_verify_tail_lsn: tail wrapped");
				3449
				3450	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
				3451	if (blocks < BTOBB(iclog->ic_offset) + 1)
				3452	xlog_panic("xlog_verify_tail_lsn: ran out of log space");
				3453	}
				3454	} /* xlog_verify_tail_lsn */
				3455
				3456	/*
				3457	* Perform a number of checks on the iclog before writing to disk.
				3458	*
				3459	* 1. Make sure the iclogs are still circular
				3460	* 2. Make sure we have a good magic number
				3461	* 3. Make sure we don't have magic numbers in the data
				3462	* 4. Check fields of each log operation header for:
				3463	* A. Valid client identifier
				3464	* B. tid ptr value falls in valid ptr space (user space code)
				3465	* C. Length in log record header is correct according to the
				3466	* individual operation headers within record.
				3467	* 5. When a bwrite will occur within 5 blocks of the front of the physical
				3468	* log, check the preceding blocks of the physical log to make sure all
				3469	* the cycle numbers agree with the current cycle number.
				3470	*/
				3471	STATIC void
				3472	xlog_verify_iclog(xlog_t *log,
				3473	xlog_in_core_t *iclog,
				3474	int count,
				3475	boolean_t syncing)
				3476	{
				3477	xlog_op_header_t *ophead;
				3478	xlog_in_core_t *icptr;
				3479	xlog_in_core_2_t *xhdr;
				3480	xfs_caddr_t ptr;
				3481	xfs_caddr_t base_ptr;
				3482	__psint_t field_offset;
				3483	__uint8_t clientid;
				3484	int len, i, j, k, op_len;
				3485	int idx;
				3486	SPLDECL(s);
				3487
				3488	/* check validity of iclog pointers */
				3489	s = LOG_LOCK(log);
				3490	icptr = log->l_iclog;
				3491	for (i=0; i < log->l_iclog_bufs; i++) {
				3492	if (icptr == 0)
				3493	xlog_panic("xlog_verify_iclog: invalid ptr");
				3494	icptr = icptr->ic_next;
				3495	}
				3496	if (icptr != log->l_iclog)
				3497	xlog_panic("xlog_verify_iclog: corrupt iclog ring");
				3498	LOG_UNLOCK(log, s);
				3499
				3500	/* check log magic numbers */
				3501	ptr = (xfs_caddr_t) &(iclog->ic_header);
				3502	if (INT_GET((uint )ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM)
				3503	xlog_panic("xlog_verify_iclog: invalid magic num");
				3504
				3505	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count;
				3506	ptr += BBSIZE) {
				3507	if (INT_GET((uint )ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
				3508	xlog_panic("xlog_verify_iclog: unexpected magic num");
				3509	}
				3510
				3511	/* check fields */
				3512	len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT);
				3513	ptr = iclog->ic_datap;
				3514	base_ptr = ptr;
				3515	ophead = (xlog_op_header_t *)ptr;
				3516	xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
				3517	for (i = 0; i < len; i++) {
				3518	ophead = (xlog_op_header_t *)ptr;
				3519
				3520	/* clientid is only 1 byte */
				3521	field_offset = (__psint_t)
				3522	((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
				3523	if (syncing == B_FALSE \|\| (field_offset & 0x1ff)) {
				3524	clientid = ophead->oh_clientid;
				3525	} else {
				3526	idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
				3527	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				3528	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3529	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3530	clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
				3531	} else {
				3532	clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
				3533	}
				3534	}
				3535	if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
Christoph Hellwig	da1650a	2005-11-02 10:21:35 +1100	[diff] [blame^]	3536	cmn_err(CE_WARN, "xlog_verify_iclog: "
				3537	"invalid clientid %d op 0x%p offset 0x%lx",
				3538	clientid, ophead, (unsigned long)field_offset);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3539
				3540	/* check length */
				3541	field_offset = (__psint_t)
				3542	((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
				3543	if (syncing == B_FALSE \|\| (field_offset & 0x1ff)) {
				3544	op_len = INT_GET(ophead->oh_len, ARCH_CONVERT);
				3545	} else {
				3546	idx = BTOBBT((__psint_t)&ophead->oh_len -
				3547	(__psint_t)iclog->ic_datap);
				3548	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				3549	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3550	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3551	op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
				3552	} else {
				3553	op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
				3554	}
				3555	}
				3556	ptr += sizeof(xlog_op_header_t) + op_len;
				3557	}
				3558	} /* xlog_verify_iclog */
				3559	#endif /* DEBUG && !XLOG_NOLOG */
				3560
				3561	/*
				3562	* Mark all iclogs IOERROR. LOG_LOCK is held by the caller.
				3563	*/
				3564	STATIC int
				3565	xlog_state_ioerror(
				3566	xlog_t *log)
				3567	{
				3568	xlog_in_core_t iclog, ic;
				3569
				3570	iclog = log->l_iclog;
				3571	if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
				3572	/*
				3573	* Mark all the incore logs IOERROR.
				3574	* From now on, no log flushes will result.
				3575	*/
				3576	ic = iclog;
				3577	do {
				3578	ic->ic_state = XLOG_STATE_IOERROR;
				3579	ic = ic->ic_next;
				3580	} while (ic != iclog);
				3581	return (0);
				3582	}
				3583	/*
				3584	* Return non-zero, if state transition has already happened.
				3585	*/
				3586	return (1);
				3587	}
				3588
				3589	/*
				3590	* This is called from xfs_force_shutdown, when we're forcibly
				3591	* shutting down the filesystem, typically because of an IO error.
				3592	* Our main objectives here are to make sure that:
				3593	* a. the filesystem gets marked 'SHUTDOWN' for all interested
				3594	* parties to find out, 'atomically'.
				3595	* b. those who're sleeping on log reservations, pinned objects and
				3596	* other resources get woken up, and be told the bad news.
				3597	* c. nothing new gets queued up after (a) and (b) are done.
				3598	* d. if !logerror, flush the iclogs to disk, then seal them off
				3599	* for business.
				3600	*/
				3601	int
				3602	xfs_log_force_umount(
				3603	struct xfs_mount *mp,
				3604	int logerror)
				3605	{
				3606	xlog_ticket_t *tic;
				3607	xlog_t *log;
				3608	int retval;
				3609	SPLDECL(s);
				3610	SPLDECL(s2);
				3611
				3612	log = mp->m_log;
				3613
				3614	/*
				3615	* If this happens during log recovery, don't worry about
				3616	* locking; the log isn't open for business yet.
				3617	*/
				3618	if (!log \|\|
				3619	log->l_flags & XLOG_ACTIVE_RECOVERY) {
				3620	mp->m_flags \|= XFS_MOUNT_FS_SHUTDOWN;
				3621	XFS_BUF_DONE(mp->m_sb_bp);
				3622	return (0);
				3623	}
				3624
				3625	/*
				3626	* Somebody could've already done the hard work for us.
				3627	* No need to get locks for this.
				3628	*/
				3629	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
				3630	ASSERT(XLOG_FORCED_SHUTDOWN(log));
				3631	return (1);
				3632	}
				3633	retval = 0;
				3634	/*
				3635	* We must hold both the GRANT lock and the LOG lock,
				3636	* before we mark the filesystem SHUTDOWN and wake
				3637	* everybody up to tell the bad news.
				3638	*/
				3639	s = GRANT_LOCK(log);
				3640	s2 = LOG_LOCK(log);
				3641	mp->m_flags \|= XFS_MOUNT_FS_SHUTDOWN;
				3642	XFS_BUF_DONE(mp->m_sb_bp);
				3643	/*
				3644	* This flag is sort of redundant because of the mount flag, but
				3645	* it's good to maintain the separation between the log and the rest
				3646	* of XFS.
				3647	*/
				3648	log->l_flags \|= XLOG_IO_ERROR;
				3649
				3650	/*
				3651	* If we hit a log error, we want to mark all the iclogs IOERROR
				3652	* while we're still holding the loglock.
				3653	*/
				3654	if (logerror)
				3655	retval = xlog_state_ioerror(log);
				3656	LOG_UNLOCK(log, s2);
				3657
				3658	/*
				3659	* We don't want anybody waiting for log reservations
				3660	* after this. That means we have to wake up everybody
				3661	* queued up on reserve_headq as well as write_headq.
				3662	* In addition, we make sure in xlog_{re}grant_log_space
				3663	* that we don't enqueue anything once the SHUTDOWN flag
				3664	* is set, and this action is protected by the GRANTLOCK.
				3665	*/
				3666	if ((tic = log->l_reserve_headq)) {
				3667	do {
				3668	sv_signal(&tic->t_sema);
				3669	tic = tic->t_next;
				3670	} while (tic != log->l_reserve_headq);
				3671	}
				3672
				3673	if ((tic = log->l_write_headq)) {
				3674	do {
				3675	sv_signal(&tic->t_sema);
				3676	tic = tic->t_next;
				3677	} while (tic != log->l_write_headq);
				3678	}
				3679	GRANT_UNLOCK(log, s);
				3680
				3681	if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
				3682	ASSERT(!logerror);
				3683	/*
				3684	* Force the incore logs to disk before shutting the
				3685	* log down completely.
				3686	*/
				3687	xlog_state_sync_all(log, XFS_LOG_FORCE\|XFS_LOG_SYNC);
				3688	s2 = LOG_LOCK(log);
				3689	retval = xlog_state_ioerror(log);
				3690	LOG_UNLOCK(log, s2);
				3691	}
				3692	/*
				3693	* Wake up everybody waiting on xfs_log_force.
				3694	* Callback all log item committed functions as if the
				3695	* log writes were completed.
				3696	*/
				3697	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
				3698
				3699	#ifdef XFSERRORDEBUG
				3700	{
				3701	xlog_in_core_t *iclog;
				3702
				3703	s = LOG_LOCK(log);
				3704	iclog = log->l_iclog;
				3705	do {
				3706	ASSERT(iclog->ic_callback == 0);
				3707	iclog = iclog->ic_next;
				3708	} while (iclog != log->l_iclog);
				3709	LOG_UNLOCK(log, s);
				3710	}
				3711	#endif
				3712	/* return non-zero if log IOERROR transition had already happened */
				3713	return (retval);
				3714	}
				3715
Christoph Hellwig	ba0f32d	2005-06-21 15:36:52 +1000	[diff] [blame]	3716	STATIC int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3717	xlog_iclogs_empty(xlog_t *log)
				3718	{
				3719	xlog_in_core_t *iclog;
				3720
				3721	iclog = log->l_iclog;
				3722	do {
				3723	/* endianness does not matter here, zero is zero in
				3724	* any language.
				3725	*/
				3726	if (iclog->ic_header.h_num_logops)
				3727	return(0);
				3728	iclog = iclog->ic_next;
				3729	} while (iclog != log->l_iclog);
				3730	return(1);
				3731	}