Blame - fs/xfs/xfs_log.c - fp2-dev/kernel/msm

blob: 092d5fb096b13e8c5fa9c65bfe02693a3870592c [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32
				33	/*
				34	* High level interface routines for log manager
				35	*/
				36
				37	#include "xfs.h"
				38	#include "xfs_macros.h"
				39	#include "xfs_types.h"
				40	#include "xfs_inum.h"
				41	#include "xfs_ag.h"
				42	#include "xfs_sb.h"
				43	#include "xfs_log.h"
				44	#include "xfs_trans.h"
				45	#include "xfs_dir.h"
				46	#include "xfs_dmapi.h"
				47	#include "xfs_mount.h"
				48	#include "xfs_error.h"
				49	#include "xfs_log_priv.h"
				50	#include "xfs_buf_item.h"
				51	#include "xfs_alloc_btree.h"
				52	#include "xfs_log_recover.h"
				53	#include "xfs_bit.h"
				54	#include "xfs_rw.h"
				55	#include "xfs_trans_priv.h"
				56
				57
				58	#define xlog_write_adv_cnt(ptr, len, off, bytes) \
				59	{ (ptr) += (bytes); \
				60	(len) -= (bytes); \
				61	(off) += (bytes);}
				62
				63	/* Local miscellaneous function prototypes */
				64	STATIC int xlog_bdstrat_cb(struct xfs_buf *);
				65	STATIC int xlog_commit_record(xfs_mount_t mp, xlog_ticket_t ticket,
				66	xlog_in_core_t *, xfs_lsn_t );
				67	STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
				68	xfs_buftarg_t *log_target,
				69	xfs_daddr_t blk_offset,
				70	int num_bblks);
				71	STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
				72	STATIC int xlog_sync(xlog_t log, xlog_in_core_t iclog);
				73	STATIC void xlog_unalloc_log(xlog_t *log);
				74	STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
				75	int nentries, xfs_log_ticket_t tic,
				76	xfs_lsn_t *start_lsn,
				77	xlog_in_core_t **commit_iclog,
				78	uint flags);
				79
				80	/* local state machine functions */
				81	STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
				82	STATIC void xlog_state_do_callback(xlog_t log,int aborted, xlog_in_core_t iclog);
				83	STATIC int xlog_state_get_iclog_space(xlog_t *log,
				84	int len,
				85	xlog_in_core_t **iclog,
				86	xlog_ticket_t *ticket,
				87	int *continued_write,
				88	int *logoffsetp);
				89	STATIC void xlog_state_put_ticket(xlog_t *log,
				90	xlog_ticket_t *tic);
				91	STATIC int xlog_state_release_iclog(xlog_t *log,
				92	xlog_in_core_t *iclog);
				93	STATIC void xlog_state_switch_iclogs(xlog_t *log,
				94	xlog_in_core_t *iclog,
				95	int eventual_size);
				96	STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
				97	STATIC int xlog_state_sync_all(xlog_t *log, uint flags);
				98	STATIC void xlog_state_want_sync(xlog_t log, xlog_in_core_t iclog);
				99
				100	/* local functions to manipulate grant head */
				101	STATIC int xlog_grant_log_space(xlog_t *log,
				102	xlog_ticket_t *xtic);
				103	STATIC void xlog_grant_push_ail(xfs_mount_t *mp,
				104	int need_bytes);
				105	STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
				106	xlog_ticket_t *ticket);
				107	STATIC int xlog_regrant_write_log_space(xlog_t *log,
				108	xlog_ticket_t *ticket);
				109	STATIC void xlog_ungrant_log_space(xlog_t *log,
				110	xlog_ticket_t *ticket);
				111
				112
				113	/* local ticket functions */
				114	STATIC void xlog_state_ticket_alloc(xlog_t *log);
				115	STATIC xlog_ticket_t xlog_ticket_get(xlog_t log,
				116	int unit_bytes,
				117	int count,
				118	char clientid,
				119	uint flags);
				120	STATIC void xlog_ticket_put(xlog_t log, xlog_ticket_t ticket);
				121
				122	/* local debug functions */
				123	#if defined(DEBUG) && !defined(XLOG_NOLOG)
				124	STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
				125	STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
				126	STATIC void xlog_verify_iclog(xlog_t log, xlog_in_core_t iclog,
				127	int count, boolean_t syncing);
				128	STATIC void xlog_verify_tail_lsn(xlog_t log, xlog_in_core_t iclog,
				129	xfs_lsn_t tail_lsn);
				130	#else
				131	#define xlog_verify_dest_ptr(a,b)
				132	#define xlog_verify_grant_head(a,b)
				133	#define xlog_verify_iclog(a,b,c,d)
				134	#define xlog_verify_tail_lsn(a,b,c)
				135	#endif
				136
				137	int xlog_iclogs_empty(xlog_t *log);
				138
				139	#ifdef DEBUG
				140	int xlog_do_error = 0;
				141	int xlog_req_num = 0;
				142	int xlog_error_mod = 33;
				143	#endif
				144
				145	#define XLOG_FORCED_SHUTDOWN(log) (log->l_flags & XLOG_IO_ERROR)
				146
				147	/*
				148	* 0 => disable log manager
				149	* 1 => enable log manager
				150	* 2 => enable log manager and log debugging
				151	*/
				152	#if defined(XLOG_NOLOG) \|\| defined(DEBUG)
				153	int xlog_debug = 1;
				154	xfs_buftarg_t *xlog_target;
				155	#endif
				156
				157	#if defined(XFS_LOG_TRACE)
				158
				159	void
				160	xlog_trace_loggrant(xlog_t log, xlog_ticket_t tic, xfs_caddr_t string)
				161	{
				162	if (! log->l_grant_trace) {
				163	log->l_grant_trace = ktrace_alloc(1024, KM_NOSLEEP);
				164	if (! log->l_grant_trace)
				165	return;
				166	}
				167
				168	ktrace_enter(log->l_grant_trace,
				169	(void *)tic,
				170	(void *)log->l_reserve_headq,
				171	(void *)log->l_write_headq,
				172	(void *)((unsigned long)log->l_grant_reserve_cycle),
				173	(void *)((unsigned long)log->l_grant_reserve_bytes),
				174	(void *)((unsigned long)log->l_grant_write_cycle),
				175	(void *)((unsigned long)log->l_grant_write_bytes),
				176	(void *)((unsigned long)log->l_curr_cycle),
				177	(void *)((unsigned long)log->l_curr_block),
				178	(void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
				179	(void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
				180	(void *)string,
				181	(void *)((unsigned long)13),
				182	(void *)((unsigned long)14),
				183	(void *)((unsigned long)15),
				184	(void *)((unsigned long)16));
				185	}
				186
				187	void
				188	xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
				189	{
				190	pid_t pid;
				191
				192	pid = current_pid();
				193
				194	if (!iclog->ic_trace)
				195	iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
				196	ktrace_enter(iclog->ic_trace,
				197	(void *)((unsigned long)state),
				198	(void *)((unsigned long)pid),
				199	(void *)0,
				200	(void *)0,
				201	(void *)0,
				202	(void *)0,
				203	(void *)0,
				204	(void *)0,
				205	(void *)0,
				206	(void *)0,
				207	(void *)0,
				208	(void *)0,
				209	(void *)0,
				210	(void *)0,
				211	(void *)0,
				212	(void *)0);
				213	}
				214
				215	#else
				216	#define xlog_trace_loggrant(log,tic,string)
				217	#define xlog_trace_iclog(iclog,state)
				218	#endif /* XFS_LOG_TRACE */
				219
				220	/*
				221	* NOTES:
				222	*
				223	* 1. The currblock field gets updated at startup and after in-core logs
				224	* are marked with WANT_SYNC.
				225	*/
				226
				227	/*
				228	* This routine is called when a user of a log manager ticket is done with
				229	* the reservation. If the ticket was ever used, then a commit record for
				230	* the associated transaction is written out as a log operation header with
				231	* no data. The flag XLOG_TIC_INITED is set when the first write occurs with
				232	* a given ticket. If the ticket was one with a permanent reservation, then
				233	* a few operations are done differently. Permanent reservation tickets by
				234	* default don't release the reservation. They just commit the current
				235	* transaction with the belief that the reservation is still needed. A flag
				236	* must be passed in before permanent reservations are actually released.
				237	* When these type of tickets are not released, they need to be set into
				238	* the inited state again. By doing this, a start record will be written
				239	* out when the next write occurs.
				240	*/
				241	xfs_lsn_t
				242	xfs_log_done(xfs_mount_t *mp,
				243	xfs_log_ticket_t xtic,
				244	void **iclog,
				245	uint flags)
				246	{
				247	xlog_t *log = mp->m_log;
				248	xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic;
				249	xfs_lsn_t lsn = 0;
				250
				251	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				252	if (!xlog_debug && xlog_target == log->l_targ)
				253	return 0;
				254	#endif
				255
				256	if (XLOG_FORCED_SHUTDOWN(log) \|\|
				257	/*
				258	* If nothing was ever written, don't write out commit record.
				259	* If we get an error, just continue and give back the log ticket.
				260	*/
				261	(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
				262	(xlog_commit_record(mp, ticket,
				263	(xlog_in_core_t **)iclog, &lsn)))) {
				264	lsn = (xfs_lsn_t) -1;
				265	if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
				266	flags \|= XFS_LOG_REL_PERM_RESERV;
				267	}
				268	}
				269
				270
				271	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 \|\|
				272	(flags & XFS_LOG_REL_PERM_RESERV)) {
				273	/*
				274	* Release ticket if not permanent reservation or a specifc
				275	* request has been made to release a permanent reservation.
				276	*/
				277	xlog_ungrant_log_space(log, ticket);
				278	xlog_state_put_ticket(log, ticket);
				279	} else {
				280	xlog_regrant_reserve_log_space(log, ticket);
				281	}
				282
				283	/* If this ticket was a permanent reservation and we aren't
				284	* trying to release it, reset the inited flags; so next time
				285	* we write, a start record will be written out.
				286	*/
				287	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
				288	(flags & XFS_LOG_REL_PERM_RESERV) == 0)
				289	ticket->t_flags \|= XLOG_TIC_INITED;
				290
				291	return lsn;
				292	} /* xfs_log_done */
				293
				294
				295	/*
				296	* Force the in-core log to disk. If flags == XFS_LOG_SYNC,
				297	* the force is done synchronously.
				298	*
				299	* Asynchronous forces are implemented by setting the WANT_SYNC
				300	* bit in the appropriate in-core log and then returning.
				301	*
				302	* Synchronous forces are implemented with a semaphore. All callers
				303	* to force a given lsn to disk will wait on a semaphore attached to the
				304	* specific in-core log. When given in-core log finally completes its
				305	* write to disk, that thread will wake up all threads waiting on the
				306	* semaphore.
				307	*/
				308	int
				309	xfs_log_force(xfs_mount_t *mp,
				310	xfs_lsn_t lsn,
				311	uint flags)
				312	{
				313	int rval;
				314	xlog_t *log = mp->m_log;
				315
				316	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				317	if (!xlog_debug && xlog_target == log->l_targ)
				318	return 0;
				319	#endif
				320
				321	ASSERT(flags & XFS_LOG_FORCE);
				322
				323	XFS_STATS_INC(xs_log_force);
				324
				325	if ((log->l_flags & XLOG_IO_ERROR) == 0) {
				326	if (lsn == 0)
				327	rval = xlog_state_sync_all(log, flags);
				328	else
				329	rval = xlog_state_sync(log, lsn, flags);
				330	} else {
				331	rval = XFS_ERROR(EIO);
				332	}
				333
				334	return rval;
				335
				336	} /* xfs_log_force */
				337
				338	/*
				339	* Attaches a new iclog I/O completion callback routine during
				340	* transaction commit. If the log is in error state, a non-zero
				341	* return code is handed back and the caller is responsible for
				342	* executing the callback at an appropriate time.
				343	*/
				344	int
				345	xfs_log_notify(xfs_mount_t mp, / mount of partition */
				346	void iclog_hndl, / iclog to hang callback off */
				347	xfs_log_callback_t *cb)
				348	{
				349	xlog_t *log = mp->m_log;
				350	xlog_in_core_t iclog = (xlog_in_core_t )iclog_hndl;
				351	int abortflg, spl;
				352
				353	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				354	if (!xlog_debug && xlog_target == log->l_targ)
				355	return 0;
				356	#endif
				357	cb->cb_next = NULL;
				358	spl = LOG_LOCK(log);
				359	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
				360	if (!abortflg) {
				361	ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) \|\|
				362	(iclog->ic_state == XLOG_STATE_WANT_SYNC));
				363	cb->cb_next = NULL;
				364	*(iclog->ic_callback_tail) = cb;
				365	iclog->ic_callback_tail = &(cb->cb_next);
				366	}
				367	LOG_UNLOCK(log, spl);
				368	return abortflg;
				369	} /* xfs_log_notify */
				370
				371	int
				372	xfs_log_release_iclog(xfs_mount_t *mp,
				373	void *iclog_hndl)
				374	{
				375	xlog_t *log = mp->m_log;
				376	xlog_in_core_t iclog = (xlog_in_core_t )iclog_hndl;
				377
				378	if (xlog_state_release_iclog(log, iclog)) {
				379	xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
				380	return(EIO);
				381	}
				382
				383	return 0;
				384	}
				385
				386	/*
				387	* 1. Reserve an amount of on-disk log space and return a ticket corresponding
				388	* to the reservation.
				389	* 2. Potentially, push buffers at tail of log to disk.
				390	*
				391	* Each reservation is going to reserve extra space for a log record header.
				392	* When writes happen to the on-disk log, we don't subtract the length of the
				393	* log record header from any reservation. By wasting space in each
				394	* reservation, we prevent over allocation problems.
				395	*/
				396	int
				397	xfs_log_reserve(xfs_mount_t *mp,
				398	int unit_bytes,
				399	int cnt,
				400	xfs_log_ticket_t *ticket,
				401	__uint8_t client,
				402	uint flags)
				403	{
				404	xlog_t *log = mp->m_log;
				405	xlog_ticket_t *internal_ticket;
				406	int retval;
				407
				408	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				409	if (!xlog_debug && xlog_target == log->l_targ)
				410	return 0;
				411	#endif
				412	retval = 0;
				413	ASSERT(client == XFS_TRANSACTION \|\| client == XFS_LOG);
				414	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
				415
				416	if (XLOG_FORCED_SHUTDOWN(log))
				417	return XFS_ERROR(EIO);
				418
				419	XFS_STATS_INC(xs_try_logspace);
				420
				421	if (*ticket != NULL) {
				422	ASSERT(flags & XFS_LOG_PERM_RESERV);
				423	internal_ticket = (xlog_ticket_t )ticket;
				424	xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
				425	retval = xlog_regrant_write_log_space(log, internal_ticket);
				426	} else {
				427	/* may sleep if need to allocate more tickets */
				428	internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
				429	client, flags);
				430	*ticket = internal_ticket;
				431	xlog_grant_push_ail(mp,
				432	(internal_ticket->t_unit_res *
				433	internal_ticket->t_cnt));
				434	retval = xlog_grant_log_space(log, internal_ticket);
				435	}
				436
				437	return retval;
				438	} /* xfs_log_reserve */
				439
				440
				441	/*
				442	* Mount a log filesystem
				443	*
				444	* mp - ubiquitous xfs mount point structure
				445	* log_target - buftarg of on-disk log device
				446	* blk_offset - Start block # where block size is 512 bytes (BBSIZE)
				447	* num_bblocks - Number of BBSIZE blocks in on-disk log
				448	*
				449	* Return error or zero.
				450	*/
				451	int
				452	xfs_log_mount(xfs_mount_t *mp,
				453	xfs_buftarg_t *log_target,
				454	xfs_daddr_t blk_offset,
				455	int num_bblks)
				456	{
				457	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
				458	cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
				459	else {
				460	cmn_err(CE_NOTE,
				461	"!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
				462	mp->m_fsname);
				463	ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
				464	}
				465
				466	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
				467
				468	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				469	if (!xlog_debug) {
				470	cmn_err(CE_NOTE, "log dev: %s", XFS_BUFTARG_NAME(log_target));
				471	return 0;
				472	}
				473	#endif
				474	/*
				475	* skip log recovery on a norecovery mount. pretend it all
				476	* just worked.
				477	*/
				478	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
				479	int error;
				480	vfs_t *vfsp = XFS_MTOVFS(mp);
				481	int readonly = (vfsp->vfs_flag & VFS_RDONLY);
				482
				483	if (readonly)
				484	vfsp->vfs_flag &= ~VFS_RDONLY;
				485
				486	error = xlog_recover(mp->m_log, readonly);
				487
				488	if (readonly)
				489	vfsp->vfs_flag \|= VFS_RDONLY;
				490	if (error) {
				491	cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
				492	xlog_unalloc_log(mp->m_log);
				493	return error;
				494	}
				495	}
				496
				497	/* Normal transactions can now occur */
				498	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
				499
				500	/* End mounting message in xfs_log_mount_finish */
				501	return 0;
				502	} /* xfs_log_mount */
				503
				504	/*
				505	* Finish the recovery of the file system. This is separate from
				506	* the xfs_log_mount() call, because it depends on the code in
				507	* xfs_mountfs() to read in the root and real-time bitmap inodes
				508	* between calling xfs_log_mount() and here.
				509	*
				510	* mp - ubiquitous xfs mount point structure
				511	*/
				512	int
				513	xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
				514	{
				515	int error;
				516
				517	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
				518	error = xlog_recover_finish(mp->m_log, mfsi_flags);
				519	else {
				520	error = 0;
				521	ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
				522	}
				523
				524	return error;
				525	}
				526
				527	/*
				528	* Unmount processing for the log.
				529	*/
				530	int
				531	xfs_log_unmount(xfs_mount_t *mp)
				532	{
				533	int error;
				534
				535	error = xfs_log_unmount_write(mp);
				536	xfs_log_unmount_dealloc(mp);
				537	return (error);
				538	}
				539
				540	/*
				541	* Final log writes as part of unmount.
				542	*
				543	* Mark the filesystem clean as unmount happens. Note that during relocation
				544	* this routine needs to be executed as part of source-bag while the
				545	* deallocation must not be done until source-end.
				546	*/
				547
				548	/*
				549	* Unmount record used to have a string "Unmount filesystem--" in the
				550	* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
				551	* We just write the magic number now since that particular field isn't
				552	* currently architecture converted and "nUmount" is a bit foo.
				553	* As far as I know, there weren't any dependencies on the old behaviour.
				554	*/
				555
				556	int
				557	xfs_log_unmount_write(xfs_mount_t *mp)
				558	{
				559	xlog_t *log = mp->m_log;
				560	xlog_in_core_t *iclog;
				561	#ifdef DEBUG
				562	xlog_in_core_t *first_iclog;
				563	#endif
				564	xfs_log_iovec_t reg[1];
				565	xfs_log_ticket_t tic = NULL;
				566	xfs_lsn_t lsn;
				567	int error;
				568	SPLDECL(s);
				569
				570	/* the data section must be 32 bit size aligned */
				571	struct {
				572	__uint16_t magic;
				573	__uint16_t pad1;
				574	__uint32_t pad2; /* may as well make it 64 bits */
				575	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
				576
				577	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				578	if (!xlog_debug && xlog_target == log->l_targ)
				579	return 0;
				580	#endif
				581
				582	/*
				583	* Don't write out unmount record on read-only mounts.
				584	* Or, if we are doing a forced umount (typically because of IO errors).
				585	*/
				586	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
				587	return 0;
				588
				589	xfs_log_force(mp, 0, XFS_LOG_FORCE\|XFS_LOG_SYNC);
				590
				591	#ifdef DEBUG
				592	first_iclog = iclog = log->l_iclog;
				593	do {
				594	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
				595	ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
				596	ASSERT(iclog->ic_offset == 0);
				597	}
				598	iclog = iclog->ic_next;
				599	} while (iclog != first_iclog);
				600	#endif
				601	if (! (XLOG_FORCED_SHUTDOWN(log))) {
				602	reg[0].i_addr = (void*)&magic;
				603	reg[0].i_len = sizeof(magic);
				604
				605	error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
				606	if (!error) {
				607	/* remove inited flag */
				608	((xlog_ticket_t *)tic)->t_flags = 0;
				609	error = xlog_write(mp, reg, 1, tic, &lsn,
				610	NULL, XLOG_UNMOUNT_TRANS);
				611	/*
				612	* At this point, we're umounting anyway,
				613	* so there's no point in transitioning log state
				614	* to IOERROR. Just continue...
				615	*/
				616	}
				617
				618	if (error) {
				619	xfs_fs_cmn_err(CE_ALERT, mp,
				620	"xfs_log_unmount: unmount record failed");
				621	}
				622
				623
				624	s = LOG_LOCK(log);
				625	iclog = log->l_iclog;
				626	iclog->ic_refcnt++;
				627	LOG_UNLOCK(log, s);
				628	xlog_state_want_sync(log, iclog);
				629	(void) xlog_state_release_iclog(log, iclog);
				630
				631	s = LOG_LOCK(log);
				632	if (!(iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				633	iclog->ic_state == XLOG_STATE_DIRTY)) {
				634	if (!XLOG_FORCED_SHUTDOWN(log)) {
				635	sv_wait(&iclog->ic_forcesema, PMEM,
				636	&log->l_icloglock, s);
				637	} else {
				638	LOG_UNLOCK(log, s);
				639	}
				640	} else {
				641	LOG_UNLOCK(log, s);
				642	}
				643	if (tic)
				644	xlog_state_put_ticket(log, tic);
				645	} else {
				646	/*
				647	* We're already in forced_shutdown mode, couldn't
				648	* even attempt to write out the unmount transaction.
				649	*
				650	* Go through the motions of sync'ing and releasing
				651	* the iclog, even though no I/O will actually happen,
				652	* we need to wait for other log I/O's that may already
				653	* be in progress. Do this as a separate section of
				654	* code so we'll know if we ever get stuck here that
				655	* we're in this odd situation of trying to unmount
				656	* a file system that went into forced_shutdown as
				657	* the result of an unmount..
				658	*/
				659	s = LOG_LOCK(log);
				660	iclog = log->l_iclog;
				661	iclog->ic_refcnt++;
				662	LOG_UNLOCK(log, s);
				663
				664	xlog_state_want_sync(log, iclog);
				665	(void) xlog_state_release_iclog(log, iclog);
				666
				667	s = LOG_LOCK(log);
				668
				669	if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
				670	\|\| iclog->ic_state == XLOG_STATE_DIRTY
				671	\|\| iclog->ic_state == XLOG_STATE_IOERROR) ) {
				672
				673	sv_wait(&iclog->ic_forcesema, PMEM,
				674	&log->l_icloglock, s);
				675	} else {
				676	LOG_UNLOCK(log, s);
				677	}
				678	}
				679
				680	return 0;
				681	} /* xfs_log_unmount_write */
				682
				683	/*
				684	* Deallocate log structures for unmount/relocation.
				685	*/
				686	void
				687	xfs_log_unmount_dealloc(xfs_mount_t *mp)
				688	{
				689	xlog_unalloc_log(mp->m_log);
				690	}
				691
				692	/*
				693	* Write region vectors to log. The write happens using the space reservation
				694	* of the ticket (tic). It is not a requirement that all writes for a given
				695	* transaction occur with one call to xfs_log_write().
				696	*/
				697	int
				698	xfs_log_write(xfs_mount_t * mp,
				699	xfs_log_iovec_t reg[],
				700	int nentries,
				701	xfs_log_ticket_t tic,
				702	xfs_lsn_t *start_lsn)
				703	{
				704	int error;
				705	xlog_t *log = mp->m_log;
				706
				707	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				708	if (!xlog_debug && xlog_target == log->l_targ) {
				709	*start_lsn = 0;
				710	return 0;
				711	}
				712	#endif
				713	if (XLOG_FORCED_SHUTDOWN(log))
				714	return XFS_ERROR(EIO);
				715
				716	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
				717	xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
				718	}
				719	return (error);
				720	} /* xfs_log_write */
				721
				722
				723	void
				724	xfs_log_move_tail(xfs_mount_t *mp,
				725	xfs_lsn_t tail_lsn)
				726	{
				727	xlog_ticket_t *tic;
				728	xlog_t *log = mp->m_log;
				729	int need_bytes, free_bytes, cycle, bytes;
				730	SPLDECL(s);
				731
				732	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				733	if (!xlog_debug && xlog_target == log->l_targ)
				734	return;
				735	#endif
				736	/* XXXsup tmp */
				737	if (XLOG_FORCED_SHUTDOWN(log))
				738	return;
				739	ASSERT(!XFS_FORCED_SHUTDOWN(mp));
				740
				741	if (tail_lsn == 0) {
				742	/* needed since sync_lsn is 64 bits */
				743	s = LOG_LOCK(log);
				744	tail_lsn = log->l_last_sync_lsn;
				745	LOG_UNLOCK(log, s);
				746	}
				747
				748	s = GRANT_LOCK(log);
				749
				750	/* Also an invalid lsn. 1 implies that we aren't passing in a valid
				751	* tail_lsn.
				752	*/
				753	if (tail_lsn != 1) {
				754	log->l_tail_lsn = tail_lsn;
				755	}
				756
				757	if ((tic = log->l_write_headq)) {
				758	#ifdef DEBUG
				759	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				760	panic("Recovery problem");
				761	#endif
				762	cycle = log->l_grant_write_cycle;
				763	bytes = log->l_grant_write_bytes;
				764	free_bytes = xlog_space_left(log, cycle, bytes);
				765	do {
				766	ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
				767
				768	if (free_bytes < tic->t_unit_res && tail_lsn != 1)
				769	break;
				770	tail_lsn = 0;
				771	free_bytes -= tic->t_unit_res;
				772	sv_signal(&tic->t_sema);
				773	tic = tic->t_next;
				774	} while (tic != log->l_write_headq);
				775	}
				776	if ((tic = log->l_reserve_headq)) {
				777	#ifdef DEBUG
				778	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				779	panic("Recovery problem");
				780	#endif
				781	cycle = log->l_grant_reserve_cycle;
				782	bytes = log->l_grant_reserve_bytes;
				783	free_bytes = xlog_space_left(log, cycle, bytes);
				784	do {
				785	if (tic->t_flags & XLOG_TIC_PERM_RESERV)
				786	need_bytes = tic->t_unit_res*tic->t_cnt;
				787	else
				788	need_bytes = tic->t_unit_res;
				789	if (free_bytes < need_bytes && tail_lsn != 1)
				790	break;
				791	tail_lsn = 0;
				792	free_bytes -= need_bytes;
				793	sv_signal(&tic->t_sema);
				794	tic = tic->t_next;
				795	} while (tic != log->l_reserve_headq);
				796	}
				797	GRANT_UNLOCK(log, s);
				798	} /* xfs_log_move_tail */
				799
				800	/*
				801	* Determine if we have a transaction that has gone to disk
				802	* that needs to be covered. Log activity needs to be idle (no AIL and
				803	* nothing in the iclogs). And, we need to be in the right state indicating
				804	* something has gone out.
				805	*/
				806	int
				807	xfs_log_need_covered(xfs_mount_t *mp)
				808	{
				809	SPLDECL(s);
				810	int needed = 0, gen;
				811	xlog_t *log = mp->m_log;
				812	vfs_t *vfsp = XFS_MTOVFS(mp);
				813
				814	if (fs_frozen(vfsp) \|\| XFS_FORCED_SHUTDOWN(mp) \|\|
				815	(vfsp->vfs_flag & VFS_RDONLY))
				816	return 0;
				817
				818	s = LOG_LOCK(log);
				819	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) \|\|
				820	(log->l_covered_state == XLOG_STATE_COVER_NEED2))
				821	&& !xfs_trans_first_ail(mp, &gen)
				822	&& xlog_iclogs_empty(log)) {
				823	if (log->l_covered_state == XLOG_STATE_COVER_NEED)
				824	log->l_covered_state = XLOG_STATE_COVER_DONE;
				825	else {
				826	ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
				827	log->l_covered_state = XLOG_STATE_COVER_DONE2;
				828	}
				829	needed = 1;
				830	}
				831	LOG_UNLOCK(log, s);
				832	return(needed);
				833	}
				834
				835	/******************************************************************************
				836	*
				837	* local routines
				838	*
				839	******************************************************************************
				840	*/
				841
				842	/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
				843	* The log manager must keep track of the last LR which was committed
				844	* to disk. The lsn of this LR will become the new tail_lsn whenever
				845	* xfs_trans_tail_ail returns 0. If we don't do this, we run into
				846	* the situation where stuff could be written into the log but nothing
				847	* was ever in the AIL when asked. Eventually, we panic since the
				848	* tail hits the head.
				849	*
				850	* We may be holding the log iclog lock upon entering this routine.
				851	*/
				852	xfs_lsn_t
				853	xlog_assign_tail_lsn(xfs_mount_t *mp)
				854	{
				855	xfs_lsn_t tail_lsn;
				856	SPLDECL(s);
				857	xlog_t *log = mp->m_log;
				858
				859	tail_lsn = xfs_trans_tail_ail(mp);
				860	s = GRANT_LOCK(log);
				861	if (tail_lsn != 0) {
				862	log->l_tail_lsn = tail_lsn;
				863	} else {
				864	tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
				865	}
				866	GRANT_UNLOCK(log, s);
				867
				868	return tail_lsn;
				869	} /* xlog_assign_tail_lsn */
				870
				871
				872	/*
				873	* Return the space in the log between the tail and the head. The head
				874	* is passed in the cycle/bytes formal parms. In the special case where
				875	* the reserve head has wrapped passed the tail, this calculation is no
				876	* longer valid. In this case, just return 0 which means there is no space
				877	* in the log. This works for all places where this function is called
				878	* with the reserve head. Of course, if the write head were to ever
				879	* wrap the tail, we should blow up. Rather than catch this case here,
				880	* we depend on other ASSERTions in other parts of the code. XXXmiken
				881	*
				882	* This code also handles the case where the reservation head is behind
				883	* the tail. The details of this case are described below, but the end
				884	* result is that we return the size of the log as the amount of space left.
				885	*/
				886	int
				887	xlog_space_left(xlog_t *log, int cycle, int bytes)
				888	{
				889	int free_bytes;
				890	int tail_bytes;
				891	int tail_cycle;
				892
				893	tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
				894	tail_cycle = CYCLE_LSN(log->l_tail_lsn);
				895	if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
				896	free_bytes = log->l_logsize - (bytes - tail_bytes);
				897	} else if ((tail_cycle + 1) < cycle) {
				898	return 0;
				899	} else if (tail_cycle < cycle) {
				900	ASSERT(tail_cycle == (cycle - 1));
				901	free_bytes = tail_bytes - bytes;
				902	} else {
				903	/*
				904	* The reservation head is behind the tail.
				905	* In this case we just want to return the size of the
				906	* log as the amount of space left.
				907	*/
				908	xfs_fs_cmn_err(CE_ALERT, log->l_mp,
				909	"xlog_space_left: head behind tail\n"
				910	" tail_cycle = %d, tail_bytes = %d\n"
				911	" GH cycle = %d, GH bytes = %d",
				912	tail_cycle, tail_bytes, cycle, bytes);
				913	ASSERT(0);
				914	free_bytes = log->l_logsize;
				915	}
				916	return free_bytes;
				917	} /* xlog_space_left */
				918
				919
				920	/*
				921	* Log function which is called when an io completes.
				922	*
				923	* The log manager needs its own routine, in order to control what
				924	* happens with the buffer after the write completes.
				925	*/
				926	void
				927	xlog_iodone(xfs_buf_t *bp)
				928	{
				929	xlog_in_core_t *iclog;
				930	xlog_t *l;
				931	int aborted;
				932
				933	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
				934	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
				935	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
				936	aborted = 0;
				937
				938	/*
				939	* Some versions of cpp barf on the recursive definition of
				940	* ic_log -> hic_fields.ic_log and expand ic_log twice when
				941	* it is passed through two macros. Workaround broken cpp.
				942	*/
				943	l = iclog->ic_log;
				944
				945	/*
				946	* Race to shutdown the filesystem if we see an error.
				947	*/
				948	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
				949	XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
				950	xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
				951	XFS_BUF_STALE(bp);
				952	xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
				953	/*
				954	* This flag will be propagated to the trans-committed
				955	* callback routines to let them know that the log-commit
				956	* didn't succeed.
				957	*/
				958	aborted = XFS_LI_ABORTED;
				959	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
				960	aborted = XFS_LI_ABORTED;
				961	}
				962	xlog_state_done_syncing(iclog, aborted);
				963	if (!(XFS_BUF_ISASYNC(bp))) {
				964	/*
				965	* Corresponding psema() will be done in bwrite(). If we don't
				966	* vsema() here, panic.
				967	*/
				968	XFS_BUF_V_IODONESEMA(bp);
				969	}
				970	} /* xlog_iodone */
				971
				972	/*
				973	* The bdstrat callback function for log bufs. This gives us a central
				974	* place to trap bufs in case we get hit by a log I/O error and need to
				975	* shutdown. Actually, in practice, even when we didn't get a log error,
				976	* we transition the iclogs to IOERROR state after flushing all existing
				977	* iclogs to disk. This is because we don't want any more new transactions to be
				978	* started or completed afterwards.
				979	*/
				980	STATIC int
				981	xlog_bdstrat_cb(struct xfs_buf *bp)
				982	{
				983	xlog_in_core_t *iclog;
				984
				985	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
				986
				987	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
				988	/* note for irix bstrat will need struct bdevsw passed
				989	* Fix the following macro if the code ever is merged
				990	*/
				991	XFS_bdstrat(bp);
				992	return 0;
				993	}
				994
				995	xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
				996	XFS_BUF_ERROR(bp, EIO);
				997	XFS_BUF_STALE(bp);
				998	xfs_biodone(bp);
				999	return (XFS_ERROR(EIO));
				1000
				1001
				1002	}
				1003
				1004	/*
				1005	* Return size of each in-core log record buffer.
				1006	*
				1007	* Low memory machines only get 2 16KB buffers. We don't want to waste
				1008	* memory here. However, all other machines get at least 2 32KB buffers.
				1009	* The number is hard coded because we don't care about the minimum
				1010	* memory size, just 32MB systems.
				1011	*
				1012	* If the filesystem blocksize is too large, we may need to choose a
				1013	* larger size since the directory code currently logs entire blocks.
				1014	*/
				1015
				1016	STATIC void
				1017	xlog_get_iclog_buffer_size(xfs_mount_t *mp,
				1018	xlog_t *log)
				1019	{
				1020	int size;
				1021	int xhdrs;
				1022
				1023	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				1024	/*
				1025	* When logbufs == 0, someone has disabled the log from the FSTAB
				1026	* file. This is not a documented feature. We need to set xlog_debug
				1027	* to zero (this deactivates the log) and set xlog_target to the
				1028	* appropriate device. Only one filesystem may be affected as such
				1029	* since this is just a performance hack to test what we might be able
				1030	* to get if the log were not present.
				1031	*/
				1032	if (mp->m_logbufs == 0) {
				1033	xlog_debug = 0;
				1034	xlog_target = log->l_targ;
				1035	log->l_iclog_bufs = XLOG_MIN_ICLOGS;
				1036	} else
				1037	#endif
				1038	{
				1039	/*
				1040	* This is the normal path. If m_logbufs == -1, then the
				1041	* admin has chosen to use the system defaults for logbuffers.
				1042	*/
				1043	if (mp->m_logbufs == -1) {
				1044	if (xfs_physmem <= btoc(12810241024)) {
				1045	log->l_iclog_bufs = XLOG_MIN_ICLOGS;
				1046	} else if (xfs_physmem <= btoc(40010241024)) {
				1047	log->l_iclog_bufs = XLOG_MED_ICLOGS;
				1048	} else {
				1049	/* 256K with 32K bufs */
				1050	log->l_iclog_bufs = XLOG_MAX_ICLOGS;
				1051	}
				1052	} else
				1053	log->l_iclog_bufs = mp->m_logbufs;
				1054
				1055	#if defined(DEBUG) \|\| defined(XLOG_NOLOG)
				1056	/* We are reactivating a filesystem after it was inactive */
				1057	if (log->l_targ == xlog_target) {
				1058	xlog_target = NULL;
				1059	xlog_debug = 1;
				1060	}
				1061	#endif
				1062	}
				1063
				1064	/*
				1065	* Buffer size passed in from mount system call.
				1066	*/
				1067	if (mp->m_logbsize != -1) {
				1068	size = log->l_iclog_size = mp->m_logbsize;
				1069	log->l_iclog_size_log = 0;
				1070	while (size != 1) {
				1071	log->l_iclog_size_log++;
				1072	size >>= 1;
				1073	}
				1074
				1075	if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
				1076	/* # headers = size / 32K
				1077	* one header holds cycles from 32K of data
				1078	*/
				1079
				1080	xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
				1081	if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
				1082	xhdrs++;
				1083	log->l_iclog_hsize = xhdrs << BBSHIFT;
				1084	log->l_iclog_heads = xhdrs;
				1085	} else {
				1086	ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
				1087	log->l_iclog_hsize = BBSIZE;
				1088	log->l_iclog_heads = 1;
				1089	}
				1090	return;
				1091	}
				1092
				1093	/*
				1094	* Special case machines that have less than 32MB of memory.
				1095	* All machines with more memory use 32KB buffers.
				1096	*/
				1097	if (xfs_physmem <= btoc(3210241024)) {
				1098	/* Don't change; min configuration */
				1099	log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */
				1100	log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
				1101	} else {
				1102	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */
				1103	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
				1104	}
				1105
				1106	/* the default log size is 16k or 32k which is one header sector */
				1107	log->l_iclog_hsize = BBSIZE;
				1108	log->l_iclog_heads = 1;
				1109
				1110	/*
				1111	* For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use
				1112	* 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
				1113	*/
				1114	if (mp->m_sb.sb_blocksize >= 16*1024) {
				1115	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
				1116	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
				1117	if (mp->m_logbufs == -1) {
				1118	switch (mp->m_sb.sb_blocksize) {
				1119	case 161024: / 16 KB */
				1120	log->l_iclog_bufs = 3;
				1121	break;
				1122	case 321024: / 32 KB */
				1123	log->l_iclog_bufs = 4;
				1124	break;
				1125	case 641024: / 64 KB */
				1126	log->l_iclog_bufs = 8;
				1127	break;
				1128	default:
				1129	xlog_panic("XFS: Invalid blocksize");
				1130	break;
				1131	}
				1132	}
				1133	}
				1134	} /* xlog_get_iclog_buffer_size */
				1135
				1136
				1137	/*
				1138	* This routine initializes some of the log structure for a given mount point.
				1139	* Its primary purpose is to fill in enough, so recovery can occur. However,
				1140	* some other stuff may be filled in too.
				1141	*/
				1142	STATIC xlog_t *
				1143	xlog_alloc_log(xfs_mount_t *mp,
				1144	xfs_buftarg_t *log_target,
				1145	xfs_daddr_t blk_offset,
				1146	int num_bblks)
				1147	{
				1148	xlog_t *log;
				1149	xlog_rec_header_t *head;
				1150	xlog_in_core_t **iclogp;
				1151	xlog_in_core_t iclog, prev_iclog=NULL;
				1152	xfs_buf_t *bp;
				1153	int i;
				1154	int iclogsize;
				1155
				1156	log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
				1157
				1158	log->l_mp = mp;
				1159	log->l_targ = log_target;
				1160	log->l_logsize = BBTOB(num_bblks);
				1161	log->l_logBBstart = blk_offset;
				1162	log->l_logBBsize = num_bblks;
				1163	log->l_covered_state = XLOG_STATE_COVER_IDLE;
				1164	log->l_flags \|= XLOG_ACTIVE_RECOVERY;
				1165
				1166	log->l_prev_block = -1;
				1167	ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0);
				1168	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
				1169	log->l_last_sync_lsn = log->l_tail_lsn;
				1170	log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
				1171	log->l_grant_reserve_cycle = 1;
				1172	log->l_grant_write_cycle = 1;
				1173
				1174	if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
				1175	log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
				1176	ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
				1177	/* for larger sector sizes, must have v2 or external log */
				1178	ASSERT(log->l_sectbb_log == 0 \|\|
				1179	log->l_logBBstart == 0 \|\|
				1180	XFS_SB_VERSION_HASLOGV2(&mp->m_sb));
				1181	ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
				1182	}
				1183	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
				1184
				1185	xlog_get_iclog_buffer_size(mp, log);
				1186
				1187	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
				1188	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
				1189	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
				1190	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
				1191	ASSERT(XFS_BUF_ISBUSY(bp));
				1192	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
				1193	log->l_xbuf = bp;
				1194
				1195	spinlock_init(&log->l_icloglock, "iclog");
				1196	spinlock_init(&log->l_grant_lock, "grhead_iclog");
				1197	initnsema(&log->l_flushsema, 0, "ic-flush");
				1198	xlog_state_ticket_alloc(log); /* wait until after icloglock inited */
				1199
				1200	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
				1201	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
				1202
				1203	iclogp = &log->l_iclog;
				1204	/*
				1205	* The amount of memory to allocate for the iclog structure is
				1206	* rather funky due to the way the structure is defined. It is
				1207	* done this way so that we can use different sizes for machines
				1208	* with different amounts of memory. See the definition of
				1209	* xlog_in_core_t in xfs_log_priv.h for details.
				1210	*/
				1211	iclogsize = log->l_iclog_size;
				1212	ASSERT(log->l_iclog_size >= 4096);
				1213	for (i=0; i < log->l_iclog_bufs; i++) {
				1214	iclogp = (xlog_in_core_t )
				1215	kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
				1216	iclog = *iclogp;
				1217	iclog->hic_data = (xlog_in_core_2_t *)
				1218	kmem_zalloc(iclogsize, KM_SLEEP);
				1219
				1220	iclog->ic_prev = prev_iclog;
				1221	prev_iclog = iclog;
				1222	log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
				1223
				1224	head = &iclog->ic_header;
				1225	memset(head, 0, sizeof(xlog_rec_header_t));
				1226	INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
				1227	INT_SET(head->h_version, ARCH_CONVERT,
				1228	XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
				1229	INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
				1230	/* new fields */
				1231	INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
				1232	memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
				1233
				1234	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
				1235	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
				1236	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
				1237	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
				1238	iclog->ic_bp = bp;
				1239
				1240	iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
				1241	iclog->ic_state = XLOG_STATE_ACTIVE;
				1242	iclog->ic_log = log;
				1243	iclog->ic_callback_tail = &(iclog->ic_callback);
				1244	iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
				1245
				1246	ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
				1247	ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
				1248	sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
				1249	sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
				1250
				1251	iclogp = &iclog->ic_next;
				1252	}
				1253	iclogp = log->l_iclog; / complete ring */
				1254	log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
				1255
				1256	return log;
				1257	} /* xlog_alloc_log */
				1258
				1259
				1260	/*
				1261	* Write out the commit record of a transaction associated with the given
				1262	* ticket. Return the lsn of the commit record.
				1263	*/
				1264	STATIC int
				1265	xlog_commit_record(xfs_mount_t *mp,
				1266	xlog_ticket_t *ticket,
				1267	xlog_in_core_t **iclog,
				1268	xfs_lsn_t *commitlsnp)
				1269	{
				1270	int error;
				1271	xfs_log_iovec_t reg[1];
				1272
				1273	reg[0].i_addr = NULL;
				1274	reg[0].i_len = 0;
				1275
				1276	ASSERT_ALWAYS(iclog);
				1277	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
				1278	iclog, XLOG_COMMIT_TRANS))) {
				1279	xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
				1280	}
				1281	return (error);
				1282	} /* xlog_commit_record */
				1283
				1284
				1285	/*
				1286	* Push on the buffer cache code if we ever use more than 75% of the on-disk
				1287	* log space. This code pushes on the lsn which would supposedly free up
				1288	* the 25% which we want to leave free. We may need to adopt a policy which
				1289	* pushes on an lsn which is further along in the log once we reach the high
				1290	* water mark. In this manner, we would be creating a low water mark.
				1291	*/
				1292	void
				1293	xlog_grant_push_ail(xfs_mount_t *mp,
				1294	int need_bytes)
				1295	{
				1296	xlog_t log = mp->m_log; / pointer to the log */
				1297	xfs_lsn_t tail_lsn; /* lsn of the log tail */
				1298	xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */
				1299	int free_blocks; /* free blocks left to write to */
				1300	int free_bytes; /* free bytes left to write to */
				1301	int threshold_block; /* block in lsn we'd like to be at */
				1302	int threshold_cycle; /* lsn cycle we'd like to be at */
				1303	int free_threshold;
				1304	SPLDECL(s);
				1305
				1306	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
				1307
				1308	s = GRANT_LOCK(log);
				1309	free_bytes = xlog_space_left(log,
				1310	log->l_grant_reserve_cycle,
				1311	log->l_grant_reserve_bytes);
				1312	tail_lsn = log->l_tail_lsn;
				1313	free_blocks = BTOBBT(free_bytes);
				1314
				1315	/*
				1316	* Set the threshold for the minimum number of free blocks in the
				1317	* log to the maximum of what the caller needs, one quarter of the
				1318	* log, and 256 blocks.
				1319	*/
				1320	free_threshold = BTOBB(need_bytes);
				1321	free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
				1322	free_threshold = MAX(free_threshold, 256);
				1323	if (free_blocks < free_threshold) {
				1324	threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
				1325	threshold_cycle = CYCLE_LSN(tail_lsn);
				1326	if (threshold_block >= log->l_logBBsize) {
				1327	threshold_block -= log->l_logBBsize;
				1328	threshold_cycle += 1;
				1329	}
				1330	ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle,
				1331	threshold_block);
				1332
				1333	/* Don't pass in an lsn greater than the lsn of the last
				1334	* log record known to be on disk.
				1335	*/
				1336	if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
				1337	threshold_lsn = log->l_last_sync_lsn;
				1338	}
				1339	GRANT_UNLOCK(log, s);
				1340
				1341	/*
				1342	* Get the transaction layer to kick the dirty buffers out to
				1343	* disk asynchronously. No point in trying to do this if
				1344	* the filesystem is shutting down.
				1345	*/
				1346	if (threshold_lsn &&
				1347	!XLOG_FORCED_SHUTDOWN(log))
				1348	xfs_trans_push_ail(mp, threshold_lsn);
				1349	} /* xlog_grant_push_ail */
				1350
				1351
				1352	/*
				1353	* Flush out the in-core log (iclog) to the on-disk log in an asynchronous
				1354	* fashion. Previously, we should have moved the current iclog
				1355	* ptr in the log to point to the next available iclog. This allows further
				1356	* write to continue while this code syncs out an iclog ready to go.
				1357	* Before an in-core log can be written out, the data section must be scanned
				1358	* to save away the 1st word of each BBSIZE block into the header. We replace
				1359	* it with the current cycle count. Each BBSIZE block is tagged with the
				1360	* cycle count because there is an implicit assumption that drives will
				1361	* guarantee that entire 512 byte blocks get written at once. In other words,
				1362	* we can't have part of a 512 byte block written and part not written. By
				1363	* tagging each block, we will know which blocks are valid when recovering
				1364	* after an unclean shutdown.
				1365	*
				1366	* This routine is single threaded on the iclog. No other thread can be in
				1367	* this routine with the same iclog. Changing contents of iclog can there-
				1368	* fore be done without grabbing the state machine lock. Updating the global
				1369	* log will require grabbing the lock though.
				1370	*
				1371	* The entire log manager uses a logical block numbering scheme. Only
				1372	* log_sync (and then only bwrite()) know about the fact that the log may
				1373	* not start with block zero on a given device. The log block start offset
				1374	* is added immediately before calling bwrite().
				1375	*/
				1376
				1377	int
				1378	xlog_sync(xlog_t *log,
				1379	xlog_in_core_t *iclog)
				1380	{
				1381	xfs_caddr_t dptr; /* pointer to byte sized element */
				1382	xfs_buf_t *bp;
				1383	int i, ops;
				1384	uint count; /* byte count of bwrite */
				1385	uint count_init; /* initial count before roundup */
				1386	int roundoff; /* roundoff to BB or stripe */
				1387	int split = 0; /* split write into two regions */
				1388	int error;
				1389	SPLDECL(s);
				1390	int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb);
				1391
				1392	XFS_STATS_INC(xs_log_writes);
				1393	ASSERT(iclog->ic_refcnt == 0);
				1394
				1395	/* Add for LR header */
				1396	count_init = log->l_iclog_hsize + iclog->ic_offset;
				1397
				1398	/* Round out the log write size */
				1399	if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
				1400	/* we have a v2 stripe unit to use */
				1401	count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
				1402	} else {
				1403	count = BBTOB(BTOBB(count_init));
				1404	}
				1405	roundoff = count - count_init;
				1406	ASSERT(roundoff >= 0);
				1407	ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
				1408	roundoff < log->l_mp->m_sb.sb_logsunit)
				1409	\|\|
				1410	(log->l_mp->m_sb.sb_logsunit <= 1 &&
				1411	roundoff < BBTOB(1)));
				1412
				1413	/* move grant heads by roundoff in sync */
				1414	s = GRANT_LOCK(log);
				1415	XLOG_GRANT_ADD_SPACE(log, roundoff, 'w');
				1416	XLOG_GRANT_ADD_SPACE(log, roundoff, 'r');
				1417	GRANT_UNLOCK(log, s);
				1418
				1419	/* put cycle number in every block */
				1420	xlog_pack_data(log, iclog, roundoff);
				1421
				1422	/* real byte length */
				1423	if (v2) {
				1424	INT_SET(iclog->ic_header.h_len,
				1425	ARCH_CONVERT,
				1426	iclog->ic_offset + roundoff);
				1427	} else {
				1428	INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
				1429	}
				1430
				1431	/* put ops count in correct order */
				1432	ops = iclog->ic_header.h_num_logops;
				1433	INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);
				1434
				1435	bp = iclog->ic_bp;
				1436	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
				1437	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
				1438	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)));
				1439
				1440	XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
				1441
				1442	/* Do we need to split this write into 2 parts? */
				1443	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
				1444	split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
				1445	count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
				1446	iclog->ic_bwritecnt = 2; /* split into 2 writes */
				1447	} else {
				1448	iclog->ic_bwritecnt = 1;
				1449	}
				1450	XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
				1451	XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
				1452	XFS_BUF_BUSY(bp);
				1453	XFS_BUF_ASYNC(bp);
				1454	/*
				1455	* Do a disk write cache flush for the log block.
				1456	* This is a bit of a sledgehammer, it would be better
				1457	* to use a tag barrier here that just prevents reordering.
				1458	* It may not be needed to flush the first split block in the log wrap
				1459	* case, but do it anyways to be safe -AK
				1460	*/
				1461	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
				1462	XFS_BUF_FLUSH(bp);
				1463
				1464	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
				1465	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
				1466
				1467	xlog_verify_iclog(log, iclog, count, B_TRUE);
				1468
				1469	/* account for log which doesn't start at block #0 */
				1470	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
				1471	/*
				1472	* Don't call xfs_bwrite here. We do log-syncs even when the filesystem
				1473	* is shutting down.
				1474	*/
				1475	XFS_BUF_WRITE(bp);
				1476
				1477	if ((error = XFS_bwrite(bp))) {
				1478	xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
				1479	XFS_BUF_ADDR(bp));
				1480	return (error);
				1481	}
				1482	if (split) {
				1483	bp = iclog->ic_log->l_xbuf;
				1484	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
				1485	(unsigned long)1);
				1486	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
				1487	XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
				1488	XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
				1489	(__psint_t)count), split);
				1490	XFS_BUF_SET_FSPRIVATE(bp, iclog);
				1491	XFS_BUF_BUSY(bp);
				1492	XFS_BUF_ASYNC(bp);
				1493	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
				1494	XFS_BUF_FLUSH(bp);
				1495	dptr = XFS_BUF_PTR(bp);
				1496	/*
				1497	* Bump the cycle numbers at the start of each block
				1498	* since this part of the buffer is at the start of
				1499	* a new cycle. Watch out for the header magic number
				1500	* case, though.
				1501	*/
				1502	for (i=0; i<split; i += BBSIZE) {
				1503	INT_MOD((uint )dptr, ARCH_CONVERT, +1);
				1504	if (INT_GET((uint )dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
				1505	INT_MOD((uint )dptr, ARCH_CONVERT, +1);
				1506	dptr += BBSIZE;
				1507	}
				1508
				1509	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
				1510	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
				1511
				1512	/* account for internal log which does't start at block #0 */
				1513	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
				1514	XFS_BUF_WRITE(bp);
				1515	if ((error = XFS_bwrite(bp))) {
				1516	xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
				1517	bp, XFS_BUF_ADDR(bp));
				1518	return (error);
				1519	}
				1520	}
				1521	return (0);
				1522	} /* xlog_sync */
				1523
				1524
				1525	/*
				1526	* Unallocate a log structure
				1527	*/
				1528	void
				1529	xlog_unalloc_log(xlog_t *log)
				1530	{
				1531	xlog_in_core_t iclog, next_iclog;
				1532	xlog_ticket_t tic, next_tic;
				1533	int i;
				1534
				1535
				1536	iclog = log->l_iclog;
				1537	for (i=0; i<log->l_iclog_bufs; i++) {
				1538	sv_destroy(&iclog->ic_forcesema);
				1539	sv_destroy(&iclog->ic_writesema);
				1540	xfs_buf_free(iclog->ic_bp);
				1541	#ifdef XFS_LOG_TRACE
				1542	if (iclog->ic_trace != NULL) {
				1543	ktrace_free(iclog->ic_trace);
				1544	}
				1545	#endif
				1546	next_iclog = iclog->ic_next;
				1547	kmem_free(iclog->hic_data, log->l_iclog_size);
				1548	kmem_free(iclog, sizeof(xlog_in_core_t));
				1549	iclog = next_iclog;
				1550	}
				1551	freesema(&log->l_flushsema);
				1552	spinlock_destroy(&log->l_icloglock);
				1553	spinlock_destroy(&log->l_grant_lock);
				1554
				1555	/* XXXsup take a look at this again. */
				1556	if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
				1557	!XLOG_FORCED_SHUTDOWN(log)) {
				1558	xfs_fs_cmn_err(CE_WARN, log->l_mp,
				1559	"xlog_unalloc_log: (cnt: %d, total: %d)",
				1560	log->l_ticket_cnt, log->l_ticket_tcnt);
				1561	/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
				1562
				1563	} else {
				1564	tic = log->l_unmount_free;
				1565	while (tic) {
				1566	next_tic = tic->t_next;
				1567	kmem_free(tic, NBPP);
				1568	tic = next_tic;
				1569	}
				1570	}
				1571	xfs_buf_free(log->l_xbuf);
				1572	#ifdef XFS_LOG_TRACE
				1573	if (log->l_trace != NULL) {
				1574	ktrace_free(log->l_trace);
				1575	}
				1576	if (log->l_grant_trace != NULL) {
				1577	ktrace_free(log->l_grant_trace);
				1578	}
				1579	#endif
				1580	log->l_mp->m_log = NULL;
				1581	kmem_free(log, sizeof(xlog_t));
				1582	} /* xlog_unalloc_log */
				1583
				1584	/*
				1585	* Update counters atomically now that memcpy is done.
				1586	*/
				1587	/* ARGSUSED */
				1588	static inline void
				1589	xlog_state_finish_copy(xlog_t *log,
				1590	xlog_in_core_t *iclog,
				1591	int record_cnt,
				1592	int copy_bytes)
				1593	{
				1594	SPLDECL(s);
				1595
				1596	s = LOG_LOCK(log);
				1597
				1598	iclog->ic_header.h_num_logops += record_cnt;
				1599	iclog->ic_offset += copy_bytes;
				1600
				1601	LOG_UNLOCK(log, s);
				1602	} /* xlog_state_finish_copy */
				1603
				1604
				1605
				1606
				1607	/*
				1608	* Write some region out to in-core log
				1609	*
				1610	* This will be called when writing externally provided regions or when
				1611	* writing out a commit record for a given transaction.
				1612	*
				1613	* General algorithm:
				1614	* 1. Find total length of this write. This may include adding to the
				1615	* lengths passed in.
				1616	* 2. Check whether we violate the tickets reservation.
				1617	* 3. While writing to this iclog
				1618	* A. Reserve as much space in this iclog as can get
				1619	* B. If this is first write, save away start lsn
				1620	* C. While writing this region:
				1621	* 1. If first write of transaction, write start record
				1622	* 2. Write log operation header (header per region)
				1623	* 3. Find out if we can fit entire region into this iclog
				1624	* 4. Potentially, verify destination memcpy ptr
				1625	* 5. Memcpy (partial) region
				1626	* 6. If partial copy, release iclog; otherwise, continue
				1627	* copying more regions into current iclog
				1628	* 4. Mark want sync bit (in simulation mode)
				1629	* 5. Release iclog for potential flush to on-disk log.
				1630	*
				1631	* ERRORS:
				1632	* 1. Panic if reservation is overrun. This should never happen since
				1633	* reservation amounts are generated internal to the filesystem.
				1634	* NOTES:
				1635	* 1. Tickets are single threaded data structures.
				1636	* 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
				1637	* syncing routine. When a single log_write region needs to span
				1638	* multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
				1639	* on all log operation writes which don't contain the end of the
				1640	* region. The XLOG_END_TRANS bit is used for the in-core log
				1641	* operation which contains the end of the continued log_write region.
				1642	* 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
				1643	* we don't really know exactly how much space will be used. As a result,
				1644	* we don't update ic_offset until the end when we know exactly how many
				1645	* bytes have been written out.
				1646	*/
				1647	int
				1648	xlog_write(xfs_mount_t * mp,
				1649	xfs_log_iovec_t reg[],
				1650	int nentries,
				1651	xfs_log_ticket_t tic,
				1652	xfs_lsn_t *start_lsn,
				1653	xlog_in_core_t **commit_iclog,
				1654	uint flags)
				1655	{
				1656	xlog_t *log = mp->m_log;
				1657	xlog_ticket_t ticket = (xlog_ticket_t )tic;
				1658	xlog_op_header_t logop_head; / ptr to log operation header */
				1659	xlog_in_core_t iclog; / ptr to current in-core log */
				1660	__psint_t ptr; /* copy address into data region */
				1661	int len; /* # xlog_write() bytes 2 still copy */
				1662	int index; /* region index currently copying */
				1663	int log_offset; /* offset (from 0) into data region */
				1664	int start_rec_copy; /* # bytes to copy for start record */
				1665	int partial_copy; /* did we split a region? */
				1666	int partial_copy_len;/* # bytes copied if split region */
				1667	int need_copy; /* # bytes need to memcpy this region */
				1668	int copy_len; /* # bytes actually memcpy'ing */
				1669	int copy_off; /* # bytes from entry start */
				1670	int contwr; /* continued write of in-core log? */
				1671	int error;
				1672	int record_cnt = 0, data_cnt = 0;
				1673
				1674	partial_copy_len = partial_copy = 0;
				1675
				1676	/* Calculate potential maximum space. Each region gets its own
				1677	* xlog_op_header_t and may need to be double word aligned.
				1678	*/
				1679	len = 0;
				1680	if (ticket->t_flags & XLOG_TIC_INITED) /* acct for start rec of xact */
				1681	len += sizeof(xlog_op_header_t);
				1682
				1683	for (index = 0; index < nentries; index++) {
				1684	len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
				1685	len += reg[index].i_len;
				1686	}
				1687	contwr = *start_lsn = 0;
				1688
				1689	if (ticket->t_curr_res < len) {
				1690	#ifdef DEBUG
				1691	xlog_panic(
				1692	"xfs_log_write: reservation ran out. Need to up reservation");
				1693	#else
				1694	/* Customer configurable panic */
				1695	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
				1696	"xfs_log_write: reservation ran out. Need to up reservation");
				1697	/* If we did not panic, shutdown the filesystem */
				1698	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
				1699	#endif
				1700	} else
				1701	ticket->t_curr_res -= len;
				1702
				1703	for (index = 0; index < nentries; ) {
				1704	if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
				1705	&contwr, &log_offset)))
				1706	return (error);
				1707
				1708	ASSERT(log_offset <= iclog->ic_size - 1);
				1709	ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
				1710
				1711	/* start_lsn is the first lsn written to. That's all we need. */
				1712	if (! *start_lsn)
				1713	*start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
				1714
				1715	/* This loop writes out as many regions as can fit in the amount
				1716	* of space which was allocated by xlog_state_get_iclog_space().
				1717	*/
				1718	while (index < nentries) {
				1719	ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
				1720	ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
				1721	start_rec_copy = 0;
				1722
				1723	/* If first write for transaction, insert start record.
				1724	* We can't be trying to commit if we are inited. We can't
				1725	* have any "partial_copy" if we are inited.
				1726	*/
				1727	if (ticket->t_flags & XLOG_TIC_INITED) {
				1728	logop_head = (xlog_op_header_t *)ptr;
				1729	INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
				1730	logop_head->oh_clientid = ticket->t_clientid;
				1731	logop_head->oh_len = 0;
				1732	logop_head->oh_flags = XLOG_START_TRANS;
				1733	logop_head->oh_res2 = 0;
				1734	ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
				1735	record_cnt++;
				1736
				1737	start_rec_copy = sizeof(xlog_op_header_t);
				1738	xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
				1739	}
				1740
				1741	/* Copy log operation header directly into data section */
				1742	logop_head = (xlog_op_header_t *)ptr;
				1743	INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
				1744	logop_head->oh_clientid = ticket->t_clientid;
				1745	logop_head->oh_res2 = 0;
				1746
				1747	/* header copied directly */
				1748	xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
				1749
				1750	/* are we copying a commit or unmount record? */
				1751	logop_head->oh_flags = flags;
				1752
				1753	/*
				1754	* We've seen logs corrupted with bad transaction client
				1755	* ids. This makes sure that XFS doesn't generate them on.
				1756	* Turn this into an EIO and shut down the filesystem.
				1757	*/
				1758	switch (logop_head->oh_clientid) {
				1759	case XFS_TRANSACTION:
				1760	case XFS_VOLUME:
				1761	case XFS_LOG:
				1762	break;
				1763	default:
				1764	xfs_fs_cmn_err(CE_WARN, mp,
				1765	"Bad XFS transaction clientid 0x%x in ticket 0x%p",
				1766	logop_head->oh_clientid, tic);
				1767	return XFS_ERROR(EIO);
				1768	}
				1769
				1770	/* Partial write last time? => (partial_copy != 0)
				1771	* need_copy is the amount we'd like to copy if everything could
				1772	* fit in the current memcpy.
				1773	*/
				1774	need_copy = reg[index].i_len - partial_copy_len;
				1775
				1776	copy_off = partial_copy_len;
				1777	if (need_copy <= iclog->ic_size - log_offset) { /complete write /
				1778	INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy);
				1779	if (partial_copy)
				1780	logop_head->oh_flags\|= (XLOG_END_TRANS\|XLOG_WAS_CONT_TRANS);
				1781	partial_copy_len = partial_copy = 0;
				1782	} else { /* partial write */
				1783	copy_len = iclog->ic_size - log_offset;
				1784	INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len);
				1785	logop_head->oh_flags \|= XLOG_CONTINUE_TRANS;
				1786	if (partial_copy)
				1787	logop_head->oh_flags \|= XLOG_WAS_CONT_TRANS;
				1788	partial_copy_len += copy_len;
				1789	partial_copy++;
				1790	len += sizeof(xlog_op_header_t); /* from splitting of region */
				1791	/* account for new log op header */
				1792	ticket->t_curr_res -= sizeof(xlog_op_header_t);
				1793	}
				1794	xlog_verify_dest_ptr(log, ptr);
				1795
				1796	/* copy region */
				1797	ASSERT(copy_len >= 0);
				1798	memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
				1799	xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
				1800
				1801	/* make copy_len total bytes copied, including headers */
				1802	copy_len += start_rec_copy + sizeof(xlog_op_header_t);
				1803	record_cnt++;
				1804	data_cnt += contwr ? copy_len : 0;
				1805	if (partial_copy) { /* copied partial region */
				1806	/* already marked WANT_SYNC by xlog_state_get_iclog_space */
				1807	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
				1808	record_cnt = data_cnt = 0;
				1809	if ((error = xlog_state_release_iclog(log, iclog)))
				1810	return (error);
				1811	break; /* don't increment index */
				1812	} else { /* copied entire region */
				1813	index++;
				1814	partial_copy_len = partial_copy = 0;
				1815
				1816	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
				1817	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
				1818	record_cnt = data_cnt = 0;
				1819	xlog_state_want_sync(log, iclog);
				1820	if (commit_iclog) {
				1821	ASSERT(flags & XLOG_COMMIT_TRANS);
				1822	*commit_iclog = iclog;
				1823	} else if ((error = xlog_state_release_iclog(log, iclog)))
				1824	return (error);
				1825	if (index == nentries)
				1826	return 0; /* we are done */
				1827	else
				1828	break;
				1829	}
				1830	} /* if (partial_copy) */
				1831	} /* while (index < nentries) */
				1832	} /* for (index = 0; index < nentries; ) */
				1833	ASSERT(len == 0);
				1834
				1835	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
				1836	if (commit_iclog) {
				1837	ASSERT(flags & XLOG_COMMIT_TRANS);
				1838	*commit_iclog = iclog;
				1839	return 0;
				1840	}
				1841	return (xlog_state_release_iclog(log, iclog));
				1842	} /* xlog_write */
				1843
				1844
				1845	/*****************************************************************************
				1846	*
				1847	* State Machine functions
				1848	*
				1849	*****************************************************************************
				1850	*/
				1851
				1852	/* Clean iclogs starting from the head. This ordering must be
				1853	* maintained, so an iclog doesn't become ACTIVE beyond one that
				1854	* is SYNCING. This is also required to maintain the notion that we use
				1855	* a counting semaphore to hold off would be writers to the log when every
				1856	* iclog is trying to sync to disk.
				1857	*
				1858	* State Change: DIRTY -> ACTIVE
				1859	*/
				1860	void
				1861	xlog_state_clean_log(xlog_t *log)
				1862	{
				1863	xlog_in_core_t *iclog;
				1864	int changed = 0;
				1865
				1866	iclog = log->l_iclog;
				1867	do {
				1868	if (iclog->ic_state == XLOG_STATE_DIRTY) {
				1869	iclog->ic_state = XLOG_STATE_ACTIVE;
				1870	iclog->ic_offset = 0;
				1871	iclog->ic_callback = NULL; /* don't need to free */
				1872	/*
				1873	* If the number of ops in this iclog indicate it just
				1874	* contains the dummy transaction, we can
				1875	* change state into IDLE (the second time around).
				1876	* Otherwise we should change the state into
				1877	* NEED a dummy.
				1878	* We don't need to cover the dummy.
				1879	*/
				1880	if (!changed &&
				1881	(INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) {
				1882	changed = 1;
				1883	} else {
				1884	/*
				1885	* We have two dirty iclogs so start over
				1886	* This could also be num of ops indicates
				1887	* this is not the dummy going out.
				1888	*/
				1889	changed = 2;
				1890	}
				1891	iclog->ic_header.h_num_logops = 0;
				1892	memset(iclog->ic_header.h_cycle_data, 0,
				1893	sizeof(iclog->ic_header.h_cycle_data));
				1894	iclog->ic_header.h_lsn = 0;
				1895	} else if (iclog->ic_state == XLOG_STATE_ACTIVE)
				1896	/* do nothing */;
				1897	else
				1898	break; /* stop cleaning */
				1899	iclog = iclog->ic_next;
				1900	} while (iclog != log->l_iclog);
				1901
				1902	/* log is locked when we are called */
				1903	/*
				1904	* Change state for the dummy log recording.
				1905	* We usually go to NEED. But we go to NEED2 if the changed indicates
				1906	* we are done writing the dummy record.
				1907	* If we are done with the second dummy recored (DONE2), then
				1908	* we go to IDLE.
				1909	*/
				1910	if (changed) {
				1911	switch (log->l_covered_state) {
				1912	case XLOG_STATE_COVER_IDLE:
				1913	case XLOG_STATE_COVER_NEED:
				1914	case XLOG_STATE_COVER_NEED2:
				1915	log->l_covered_state = XLOG_STATE_COVER_NEED;
				1916	break;
				1917
				1918	case XLOG_STATE_COVER_DONE:
				1919	if (changed == 1)
				1920	log->l_covered_state = XLOG_STATE_COVER_NEED2;
				1921	else
				1922	log->l_covered_state = XLOG_STATE_COVER_NEED;
				1923	break;
				1924
				1925	case XLOG_STATE_COVER_DONE2:
				1926	if (changed == 1)
				1927	log->l_covered_state = XLOG_STATE_COVER_IDLE;
				1928	else
				1929	log->l_covered_state = XLOG_STATE_COVER_NEED;
				1930	break;
				1931
				1932	default:
				1933	ASSERT(0);
				1934	}
				1935	}
				1936	} /* xlog_state_clean_log */
				1937
				1938	STATIC xfs_lsn_t
				1939	xlog_get_lowest_lsn(
				1940	xlog_t *log)
				1941	{
				1942	xlog_in_core_t *lsn_log;
				1943	xfs_lsn_t lowest_lsn, lsn;
				1944
				1945	lsn_log = log->l_iclog;
				1946	lowest_lsn = 0;
				1947	do {
				1948	if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE\|XLOG_STATE_DIRTY))) {
				1949	lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT);
				1950	if ((lsn && !lowest_lsn) \|\|
				1951	(XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
				1952	lowest_lsn = lsn;
				1953	}
				1954	}
				1955	lsn_log = lsn_log->ic_next;
				1956	} while (lsn_log != log->l_iclog);
				1957	return(lowest_lsn);
				1958	}
				1959
				1960
				1961	STATIC void
				1962	xlog_state_do_callback(
				1963	xlog_t *log,
				1964	int aborted,
				1965	xlog_in_core_t *ciclog)
				1966	{
				1967	xlog_in_core_t *iclog;
				1968	xlog_in_core_t first_iclog; / used to know when we've
				1969	* processed all iclogs once */
				1970	xfs_log_callback_t cb, cb_next;
				1971	int flushcnt = 0;
				1972	xfs_lsn_t lowest_lsn;
				1973	int ioerrors; /* counter: iclogs with errors */
				1974	int loopdidcallbacks; /* flag: inner loop did callbacks*/
				1975	int funcdidcallbacks; /* flag: function did callbacks */
				1976	int repeats; /* for issuing console warnings if
				1977	* looping too many times */
				1978	SPLDECL(s);
				1979
				1980	s = LOG_LOCK(log);
				1981	first_iclog = iclog = log->l_iclog;
				1982	ioerrors = 0;
				1983	funcdidcallbacks = 0;
				1984	repeats = 0;
				1985
				1986	do {
				1987	/*
				1988	* Scan all iclogs starting with the one pointed to by the
				1989	* log. Reset this starting point each time the log is
				1990	* unlocked (during callbacks).
				1991	*
				1992	* Keep looping through iclogs until one full pass is made
				1993	* without running any callbacks.
				1994	*/
				1995	first_iclog = log->l_iclog;
				1996	iclog = log->l_iclog;
				1997	loopdidcallbacks = 0;
				1998	repeats++;
				1999
				2000	do {
				2001
				2002	/* skip all iclogs in the ACTIVE & DIRTY states */
				2003	if (iclog->ic_state &
				2004	(XLOG_STATE_ACTIVE\|XLOG_STATE_DIRTY)) {
				2005	iclog = iclog->ic_next;
				2006	continue;
				2007	}
				2008
				2009	/*
				2010	* Between marking a filesystem SHUTDOWN and stopping
				2011	* the log, we do flush all iclogs to disk (if there
				2012	* wasn't a log I/O error). So, we do want things to
				2013	* go smoothly in case of just a SHUTDOWN w/o a
				2014	* LOG_IO_ERROR.
				2015	*/
				2016	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
				2017	/*
				2018	* Can only perform callbacks in order. Since
				2019	* this iclog is not in the DONE_SYNC/
				2020	* DO_CALLBACK state, we skip the rest and
				2021	* just try to clean up. If we set our iclog
				2022	* to DO_CALLBACK, we will not process it when
				2023	* we retry since a previous iclog is in the
				2024	* CALLBACK and the state cannot change since
				2025	* we are holding the LOG_LOCK.
				2026	*/
				2027	if (!(iclog->ic_state &
				2028	(XLOG_STATE_DONE_SYNC \|
				2029	XLOG_STATE_DO_CALLBACK))) {
				2030	if (ciclog && (ciclog->ic_state ==
				2031	XLOG_STATE_DONE_SYNC)) {
				2032	ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
				2033	}
				2034	break;
				2035	}
				2036	/*
				2037	* We now have an iclog that is in either the
				2038	* DO_CALLBACK or DONE_SYNC states. The other
				2039	* states (WANT_SYNC, SYNCING, or CALLBACK were
				2040	* caught by the above if and are going to
				2041	* clean (i.e. we aren't doing their callbacks)
				2042	* see the above if.
				2043	*/
				2044
				2045	/*
				2046	* We will do one more check here to see if we
				2047	* have chased our tail around.
				2048	*/
				2049
				2050	lowest_lsn = xlog_get_lowest_lsn(log);
				2051	if (lowest_lsn && (
				2052	XFS_LSN_CMP(
				2053	lowest_lsn,
				2054	INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
				2055	)<0)) {
				2056	iclog = iclog->ic_next;
				2057	continue; /* Leave this iclog for
				2058	* another thread */
				2059	}
				2060
				2061	iclog->ic_state = XLOG_STATE_CALLBACK;
				2062
				2063	LOG_UNLOCK(log, s);
				2064
				2065	/* l_last_sync_lsn field protected by
				2066	* GRANT_LOCK. Don't worry about iclog's lsn.
				2067	* No one else can be here except us.
				2068	*/
				2069	s = GRANT_LOCK(log);
				2070	ASSERT(XFS_LSN_CMP(
				2071	log->l_last_sync_lsn,
				2072	INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
				2073	)<=0);
				2074	log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
				2075	GRANT_UNLOCK(log, s);
				2076
				2077	/*
				2078	* Keep processing entries in the callback list
				2079	* until we come around and it is empty. We
				2080	* need to atomically see that the list is
				2081	* empty and change the state to DIRTY so that
				2082	* we don't miss any more callbacks being added.
				2083	*/
				2084	s = LOG_LOCK(log);
				2085	} else {
				2086	ioerrors++;
				2087	}
				2088	cb = iclog->ic_callback;
				2089
				2090	while (cb != 0) {
				2091	iclog->ic_callback_tail = &(iclog->ic_callback);
				2092	iclog->ic_callback = NULL;
				2093	LOG_UNLOCK(log, s);
				2094
				2095	/* perform callbacks in the order given */
				2096	for (; cb != 0; cb = cb_next) {
				2097	cb_next = cb->cb_next;
				2098	cb->cb_func(cb->cb_arg, aborted);
				2099	}
				2100	s = LOG_LOCK(log);
				2101	cb = iclog->ic_callback;
				2102	}
				2103
				2104	loopdidcallbacks++;
				2105	funcdidcallbacks++;
				2106
				2107	ASSERT(iclog->ic_callback == 0);
				2108	if (!(iclog->ic_state & XLOG_STATE_IOERROR))
				2109	iclog->ic_state = XLOG_STATE_DIRTY;
				2110
				2111	/*
				2112	* Transition from DIRTY to ACTIVE if applicable.
				2113	* NOP if STATE_IOERROR.
				2114	*/
				2115	xlog_state_clean_log(log);
				2116
				2117	/* wake up threads waiting in xfs_log_force() */
				2118	sv_broadcast(&iclog->ic_forcesema);
				2119
				2120	iclog = iclog->ic_next;
				2121	} while (first_iclog != iclog);
				2122	if (repeats && (repeats % 10) == 0) {
				2123	xfs_fs_cmn_err(CE_WARN, log->l_mp,
				2124	"xlog_state_do_callback: looping %d", repeats);
				2125	}
				2126	} while (!ioerrors && loopdidcallbacks);
				2127
				2128	/*
				2129	* make one last gasp attempt to see if iclogs are being left in
				2130	* limbo..
				2131	*/
				2132	#ifdef DEBUG
				2133	if (funcdidcallbacks) {
				2134	first_iclog = iclog = log->l_iclog;
				2135	do {
				2136	ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
				2137	/*
				2138	* Terminate the loop if iclogs are found in states
				2139	* which will cause other threads to clean up iclogs.
				2140	*
				2141	* SYNCING - i/o completion will go through logs
				2142	* DONE_SYNC - interrupt thread should be waiting for
				2143	* LOG_LOCK
				2144	* IOERROR - give up hope all ye who enter here
				2145	*/
				2146	if (iclog->ic_state == XLOG_STATE_WANT_SYNC \|\|
				2147	iclog->ic_state == XLOG_STATE_SYNCING \|\|
				2148	iclog->ic_state == XLOG_STATE_DONE_SYNC \|\|
				2149	iclog->ic_state == XLOG_STATE_IOERROR )
				2150	break;
				2151	iclog = iclog->ic_next;
				2152	} while (first_iclog != iclog);
				2153	}
				2154	#endif
				2155
				2156	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE\|XLOG_STATE_IOERROR)) {
				2157	flushcnt = log->l_flushcnt;
				2158	log->l_flushcnt = 0;
				2159	}
				2160	LOG_UNLOCK(log, s);
				2161	while (flushcnt--)
				2162	vsema(&log->l_flushsema);
				2163	} /* xlog_state_do_callback */
				2164
				2165
				2166	/*
				2167	* Finish transitioning this iclog to the dirty state.
				2168	*
				2169	* Make sure that we completely execute this routine only when this is
				2170	* the last call to the iclog. There is a good chance that iclog flushes,
				2171	* when we reach the end of the physical log, get turned into 2 separate
				2172	* calls to bwrite. Hence, one iclog flush could generate two calls to this
				2173	* routine. By using the reference count bwritecnt, we guarantee that only
				2174	* the second completion goes through.
				2175	*
				2176	* Callbacks could take time, so they are done outside the scope of the
				2177	* global state machine log lock. Assume that the calls to cvsema won't
				2178	* take a long time. At least we know it won't sleep.
				2179	*/
				2180	void
				2181	xlog_state_done_syncing(
				2182	xlog_in_core_t *iclog,
				2183	int aborted)
				2184	{
				2185	xlog_t *log = iclog->ic_log;
				2186	SPLDECL(s);
				2187
				2188	s = LOG_LOCK(log);
				2189
				2190	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING \|\|
				2191	iclog->ic_state == XLOG_STATE_IOERROR);
				2192	ASSERT(iclog->ic_refcnt == 0);
				2193	ASSERT(iclog->ic_bwritecnt == 1 \|\| iclog->ic_bwritecnt == 2);
				2194
				2195
				2196	/*
				2197	* If we got an error, either on the first buffer, or in the case of
				2198	* split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
				2199	* and none should ever be attempted to be written to disk
				2200	* again.
				2201	*/
				2202	if (iclog->ic_state != XLOG_STATE_IOERROR) {
				2203	if (--iclog->ic_bwritecnt == 1) {
				2204	LOG_UNLOCK(log, s);
				2205	return;
				2206	}
				2207	iclog->ic_state = XLOG_STATE_DONE_SYNC;
				2208	}
				2209
				2210	/*
				2211	* Someone could be sleeping prior to writing out the next
				2212	* iclog buffer, we wake them all, one will get to do the
				2213	* I/O, the others get to wait for the result.
				2214	*/
				2215	sv_broadcast(&iclog->ic_writesema);
				2216	LOG_UNLOCK(log, s);
				2217	xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
				2218	} /* xlog_state_done_syncing */
				2219
				2220
				2221	/*
				2222	* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
				2223	* sleep. The flush semaphore is set to the number of in-core buffers and
				2224	* decremented around disk syncing. Therefore, if all buffers are syncing,
				2225	* this semaphore will cause new writes to sleep until a sync completes.
				2226	* Otherwise, this code just does p() followed by v(). This approximates
				2227	* a sleep/wakeup except we can't race.
				2228	*
				2229	* The in-core logs are used in a circular fashion. They are not used
				2230	* out-of-order even when an iclog past the head is free.
				2231	*
				2232	* return:
				2233	* * log_offset where xlog_write() can start writing into the in-core
				2234	* log's data space.
				2235	* * in-core log pointer to which xlog_write() should write.
				2236	* * boolean indicating this is a continued write to an in-core log.
				2237	* If this is the last write, then the in-core log's offset field
				2238	* needs to be incremented, depending on the amount of data which
				2239	* is copied.
				2240	*/
				2241	int
				2242	xlog_state_get_iclog_space(xlog_t *log,
				2243	int len,
				2244	xlog_in_core_t **iclogp,
				2245	xlog_ticket_t *ticket,
				2246	int *continued_write,
				2247	int *logoffsetp)
				2248	{
				2249	SPLDECL(s);
				2250	int log_offset;
				2251	xlog_rec_header_t *head;
				2252	xlog_in_core_t *iclog;
				2253	int error;
				2254
				2255	restart:
				2256	s = LOG_LOCK(log);
				2257	if (XLOG_FORCED_SHUTDOWN(log)) {
				2258	LOG_UNLOCK(log, s);
				2259	return XFS_ERROR(EIO);
				2260	}
				2261
				2262	iclog = log->l_iclog;
				2263	if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
				2264	log->l_flushcnt++;
				2265	LOG_UNLOCK(log, s);
				2266	xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
				2267	XFS_STATS_INC(xs_log_noiclogs);
				2268	/* Ensure that log writes happen */
				2269	psema(&log->l_flushsema, PINOD);
				2270	goto restart;
				2271	}
				2272	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
				2273	head = &iclog->ic_header;
				2274
				2275	iclog->ic_refcnt++; /* prevents sync */
				2276	log_offset = iclog->ic_offset;
				2277
				2278	/* On the 1st write to an iclog, figure out lsn. This works
				2279	* if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
				2280	* committing to. If the offset is set, that's how many blocks
				2281	* must be written.
				2282	*/
				2283	if (log_offset == 0) {
				2284	ticket->t_curr_res -= log->l_iclog_hsize;
				2285	INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
				2286	ASSIGN_LSN(head->h_lsn, log);
				2287	ASSERT(log->l_curr_block >= 0);
				2288	}
				2289
				2290	/* If there is enough room to write everything, then do it. Otherwise,
				2291	* claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
				2292	* bit is on, so this will get flushed out. Don't update ic_offset
				2293	* until you know exactly how many bytes get copied. Therefore, wait
				2294	* until later to update ic_offset.
				2295	*
				2296	* xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
				2297	* can fit into remaining data section.
				2298	*/
				2299	if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
				2300	xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
				2301
				2302	/* If I'm the only one writing to this iclog, sync it to disk */
				2303	if (iclog->ic_refcnt == 1) {
				2304	LOG_UNLOCK(log, s);
				2305	if ((error = xlog_state_release_iclog(log, iclog)))
				2306	return (error);
				2307	} else {
				2308	iclog->ic_refcnt--;
				2309	LOG_UNLOCK(log, s);
				2310	}
				2311	goto restart;
				2312	}
				2313
				2314	/* Do we have enough room to write the full amount in the remainder
				2315	* of this iclog? Or must we continue a write on the next iclog and
				2316	* mark this iclog as completely taken? In the case where we switch
				2317	* iclogs (to mark it taken), this particular iclog will release/sync
				2318	* to disk in xlog_write().
				2319	*/
				2320	if (len <= iclog->ic_size - iclog->ic_offset) {
				2321	*continued_write = 0;
				2322	iclog->ic_offset += len;
				2323	} else {
				2324	*continued_write = 1;
				2325	xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
				2326	}
				2327	*iclogp = iclog;
				2328
				2329	ASSERT(iclog->ic_offset <= iclog->ic_size);
				2330	LOG_UNLOCK(log, s);
				2331
				2332	*logoffsetp = log_offset;
				2333	return 0;
				2334	} /* xlog_state_get_iclog_space */
				2335
				2336	/*
				2337	* Atomically get the log space required for a log ticket.
				2338	*
				2339	* Once a ticket gets put onto the reserveq, it will only return after
				2340	* the needed reservation is satisfied.
				2341	*/
				2342	STATIC int
				2343	xlog_grant_log_space(xlog_t *log,
				2344	xlog_ticket_t *tic)
				2345	{
				2346	int free_bytes;
				2347	int need_bytes;
				2348	SPLDECL(s);
				2349	#ifdef DEBUG
				2350	xfs_lsn_t tail_lsn;
				2351	#endif
				2352
				2353
				2354	#ifdef DEBUG
				2355	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				2356	panic("grant Recovery problem");
				2357	#endif
				2358
				2359	/* Is there space or do we need to sleep? */
				2360	s = GRANT_LOCK(log);
				2361	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
				2362
				2363	/* something is already sleeping; insert new transaction at end */
				2364	if (log->l_reserve_headq) {
				2365	XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
				2366	xlog_trace_loggrant(log, tic,
				2367	"xlog_grant_log_space: sleep 1");
				2368	/*
				2369	* Gotta check this before going to sleep, while we're
				2370	* holding the grant lock.
				2371	*/
				2372	if (XLOG_FORCED_SHUTDOWN(log))
				2373	goto error_return;
				2374
				2375	XFS_STATS_INC(xs_sleep_logspace);
				2376	sv_wait(&tic->t_sema, PINOD\|PLTWAIT, &log->l_grant_lock, s);
				2377	/*
				2378	* If we got an error, and the filesystem is shutting down,
				2379	* we'll catch it down below. So just continue...
				2380	*/
				2381	xlog_trace_loggrant(log, tic,
				2382	"xlog_grant_log_space: wake 1");
				2383	s = GRANT_LOCK(log);
				2384	}
				2385	if (tic->t_flags & XFS_LOG_PERM_RESERV)
				2386	need_bytes = tic->t_unit_res*tic->t_ocnt;
				2387	else
				2388	need_bytes = tic->t_unit_res;
				2389
				2390	redo:
				2391	if (XLOG_FORCED_SHUTDOWN(log))
				2392	goto error_return;
				2393
				2394	free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
				2395	log->l_grant_reserve_bytes);
				2396	if (free_bytes < need_bytes) {
				2397	if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
				2398	XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
				2399	xlog_trace_loggrant(log, tic,
				2400	"xlog_grant_log_space: sleep 2");
				2401	XFS_STATS_INC(xs_sleep_logspace);
				2402	sv_wait(&tic->t_sema, PINOD\|PLTWAIT, &log->l_grant_lock, s);
				2403
				2404	if (XLOG_FORCED_SHUTDOWN(log)) {
				2405	s = GRANT_LOCK(log);
				2406	goto error_return;
				2407	}
				2408
				2409	xlog_trace_loggrant(log, tic,
				2410	"xlog_grant_log_space: wake 2");
				2411	xlog_grant_push_ail(log->l_mp, need_bytes);
				2412	s = GRANT_LOCK(log);
				2413	goto redo;
				2414	} else if (tic->t_flags & XLOG_TIC_IN_Q)
				2415	XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
				2416
				2417	/* we've got enough space */
				2418	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w');
				2419	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
				2420	#ifdef DEBUG
				2421	tail_lsn = log->l_tail_lsn;
				2422	/*
				2423	* Check to make sure the grant write head didn't just over lap the
				2424	* tail. If the cycles are the same, we can't be overlapping.
				2425	* Otherwise, make sure that the cycles differ by exactly one and
				2426	* check the byte count.
				2427	*/
				2428	if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
				2429	ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
				2430	ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
				2431	}
				2432	#endif
				2433	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
				2434	xlog_verify_grant_head(log, 1);
				2435	GRANT_UNLOCK(log, s);
				2436	return 0;
				2437
				2438	error_return:
				2439	if (tic->t_flags & XLOG_TIC_IN_Q)
				2440	XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
				2441	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
				2442	/*
				2443	* If we are failing, make sure the ticket doesn't have any
				2444	* current reservations. We don't want to add this back when
				2445	* the ticket/transaction gets cancelled.
				2446	*/
				2447	tic->t_curr_res = 0;
				2448	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
				2449	GRANT_UNLOCK(log, s);
				2450	return XFS_ERROR(EIO);
				2451	} /* xlog_grant_log_space */
				2452
				2453
				2454	/*
				2455	* Replenish the byte reservation required by moving the grant write head.
				2456	*
				2457	*
				2458	*/
				2459	STATIC int
				2460	xlog_regrant_write_log_space(xlog_t *log,
				2461	xlog_ticket_t *tic)
				2462	{
				2463	SPLDECL(s);
				2464	int free_bytes, need_bytes;
				2465	xlog_ticket_t *ntic;
				2466	#ifdef DEBUG
				2467	xfs_lsn_t tail_lsn;
				2468	#endif
				2469
				2470	tic->t_curr_res = tic->t_unit_res;
				2471
				2472	if (tic->t_cnt > 0)
				2473	return (0);
				2474
				2475	#ifdef DEBUG
				2476	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
				2477	panic("regrant Recovery problem");
				2478	#endif
				2479
				2480	s = GRANT_LOCK(log);
				2481	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
				2482
				2483	if (XLOG_FORCED_SHUTDOWN(log))
				2484	goto error_return;
				2485
				2486	/* If there are other waiters on the queue then give them a
				2487	* chance at logspace before us. Wake up the first waiters,
				2488	* if we do not wake up all the waiters then go to sleep waiting
				2489	* for more free space, otherwise try to get some space for
				2490	* this transaction.
				2491	*/
				2492
				2493	if ((ntic = log->l_write_headq)) {
				2494	free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
				2495	log->l_grant_write_bytes);
				2496	do {
				2497	ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
				2498
				2499	if (free_bytes < ntic->t_unit_res)
				2500	break;
				2501	free_bytes -= ntic->t_unit_res;
				2502	sv_signal(&ntic->t_sema);
				2503	ntic = ntic->t_next;
				2504	} while (ntic != log->l_write_headq);
				2505
				2506	if (ntic != log->l_write_headq) {
				2507	if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
				2508	XLOG_INS_TICKETQ(log->l_write_headq, tic);
				2509
				2510	xlog_trace_loggrant(log, tic,
				2511	"xlog_regrant_write_log_space: sleep 1");
				2512	XFS_STATS_INC(xs_sleep_logspace);
				2513	sv_wait(&tic->t_sema, PINOD\|PLTWAIT,
				2514	&log->l_grant_lock, s);
				2515
				2516	/* If we're shutting down, this tic is already
				2517	* off the queue */
				2518	if (XLOG_FORCED_SHUTDOWN(log)) {
				2519	s = GRANT_LOCK(log);
				2520	goto error_return;
				2521	}
				2522
				2523	xlog_trace_loggrant(log, tic,
				2524	"xlog_regrant_write_log_space: wake 1");
				2525	xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
				2526	s = GRANT_LOCK(log);
				2527	}
				2528	}
				2529
				2530	need_bytes = tic->t_unit_res;
				2531
				2532	redo:
				2533	if (XLOG_FORCED_SHUTDOWN(log))
				2534	goto error_return;
				2535
				2536	free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
				2537	log->l_grant_write_bytes);
				2538	if (free_bytes < need_bytes) {
				2539	if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
				2540	XLOG_INS_TICKETQ(log->l_write_headq, tic);
				2541	XFS_STATS_INC(xs_sleep_logspace);
				2542	sv_wait(&tic->t_sema, PINOD\|PLTWAIT, &log->l_grant_lock, s);
				2543
				2544	/* If we're shutting down, this tic is already off the queue */
				2545	if (XLOG_FORCED_SHUTDOWN(log)) {
				2546	s = GRANT_LOCK(log);
				2547	goto error_return;
				2548	}
				2549
				2550	xlog_trace_loggrant(log, tic,
				2551	"xlog_regrant_write_log_space: wake 2");
				2552	xlog_grant_push_ail(log->l_mp, need_bytes);
				2553	s = GRANT_LOCK(log);
				2554	goto redo;
				2555	} else if (tic->t_flags & XLOG_TIC_IN_Q)
				2556	XLOG_DEL_TICKETQ(log->l_write_headq, tic);
				2557
				2558	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */
				2559	#ifdef DEBUG
				2560	tail_lsn = log->l_tail_lsn;
				2561	if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
				2562	ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
				2563	ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
				2564	}
				2565	#endif
				2566
				2567	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
				2568	xlog_verify_grant_head(log, 1);
				2569	GRANT_UNLOCK(log, s);
				2570	return (0);
				2571
				2572
				2573	error_return:
				2574	if (tic->t_flags & XLOG_TIC_IN_Q)
				2575	XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
				2576	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
				2577	/*
				2578	* If we are failing, make sure the ticket doesn't have any
				2579	* current reservations. We don't want to add this back when
				2580	* the ticket/transaction gets cancelled.
				2581	*/
				2582	tic->t_curr_res = 0;
				2583	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
				2584	GRANT_UNLOCK(log, s);
				2585	return XFS_ERROR(EIO);
				2586	} /* xlog_regrant_write_log_space */
				2587
				2588
				2589	/* The first cnt-1 times through here we don't need to
				2590	* move the grant write head because the permanent
				2591	* reservation has reserved cnt times the unit amount.
				2592	* Release part of current permanent unit reservation and
				2593	* reset current reservation to be one units worth. Also
				2594	* move grant reservation head forward.
				2595	*/
				2596	STATIC void
				2597	xlog_regrant_reserve_log_space(xlog_t *log,
				2598	xlog_ticket_t *ticket)
				2599	{
				2600	SPLDECL(s);
				2601
				2602	xlog_trace_loggrant(log, ticket,
				2603	"xlog_regrant_reserve_log_space: enter");
				2604	if (ticket->t_cnt > 0)
				2605	ticket->t_cnt--;
				2606
				2607	s = GRANT_LOCK(log);
				2608	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
				2609	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
				2610	ticket->t_curr_res = ticket->t_unit_res;
				2611	xlog_trace_loggrant(log, ticket,
				2612	"xlog_regrant_reserve_log_space: sub current res");
				2613	xlog_verify_grant_head(log, 1);
				2614
				2615	/* just return if we still have some of the pre-reserved space */
				2616	if (ticket->t_cnt > 0) {
				2617	GRANT_UNLOCK(log, s);
				2618	return;
				2619	}
				2620
				2621	XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r');
				2622	xlog_trace_loggrant(log, ticket,
				2623	"xlog_regrant_reserve_log_space: exit");
				2624	xlog_verify_grant_head(log, 0);
				2625	GRANT_UNLOCK(log, s);
				2626	ticket->t_curr_res = ticket->t_unit_res;
				2627	} /* xlog_regrant_reserve_log_space */
				2628
				2629
				2630	/*
				2631	* Give back the space left from a reservation.
				2632	*
				2633	* All the information we need to make a correct determination of space left
				2634	* is present. For non-permanent reservations, things are quite easy. The
				2635	* count should have been decremented to zero. We only need to deal with the
				2636	* space remaining in the current reservation part of the ticket. If the
				2637	* ticket contains a permanent reservation, there may be left over space which
				2638	* needs to be released. A count of N means that N-1 refills of the current
				2639	* reservation can be done before we need to ask for more space. The first
				2640	* one goes to fill up the first current reservation. Once we run out of
				2641	* space, the count will stay at zero and the only space remaining will be
				2642	* in the current reservation field.
				2643	*/
				2644	STATIC void
				2645	xlog_ungrant_log_space(xlog_t *log,
				2646	xlog_ticket_t *ticket)
				2647	{
				2648	SPLDECL(s);
				2649
				2650	if (ticket->t_cnt > 0)
				2651	ticket->t_cnt--;
				2652
				2653	s = GRANT_LOCK(log);
				2654	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
				2655
				2656	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
				2657	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
				2658
				2659	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
				2660
				2661	/* If this is a permanent reservation ticket, we may be able to free
				2662	* up more space based on the remaining count.
				2663	*/
				2664	if (ticket->t_cnt > 0) {
				2665	ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
				2666	XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w');
				2667	XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
				2668	}
				2669
				2670	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
				2671	xlog_verify_grant_head(log, 1);
				2672	GRANT_UNLOCK(log, s);
				2673	xfs_log_move_tail(log->l_mp, 1);
				2674	} /* xlog_ungrant_log_space */
				2675
				2676
				2677	/*
				2678	* Atomically put back used ticket.
				2679	*/
				2680	void
				2681	xlog_state_put_ticket(xlog_t *log,
				2682	xlog_ticket_t *tic)
				2683	{
				2684	unsigned long s;
				2685
				2686	s = LOG_LOCK(log);
				2687	xlog_ticket_put(log, tic);
				2688	LOG_UNLOCK(log, s);
				2689	} /* xlog_state_put_ticket */
				2690
				2691	/*
				2692	* Flush iclog to disk if this is the last reference to the given iclog and
				2693	* the WANT_SYNC bit is set.
				2694	*
				2695	* When this function is entered, the iclog is not necessarily in the
				2696	* WANT_SYNC state. It may be sitting around waiting to get filled.
				2697	*
				2698	*
				2699	*/
				2700	int
				2701	xlog_state_release_iclog(xlog_t *log,
				2702	xlog_in_core_t *iclog)
				2703	{
				2704	SPLDECL(s);
				2705	int sync = 0; /* do we sync? */
				2706
				2707	xlog_assign_tail_lsn(log->l_mp);
				2708
				2709	s = LOG_LOCK(log);
				2710
				2711	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				2712	LOG_UNLOCK(log, s);
				2713	return XFS_ERROR(EIO);
				2714	}
				2715
				2716	ASSERT(iclog->ic_refcnt > 0);
				2717	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				2718	iclog->ic_state == XLOG_STATE_WANT_SYNC);
				2719
				2720	if (--iclog->ic_refcnt == 0 &&
				2721	iclog->ic_state == XLOG_STATE_WANT_SYNC) {
				2722	sync++;
				2723	iclog->ic_state = XLOG_STATE_SYNCING;
				2724	INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn);
				2725	xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
				2726	/* cycle incremented when incrementing curr_block */
				2727	}
				2728
				2729	LOG_UNLOCK(log, s);
				2730
				2731	/*
				2732	* We let the log lock go, so it's possible that we hit a log I/O
				2733	* error or someother SHUTDOWN condition that marks the iclog
				2734	* as XLOG_STATE_IOERROR before the bwrite. However, we know that
				2735	* this iclog has consistent data, so we ignore IOERROR
				2736	* flags after this point.
				2737	*/
				2738	if (sync) {
				2739	return xlog_sync(log, iclog);
				2740	}
				2741	return (0);
				2742
				2743	} /* xlog_state_release_iclog */
				2744
				2745
				2746	/*
				2747	* This routine will mark the current iclog in the ring as WANT_SYNC
				2748	* and move the current iclog pointer to the next iclog in the ring.
				2749	* When this routine is called from xlog_state_get_iclog_space(), the
				2750	* exact size of the iclog has not yet been determined. All we know is
				2751	* that every data block. We have run out of space in this log record.
				2752	*/
				2753	STATIC void
				2754	xlog_state_switch_iclogs(xlog_t *log,
				2755	xlog_in_core_t *iclog,
				2756	int eventual_size)
				2757	{
				2758	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
				2759	if (!eventual_size)
				2760	eventual_size = iclog->ic_offset;
				2761	iclog->ic_state = XLOG_STATE_WANT_SYNC;
				2762	INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block);
				2763	log->l_prev_block = log->l_curr_block;
				2764	log->l_prev_cycle = log->l_curr_cycle;
				2765
				2766	/* roll log?: ic_offset changed later */
				2767	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
				2768
				2769	/* Round up to next log-sunit */
				2770	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
				2771	log->l_mp->m_sb.sb_logsunit > 1) {
				2772	__uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
				2773	log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
				2774	}
				2775
				2776	if (log->l_curr_block >= log->l_logBBsize) {
				2777	log->l_curr_cycle++;
				2778	if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
				2779	log->l_curr_cycle++;
				2780	log->l_curr_block -= log->l_logBBsize;
				2781	ASSERT(log->l_curr_block >= 0);
				2782	}
				2783	ASSERT(iclog == log->l_iclog);
				2784	log->l_iclog = iclog->ic_next;
				2785	} /* xlog_state_switch_iclogs */
				2786
				2787
				2788	/*
				2789	* Write out all data in the in-core log as of this exact moment in time.
				2790	*
				2791	* Data may be written to the in-core log during this call. However,
				2792	* we don't guarantee this data will be written out. A change from past
				2793	* implementation means this routine will not write out zero length LRs.
				2794	*
				2795	* Basically, we try and perform an intelligent scan of the in-core logs.
				2796	* If we determine there is no flushable data, we just return. There is no
				2797	* flushable data if:
				2798	*
				2799	* 1. the current iclog is active and has no data; the previous iclog
				2800	* is in the active or dirty state.
				2801	* 2. the current iclog is dirty, and the previous iclog is in the
				2802	* active or dirty state.
				2803	*
				2804	* We may sleep (call psema) if:
				2805	*
				2806	* 1. the current iclog is not in the active nor dirty state.
				2807	* 2. the current iclog dirty, and the previous iclog is not in the
				2808	* active nor dirty state.
				2809	* 3. the current iclog is active, and there is another thread writing
				2810	* to this particular iclog.
				2811	* 4. a) the current iclog is active and has no other writers
				2812	* b) when we return from flushing out this iclog, it is still
				2813	* not in the active nor dirty state.
				2814	*/
					/*
					 * Returns 0 on success, or XFS_ERROR(EIO) when an iclog is found in
					 * the IOERROR state or xlog_state_release_iclog() reports a failure.
					 * The caller only sleeps when flags contains XFS_LOG_SYNC.
					 */
				2815	STATIC int
				2816	xlog_state_sync_all(xlog_t *log, uint flags)
				2817	{
				2818	xlog_in_core_t *iclog;
				2819	xfs_lsn_t lsn;
				2820	SPLDECL(s);
				2821
				2822	s = LOG_LOCK(log);
				2823
				2824	iclog = log->l_iclog;
				2825	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				2826	LOG_UNLOCK(log, s);
				2827	return XFS_ERROR(EIO);
				2828	}
				2829
				2830	/* If the head iclog is not active nor dirty, we just attach
				2831	* ourselves to the head and go to sleep.
				2832	*/
				2833	if (iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				2834	iclog->ic_state == XLOG_STATE_DIRTY) {
				2835	/*
				2836	* If the head is dirty or (active and empty), then
				2837	* we need to look at the previous iclog. If the previous
				2838	* iclog is active or dirty we are done. There is nothing
				2839	* to sync out. Otherwise, we attach ourselves to the
				2840	* previous iclog and go to sleep.
				2841	*/
				2842	if (iclog->ic_state == XLOG_STATE_DIRTY \|\|
				2843	(iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
				2844	iclog = iclog->ic_prev;
				2845	if (iclog->ic_state == XLOG_STATE_ACTIVE \|\|
				2846	iclog->ic_state == XLOG_STATE_DIRTY)
				2847	goto no_sleep;
				2848	else
				2849	goto maybe_sleep;
				2850	} else {
				2851	if (iclog->ic_refcnt == 0) {
				2852	/* We are the only one with access to this
				2853	* iclog. Flush it out now. There should
				2854	* be a roundoff of zero to show that someone
				2855	* has already taken care of the roundoff from
				2856	* the previous sync.
				2857	*/
					/* Take a reference so the release below is the final one. */
				2858	iclog->ic_refcnt++;
					/* Remember the lsn so we can tell whether the iclog was reused. */
				2859	lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
				2860	xlog_state_switch_iclogs(log, iclog, 0);
				2861	LOG_UNLOCK(log, s);
				2862
				2863	if (xlog_state_release_iclog(log, iclog))
				2864	return XFS_ERROR(EIO);
				2865	s = LOG_LOCK(log);
					/*
					 * Same lsn and not yet dirty: our record is still on its way
					 * out, so we may need to wait for it.
					 */
				2866	if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
				2867	iclog->ic_state != XLOG_STATE_DIRTY)
				2868	goto maybe_sleep;
				2869	else
				2870	goto no_sleep;
				2871	} else {
				2872	/* Someone else is writing to this iclog.
				2873	* Use its call to flush out the data. However,
				2874	* the other thread may not force out this LR,
				2875	* so we mark it WANT_SYNC.
				2876	*/
				2877	xlog_state_switch_iclogs(log, iclog, 0);
				2878	goto maybe_sleep;
				2879	}
				2880	}
				2881	}
				2882
				2883	/* By the time we come around again, the iclog could've been filled
				2884	* which would give it another lsn. If we have a new lsn, just
				2885	* return because the relevant data has been flushed.
				2886	*/
				2887	maybe_sleep:
				2888	if (flags & XFS_LOG_SYNC) {
				2889	/*
				2890	* We must check if we're shutting down here, before
				2891	* we wait, while we're holding the LOG_LOCK.
				2892	* Then we check again after waking up, in case our
				2893	* sleep was disturbed by bad news.
				2894	*/
				2895	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				2896	LOG_UNLOCK(log, s);
				2897	return XFS_ERROR(EIO);
				2898	}
				2899	XFS_STATS_INC(xs_log_force_sleep);
					/*
					 * NOTE(review): there is no LOG_UNLOCK on this path, so sv_wait()
					 * presumably releases l_icloglock itself -- confirm.
					 */
				2900	sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
				2901	/*
				2902	* No need to grab the log lock here since we're
				2903	* only deciding whether or not to return EIO
				2904	* and the memory read should be atomic.
				2905	*/
				2906	if (iclog->ic_state & XLOG_STATE_IOERROR)
				2907	return XFS_ERROR(EIO);
				2908
				2909	} else {
				2910
					/*
					 * This label sits inside the else-branch: the "goto no_sleep"
					 * statements above jump straight here, and the non-SYNC case
					 * falls into it.  Either way the log lock is held and is
					 * dropped without sleeping.
					 */
				2911	no_sleep:
				2912	LOG_UNLOCK(log, s);
				2913	}
				2914	return 0;
				2915	} /* xlog_state_sync_all */
				2916
				2917
				2918	/*
				2919	* Used by code which implements synchronous log forces.
				2920	*
				2921	* Find in-core log with lsn.
				2922	* If it is in the DIRTY state, just return.
				2923	* If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
				2924	* state and go to sleep or return.
				2925	* If it is in any other state, go to sleep or return.
				2926	*
				2927	* If filesystem activity goes to zero, the iclog will get flushed only by
				2928	* bdflush().
				2929	*/
					/*
					 * Returns 0 when the iclog holding lsn is already DIRTY, when no
					 * iclog in the ring carries lsn, or after the (optional) wait
					 * completes; returns XFS_ERROR(EIO) when an iclog is in the
					 * IOERROR state or releasing the iclog fails.  The final wait on
					 * ic_forcesema only happens when flags contains XFS_LOG_SYNC.
					 */
				2930	int
				2931	xlog_state_sync(xlog_t *log,
				2932	xfs_lsn_t lsn,
				2933	uint flags)
				2934	{
				2935	xlog_in_core_t *iclog;
				2936	int already_slept = 0;
				2937	SPLDECL(s);
				2938
				2939
					/* Restarted from scratch after sleeping on the previous iclog. */
				2940	try_again:
				2941	s = LOG_LOCK(log);
				2942	iclog = log->l_iclog;
				2943
				2944	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				2945	LOG_UNLOCK(log, s);
				2946	return XFS_ERROR(EIO);
				2947	}
				2948
					/*
					 * Walk the iclog ring once looking for the iclog whose header
					 * lsn matches.  Every path below the lsn test returns or jumps
					 * to try_again, so the loop only iterates via the "continue".
					 */
				2949	do {
				2950	if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
				2951	iclog = iclog->ic_next;
					/* In a do/while, continue goes to the loop condition below. */
				2952	continue;
				2953	}
				2954
				2955	if (iclog->ic_state == XLOG_STATE_DIRTY) {
				2956	LOG_UNLOCK(log, s);
				2957	return 0;
				2958	}
				2959
				2960	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
				2961	/*
				2962	* We sleep here if we haven't already slept (e.g.
				2963	* this is the first time we've looked at the correct
				2964	* iclog buf) and the buffer before us is going to
				2965	* be sync'ed. The reason for this is that if we
				2966	* are doing sync transactions here, by waiting for
				2967	* the previous I/O to complete, we can allow a few
				2968	* more transactions into this iclog before we close
				2969	* it down.
				2970	*
				2971	* Otherwise, we mark the buffer WANT_SYNC, and bump
				2972	* up the refcnt so we can release the log (which drops
				2973	* the ref count). The state switch keeps new transaction
				2974	* commits from using this buffer. When the current commits
				2975	* finish writing into the buffer, the refcount will drop to
				2976	* zero and the buffer will go out then.
				2977	*/
				2978	if (!already_slept &&
				2979	(iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC \|
				2980	XLOG_STATE_SYNCING))) {
				2981	ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
				2982	XFS_STATS_INC(xs_log_force_sleep);
					/*
					 * NOTE(review): the lock is re-taken at try_again without an
					 * unlock here, so sv_wait() presumably drops l_icloglock --
					 * confirm.
					 */
				2983	sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
				2984	&log->l_icloglock, s);
					/* Only wait on the previous iclog once per call. */
				2985	already_slept = 1;
				2986	goto try_again;
				2987	} else {
				2988	iclog->ic_refcnt++;
				2989	xlog_state_switch_iclogs(log, iclog, 0);
				2990	LOG_UNLOCK(log, s);
				2991	if (xlog_state_release_iclog(log, iclog))
				2992	return XFS_ERROR(EIO);
				2993	s = LOG_LOCK(log);
				2994	}
				2995	}
				2996
				2997	if ((flags & XFS_LOG_SYNC) && /* sleep */
				2998	!(iclog->ic_state & (XLOG_STATE_ACTIVE \| XLOG_STATE_DIRTY))) {
				2999
				3000	/*
				3001	* Don't wait on the forcesema if we know that we've
				3002	* gotten a log write error.
				3003	*/
				3004	if (iclog->ic_state & XLOG_STATE_IOERROR) {
				3005	LOG_UNLOCK(log, s);
				3006	return XFS_ERROR(EIO);
				3007	}
				3008	XFS_STATS_INC(xs_log_force_sleep);
				3009	sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
				3010	/*
				3011	* No need to grab the log lock here since we're
				3012	* only deciding whether or not to return EIO
				3013	* and the memory read should be atomic.
				3014	*/
				3015	if (iclog->ic_state & XLOG_STATE_IOERROR)
				3016	return XFS_ERROR(EIO);
				3017	} else { /* just return */
				3018	LOG_UNLOCK(log, s);
				3019	}
				3020	return 0;
				3021
				3022	} while (iclog != log->l_iclog);
				3023
					/* Went all the way around the ring without finding lsn. */
				3024	LOG_UNLOCK(log, s);
				3025	return (0);
				3026	} /* xlog_state_sync */
				3027
				3028
				3029	/*
				3030	* Called when we want to mark the current iclog as being ready to sync to
				3031	* disk.
				3032	*/
				3033	void
				3034	xlog_state_want_sync(xlog_t log, xlog_in_core_t iclog)
				3035	{
				3036	SPLDECL(s);
				3037
				3038	s = LOG_LOCK(log);
				3039
				3040	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
				3041	xlog_state_switch_iclogs(log, iclog, 0);
				3042	} else {
				3043	ASSERT(iclog->ic_state &
				3044	(XLOG_STATE_WANT_SYNC\|XLOG_STATE_IOERROR));
				3045	}
				3046
				3047	LOG_UNLOCK(log, s);
				3048	} /* xlog_state_want_sync */
				3049
				3050
				3051
				3052	/*****************************************************************************
				3053	*
				3054	* TICKET functions
				3055	*
				3056	*****************************************************************************
				3057	*/
				3058
				3059	/*
				3060	* Algorithm doesn't take into account page size. ;-(
				3061	*/
				3062	STATIC void
				3063	xlog_state_ticket_alloc(xlog_t *log)
				3064	{
				3065	xlog_ticket_t *t_list;
				3066	xlog_ticket_t *next;
				3067	xfs_caddr_t buf;
				3068	uint i = (NBPP / sizeof(xlog_ticket_t)) - 2;
				3069	SPLDECL(s);
				3070
				3071	/*
				3072	* The kmem_zalloc may sleep, so we shouldn't be holding the
				3073	* global lock. XXXmiken: may want to use zone allocator.
				3074	*/
				3075	buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP);
				3076
				3077	s = LOG_LOCK(log);
				3078
				3079	/* Attach 1st ticket to Q, so we can keep track of allocated memory */
				3080	t_list = (xlog_ticket_t *)buf;
				3081	t_list->t_next = log->l_unmount_free;
				3082	log->l_unmount_free = t_list++;
				3083	log->l_ticket_cnt++;
				3084	log->l_ticket_tcnt++;
				3085
				3086	/* Next ticket becomes first ticket attached to ticket free list */
				3087	if (log->l_freelist != NULL) {
				3088	ASSERT(log->l_tail != NULL);
				3089	log->l_tail->t_next = t_list;
				3090	} else {
				3091	log->l_freelist = t_list;
				3092	}
				3093	log->l_ticket_cnt++;
				3094	log->l_ticket_tcnt++;
				3095
				3096	/* Cycle through rest of alloc'ed memory, building up free Q */
				3097	for ( ; i > 0; i--) {
				3098	next = t_list + 1;
				3099	t_list->t_next = next;
				3100	t_list = next;
				3101	log->l_ticket_cnt++;
				3102	log->l_ticket_tcnt++;
				3103	}
				3104	t_list->t_next = NULL;
				3105	log->l_tail = t_list;
				3106	LOG_UNLOCK(log, s);
				3107	} /* xlog_state_ticket_alloc */
				3108
				3109
				3110	/*
				3111	* Put ticket into free list
				3112	*
				3113	* Assumption: log lock is held around this call.
				3114	*/
				3115	STATIC void
				3116	xlog_ticket_put(xlog_t *log,
				3117	xlog_ticket_t *ticket)
				3118	{
				3119	sv_destroy(&ticket->t_sema);
				3120
				3121	/*
				3122	* Don't think caching will make that much difference. It's
				3123	* more important to make debug easier.
				3124	*/
				3125	#if 0
				3126	/* real code will want to use LIFO for caching */
				3127	ticket->t_next = log->l_freelist;
				3128	log->l_freelist = ticket;
				3129	/* no need to clear fields */
				3130	#else
				3131	/* When we debug, it is easier if tickets are cycled */
				3132	ticket->t_next = NULL;
				3133	if (log->l_tail != 0) {
				3134	log->l_tail->t_next = ticket;
				3135	} else {
				3136	ASSERT(log->l_freelist == 0);
				3137	log->l_freelist = ticket;
				3138	}
				3139	log->l_tail = ticket;
				3140	#endif /* DEBUG */
				3141	log->l_ticket_cnt++;
				3142	} /* xlog_ticket_put */
				3143
				3144
				3145	/*
				3146	* Grab ticket off freelist or allocation some more
				3147	*/
				3148	xlog_ticket_t *
				3149	xlog_ticket_get(xlog_t *log,
				3150	int unit_bytes,
				3151	int cnt,
				3152	char client,
				3153	uint xflags)
				3154	{
				3155	xlog_ticket_t *tic;
				3156	uint num_headers;
				3157	SPLDECL(s);
				3158
				3159	alloc:
				3160	if (log->l_freelist == NULL)
				3161	xlog_state_ticket_alloc(log); /* potentially sleep */
				3162
				3163	s = LOG_LOCK(log);
				3164	if (log->l_freelist == NULL) {
				3165	LOG_UNLOCK(log, s);
				3166	goto alloc;
				3167	}
				3168	tic = log->l_freelist;
				3169	log->l_freelist = tic->t_next;
				3170	if (log->l_freelist == NULL)
				3171	log->l_tail = NULL;
				3172	log->l_ticket_cnt--;
				3173	LOG_UNLOCK(log, s);
				3174
				3175	/*
				3176	* Permanent reservations have up to 'cnt'-1 active log operations
				3177	* in the log. A unit in this case is the amount of space for one
				3178	* of these log operations. Normal reservations have a cnt of 1
				3179	* and their unit amount is the total amount of space required.
				3180	*
				3181	* The following lines of code account for non-transaction data
				3182	* which occupy space in the on-disk log.
				3183	*/
				3184
				3185	/* for start-rec */
				3186	unit_bytes += sizeof(xlog_op_header_t);
				3187
				3188	/* for padding */
				3189	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
				3190	log->l_mp->m_sb.sb_logsunit > 1) {
				3191	/* log su roundoff */
				3192	unit_bytes += log->l_mp->m_sb.sb_logsunit;
				3193	} else {
				3194	/* BB roundoff */
				3195	unit_bytes += BBSIZE;
				3196	}
				3197
				3198	/* for commit-rec */
				3199	unit_bytes += sizeof(xlog_op_header_t);
				3200
				3201	/* for LR headers */
				3202	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
				3203	unit_bytes += log->l_iclog_hsize * num_headers;
				3204
				3205	tic->t_unit_res = unit_bytes;
				3206	tic->t_curr_res = unit_bytes;
				3207	tic->t_cnt = cnt;
				3208	tic->t_ocnt = cnt;
				3209	tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff);
				3210	tic->t_clientid = client;
				3211	tic->t_flags = XLOG_TIC_INITED;
				3212	if (xflags & XFS_LOG_PERM_RESERV)
				3213	tic->t_flags \|= XLOG_TIC_PERM_RESERV;
				3214	sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
				3215
				3216	return tic;
				3217	} /* xlog_ticket_get */
				3218
				3219
				3220	/******************************************************************************
				3221	*
				3222	* Log debug routines
				3223	*
				3224	******************************************************************************
				3225	*/
				3226	#if defined(DEBUG) && !defined(XLOG_NOLOG)
				3227	/*
				3228	* Make sure that the destination ptr is within the valid data region of
				3229	* one of the iclogs. This uses backup pointers stored in a different
				3230	* part of the log in case we trash the log structure.
				3231	*/
				3232	void
				3233	xlog_verify_dest_ptr(xlog_t *log,
				3234	__psint_t ptr)
				3235	{
				3236	int i;
				3237	int good_ptr = 0;
				3238
				3239	for (i=0; i < log->l_iclog_bufs; i++) {
				3240	if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
				3241	ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
				3242	good_ptr++;
				3243	}
				3244	if (! good_ptr)
				3245	xlog_panic("xlog_verify_dest_ptr: invalid ptr");
				3246	} /* xlog_verify_dest_ptr */
				3247
				3248	STATIC void
				3249	xlog_verify_grant_head(xlog_t *log, int equals)
				3250	{
				3251	if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
				3252	if (equals)
				3253	ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
				3254	else
				3255	ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
				3256	} else {
				3257	ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
				3258	ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
				3259	}
				3260	} /* xlog_verify_grant_head */
				3261
				3262	/* check if it will fit */
				3263	STATIC void
				3264	xlog_verify_tail_lsn(xlog_t *log,
				3265	xlog_in_core_t *iclog,
				3266	xfs_lsn_t tail_lsn)
				3267	{
				3268	int blocks;
				3269
				3270	if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
				3271	blocks =
				3272	log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
				3273	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
				3274	xlog_panic("xlog_verify_tail_lsn: ran out of log space");
				3275	} else {
				3276	ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
				3277
				3278	if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
				3279	xlog_panic("xlog_verify_tail_lsn: tail wrapped");
				3280
				3281	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
				3282	if (blocks < BTOBB(iclog->ic_offset) + 1)
				3283	xlog_panic("xlog_verify_tail_lsn: ran out of log space");
				3284	}
				3285	} /* xlog_verify_tail_lsn */
				3286
				3287	/*
				3288	* Perform a number of checks on the iclog before writing to disk.
				3289	*
				3290	* 1. Make sure the iclogs are still circular
				3291	* 2. Make sure we have a good magic number
				3292	* 3. Make sure we don't have magic numbers in the data
				3293	* 4. Check fields of each log operation header for:
				3294	* A. Valid client identifier
				3295	* B. tid ptr value falls in valid ptr space (user space code)
				3296	* C. Length in log record header is correct according to the
				3297	* individual operation headers within record.
				3298	* 5. When a bwrite will occur within 5 blocks of the front of the physical
				3299	* log, check the preceding blocks of the physical log to make sure all
				3300	* the cycle numbers agree with the current cycle number.
				3301	*/
				3302	STATIC void
				3303	xlog_verify_iclog(xlog_t *log,
				3304	xlog_in_core_t *iclog,
				3305	int count,
				3306	boolean_t syncing)
				3307	{
				3308	xlog_op_header_t *ophead;
				3309	xlog_in_core_t *icptr;
				3310	xlog_in_core_2_t *xhdr;
				3311	xfs_caddr_t ptr;
				3312	xfs_caddr_t base_ptr;
				3313	__psint_t field_offset;
				3314	__uint8_t clientid;
				3315	int len, i, j, k, op_len;
				3316	int idx;
				3317	SPLDECL(s);
				3318
				3319	/* check validity of iclog pointers */
				3320	s = LOG_LOCK(log);
				3321	icptr = log->l_iclog;
				3322	for (i=0; i < log->l_iclog_bufs; i++) {
				3323	if (icptr == 0)
				3324	xlog_panic("xlog_verify_iclog: invalid ptr");
				3325	icptr = icptr->ic_next;
				3326	}
				3327	if (icptr != log->l_iclog)
				3328	xlog_panic("xlog_verify_iclog: corrupt iclog ring");
				3329	LOG_UNLOCK(log, s);
				3330
				3331	/* check log magic numbers */
				3332	ptr = (xfs_caddr_t) &(iclog->ic_header);
				3333	if (INT_GET((uint )ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM)
				3334	xlog_panic("xlog_verify_iclog: invalid magic num");
				3335
				3336	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count;
				3337	ptr += BBSIZE) {
				3338	if (INT_GET((uint )ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
				3339	xlog_panic("xlog_verify_iclog: unexpected magic num");
				3340	}
				3341
				3342	/* check fields */
				3343	len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT);
				3344	ptr = iclog->ic_datap;
				3345	base_ptr = ptr;
				3346	ophead = (xlog_op_header_t *)ptr;
				3347	xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
				3348	for (i = 0; i < len; i++) {
				3349	ophead = (xlog_op_header_t *)ptr;
				3350
				3351	/* clientid is only 1 byte */
				3352	field_offset = (__psint_t)
				3353	((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
				3354	if (syncing == B_FALSE \|\| (field_offset & 0x1ff)) {
				3355	clientid = ophead->oh_clientid;
				3356	} else {
				3357	idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
				3358	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				3359	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3360	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3361	clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
				3362	} else {
				3363	clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
				3364	}
				3365	}
				3366	if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
				3367	cmn_err(CE_WARN, "xlog_verify_iclog: invalid clientid %d op 0x%p offset 0x%x", clientid, ophead, field_offset);
				3368
				3369	/* check length */
				3370	field_offset = (__psint_t)
				3371	((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
				3372	if (syncing == B_FALSE \|\| (field_offset & 0x1ff)) {
				3373	op_len = INT_GET(ophead->oh_len, ARCH_CONVERT);
				3374	} else {
				3375	idx = BTOBBT((__psint_t)&ophead->oh_len -
				3376	(__psint_t)iclog->ic_datap);
				3377	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				3378	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3379	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				3380	op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
				3381	} else {
				3382	op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
				3383	}
				3384	}
				3385	ptr += sizeof(xlog_op_header_t) + op_len;
				3386	}
				3387	} /* xlog_verify_iclog */
				3388	#endif /* DEBUG && !XLOG_NOLOG */
				3389
				3390	/*
				3391	* Mark all iclogs IOERROR. LOG_LOCK is held by the caller.
				3392	*/
				3393	STATIC int
				3394	xlog_state_ioerror(
				3395	xlog_t *log)
				3396	{
				3397	xlog_in_core_t iclog, ic;
				3398
				3399	iclog = log->l_iclog;
				3400	if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
				3401	/*
				3402	* Mark all the incore logs IOERROR.
				3403	* From now on, no log flushes will result.
				3404	*/
				3405	ic = iclog;
				3406	do {
				3407	ic->ic_state = XLOG_STATE_IOERROR;
				3408	ic = ic->ic_next;
				3409	} while (ic != iclog);
				3410	return (0);
				3411	}
				3412	/*
				3413	* Return non-zero, if state transition has already happened.
				3414	*/
				3415	return (1);
				3416	}
				3417
				3418	/*
				3419	* This is called from xfs_force_shutdown, when we're forcibly
				3420	* shutting down the filesystem, typically because of an IO error.
				3421	* Our main objectives here are to make sure that:
				3422	* a. the filesystem gets marked 'SHUTDOWN' for all interested
				3423	* parties to find out, 'atomically'.
				3424	* b. those who're sleeping on log reservations, pinned objects and
				3425	* other resources get woken up, and be told the bad news.
				3426	* c. nothing new gets queued up after (a) and (b) are done.
				3427	* d. if !logerror, flush the iclogs to disk, then seal them off
				3428	* for business.
				3429	*/
					/*
					 * mp       - mount being shut down; its m_log is the log acted on
					 * logerror - non-zero when the shutdown was caused by a log error;
					 *            the iclogs are then marked IOERROR immediately
					 *            instead of being flushed first
					 *
					 * Returns 0 when there is no log or the log is in active recovery,
					 * 1 when logerror is set and the iclogs were already IOERROR, and
					 * otherwise the result of xlog_state_ioerror().
					 */
				3430	int
				3431	xfs_log_force_umount(
				3432	struct xfs_mount *mp,
				3433	int logerror)
				3434	{
				3435	xlog_ticket_t *tic;
				3436	xlog_t *log;
				3437	int retval;
				3438	SPLDECL(s);
				3439	SPLDECL(s2);
				3440
				3441	log = mp->m_log;
				3442
				3443	/*
				3444	* If this happens during log recovery, don't worry about
				3445	* locking; the log isn't open for business yet.
				3446	*/
				3447	if (!log \|\|
				3448	log->l_flags & XLOG_ACTIVE_RECOVERY) {
				3449	mp->m_flags \|= XFS_MOUNT_FS_SHUTDOWN;
				3450	XFS_BUF_DONE(mp->m_sb_bp);
				3451	return (0);
				3452	}
				3453
				3454	/*
				3455	* Somebody could've already done the hard work for us.
				3456	* No need to get locks for this.
				3457	*/
				3458	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
				3459	ASSERT(XLOG_FORCED_SHUTDOWN(log));
				3460	return (1);
				3461	}
				3462	retval = 0;
				3463	/*
				3464	* We must hold both the GRANT lock and the LOG lock,
				3465	* before we mark the filesystem SHUTDOWN and wake
				3466	* everybody up to tell the bad news.
				3467	*/
					/* Lock order here: GRANT lock first, then LOG lock. */
				3468	s = GRANT_LOCK(log);
				3469	s2 = LOG_LOCK(log);
				3470	mp->m_flags \|= XFS_MOUNT_FS_SHUTDOWN;
				3471	XFS_BUF_DONE(mp->m_sb_bp);
				3472	/*
				3473	* This flag is sort of redundant because of the mount flag, but
				3474	* it's good to maintain the separation between the log and the rest
				3475	* of XFS.
				3476	*/
				3477	log->l_flags \|= XLOG_IO_ERROR;
				3478
				3479	/*
				3480	* If we hit a log error, we want to mark all the iclogs IOERROR
				3481	* while we're still holding the loglock.
				3482	*/
				3483	if (logerror)
				3484	retval = xlog_state_ioerror(log);
					/* The LOG lock is dropped here; the GRANT lock stays held. */
				3485	LOG_UNLOCK(log, s2);
				3486
				3487	/*
				3488	* We don't want anybody waiting for log reservations
				3489	* after this. That means we have to wake up everybody
				3490	* queued up on reserve_headq as well as write_headq.
				3491	* In addition, we make sure in xlog_{re}grant_log_space
				3492	* that we don't enqueue anything once the SHUTDOWN flag
				3493	* is set, and this action is protected by the GRANTLOCK.
				3494	*/
					/* Each queue is walked via t_next until it returns to its head. */
				3495	if ((tic = log->l_reserve_headq)) {
				3496	do {
				3497	sv_signal(&tic->t_sema);
				3498	tic = tic->t_next;
				3499	} while (tic != log->l_reserve_headq);
				3500	}
				3501
				3502	if ((tic = log->l_write_headq)) {
				3503	do {
				3504	sv_signal(&tic->t_sema);
				3505	tic = tic->t_next;
				3506	} while (tic != log->l_write_headq);
				3507	}
				3508	GRANT_UNLOCK(log, s);
				3509
					/*
					 * Iclogs not yet IOERROR means this was not a log error (see
					 * the ASSERT): flush what is in core, then seal the iclogs.
					 */
				3510	if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
				3511	ASSERT(!logerror);
				3512	/*
				3513	* Force the incore logs to disk before shutting the
				3514	* log down completely.
				3515	*/
					/* The return value of the flush is not checked here. */
				3516	xlog_state_sync_all(log, XFS_LOG_FORCE\|XFS_LOG_SYNC);
				3517	s2 = LOG_LOCK(log);
				3518	retval = xlog_state_ioerror(log);
				3519	LOG_UNLOCK(log, s2);
				3520	}
				3521	/*
				3522	* Wake up everybody waiting on xfs_log_force.
				3523	* Callback all log item committed functions as if the
				3524	* log writes were completed.
				3525	*/
				3526	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
				3527
				3528	#ifdef XFSERRORDEBUG
					/* Debug only: no iclog in the ring may still have a callback. */
				3529	{
				3530	xlog_in_core_t *iclog;
				3531
				3532	s = LOG_LOCK(log);
				3533	iclog = log->l_iclog;
				3534	do {
				3535	ASSERT(iclog->ic_callback == 0);
				3536	iclog = iclog->ic_next;
				3537	} while (iclog != log->l_iclog);
				3538	LOG_UNLOCK(log, s);
				3539	}
				3540	#endif
				3541	/* return non-zero if log IOERROR transition had already happened */
				3542	return (retval);
				3543	}
				3544
				3545	int
				3546	xlog_iclogs_empty(xlog_t *log)
				3547	{
				3548	xlog_in_core_t *iclog;
				3549
				3550	iclog = log->l_iclog;
				3551	do {
				3552	/* endianness does not matter here, zero is zero in
				3553	* any language.
				3554	*/
				3555	if (iclog->ic_header.h_num_logops)
				3556	return(0);
				3557	iclog = iclog->ic_next;
				3558	} while (iclog != log->l_iclog);
				3559	return(1);
				3560	}