1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2015, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 */
36#ifndef _LUSTRE_CL_OBJECT_H
37#define _LUSTRE_CL_OBJECT_H
38
39/** \defgroup clio clio
40 *
41 * Client objects implement io operations and cache pages.
42 *
43 * Examples: lov and osc are implementations of cl interface.
44 *
45 * Big Theory Statement.
46 *
47 * Layered objects.
48 *
49 * Client implementation is based on the following data-types:
50 *
51 * - cl_object
52 *
53 * - cl_page
54 *
55 * - cl_lock represents an extent lock on an object.
56 *
57 * - cl_io represents high-level i/o activity such as whole read/write
58 * system call, or write-out of pages from under the lock being
59 * canceled. cl_io has sub-ios that can be stopped and resumed
60 * independently, thus achieving high degree of transfer
61 * parallelism. Single cl_io can be advanced forward by
62 * the multiple threads (although in the most usual case of
63 * read/write system call it is associated with the single user
64 * thread, that issued the system call).
65 *
66 * - cl_req represents a collection of pages for a transfer. cl_req is
67 * constructed by req-forming engine that tries to saturate
68 * transport with large and continuous transfers.
69 *
70 * Terminology
71 *
72 * - to avoid confusion high-level I/O operation like read or write system
73 * call is referred to as "an io", whereas low-level I/O operation, like
74 * RPC, is referred to as "a transfer"
75 *
76 * - "generic code" means generic (not file system specific) code in the
77 * hosting environment. "cl-code" means code (mostly in cl_*.c files) that
78 * is not layer specific.
79 *
80 * Locking.
81 *
82 * - i_mutex
83 * - PG_locked
84 * - cl_object_header::coh_page_guard
85 *  - lu_site::ls_guard
86 *
87 * See the top comment in cl_object.c for the description of overall locking and
88 * reference-counting design.
89 *
90 * See comments below for the description of i/o, page, and dlm-locking
91 * design.
92 *
93 * @{
94 */
95
96/*
97 * super-class definitions.
98 */
99#include "lu_object.h"
100#include <linux/atomic.h>
101#include "linux/lustre_compat25.h"
102#include <linux/mutex.h>
103#include <linux/radix-tree.h>
104#include <linux/spinlock.h>
105#include <linux/wait.h>
106
107struct inode;
108
109struct cl_device;
110struct cl_device_operations;
111
112struct cl_object;
113struct cl_object_page_operations;
114struct cl_object_lock_operations;
115
116struct cl_page;
117struct cl_page_slice;
118struct cl_lock;
119struct cl_lock_slice;
120
121struct cl_lock_operations;
122struct cl_page_operations;
123
124struct cl_io;
125struct cl_io_slice;
126
127struct cl_req;
128struct cl_req_slice;
129
130/**
131 * Operations for each data device in the client stack.
132 *
133 * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
134 */
135struct cl_device_operations {
136 /**
137 * Initialize cl_req. This method is called top-to-bottom on all
138 * devices in the stack to get them a chance to allocate layer-private
139 * data, and to attach them to the cl_req by calling
140 * cl_req_slice_add().
141 *
142 * \see osc_req_init(), lov_req_init(), lovsub_req_init()
143 * \see vvp_req_init()
144 */
145 int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
146 struct cl_req *req);
147};
148
149/**
150 * Device in the client stack.
151 *
152 * \see vvp_device, lov_device, lovsub_device, osc_device
153 */
154struct cl_device {
155 /** Super-class. */
156 struct lu_device cd_lu_dev;
157 /** Per-layer operation vector. */
158 const struct cl_device_operations *cd_ops;
159};
160
161/** \addtogroup cl_object cl_object
162 * @{
163 */
164/**
165 * "Data attributes" of cl_object. Data attributes can be updated
166 * independently for a sub-object, and top-object's attributes are calculated
167 * from sub-objects' ones.
168 */
169struct cl_attr {
170 /** Object size, in bytes */
171 loff_t cat_size;
172 /**
173 * Known minimal size, in bytes.
174 *
175 * This is only valid when at least one DLM lock is held.
176 */
177 loff_t cat_kms;
178 /** Modification time. Measured in seconds since epoch. */
179 time64_t cat_mtime;
180 /** Access time. Measured in seconds since epoch. */
181 time64_t cat_atime;
182 /** Change time. Measured in seconds since epoch. */
183 time64_t cat_ctime;
Peng Taod7e09d02013-05-02 16:46:55 +0800184 /**
185 * Blocks allocated to this cl_object on the server file system.
186 *
187 * \todo XXX An interface for block size is needed.
188 */
189 __u64 cat_blocks;
190 /**
191 * User identifier for quota purposes.
192 */
193 uid_t cat_uid;
194 /**
195 * Group identifier for quota purposes.
196 */
197 gid_t cat_gid;
198};
199
200/**
201 * Fields in cl_attr that are being set.
202 */
203enum cl_attr_valid {
204 CAT_SIZE = 1 << 0,
205 CAT_KMS = 1 << 1,
206 CAT_MTIME = 1 << 3,
207 CAT_ATIME = 1 << 4,
208 CAT_CTIME = 1 << 5,
209 CAT_BLOCKS = 1 << 6,
210 CAT_UID = 1 << 7,
211 CAT_GID = 1 << 8
212};
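/*
 * Illustrative sketch only (not part of this interface): a caller updating
 * size and mtime through the generic attribute path would combine the bits
 * above into the \a valid mask, e.g.
 *
 *	attr->cat_size  = new_size;
 *	attr->cat_mtime = new_mtime;
 *	cl_object_attr_set(env, obj, attr, CAT_SIZE | CAT_MTIME);
 *
 * (with the top-object's cl_object_header::coh_attr_guard held, see the
 * \pre conditions below). new_size/new_mtime are hypothetical values
 * supplied by the caller; cl_object_attr_set() dispatches the per-layer
 * coo_attr_set() methods described further down.
 */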
213
214/**
215 * Sub-class of lu_object with methods common for objects on the client
216 * stacks.
217 *
218 * cl_object: represents a regular file system object, both a file and a
219 * stripe. cl_object is based on lu_object: it is identified by a fid,
220 * layered, cached, hashed, and lrued. Important distinction with the server
221 * side, where md_object and dt_object are used, is that cl_object "fans out"
222 * at the lov/sns level: depending on the file layout, single file is
223 * represented as a set of "sub-objects" (stripes). At the implementation
224 * level, struct lov_object contains an array of cl_objects. Each sub-object
225 * is a full-fledged cl_object, having its fid, living in the lru and hash
226 * table.
227 *
228 * This leads to the next important difference with the server side: on the
229 * client, it's quite usual to have objects with the different sequence of
230 * layers. For example, typical top-object is composed of the following
231 * layers:
232 *
233 * - vvp
234 * - lov
235 *
236 * whereas its sub-objects are composed of
237 *
238 * - lovsub
239 * - osc
240 *
241 * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
242 * track of the object-subobject relationship.
243 *
244 * Sub-objects are not cached independently: when top-object is about to
245 * be discarded from the memory, all its sub-objects are torn-down and
246 * destroyed too.
247 *
248 * \see vvp_object, lov_object, lovsub_object, osc_object
249 */
250struct cl_object {
251 /** super class */
252 struct lu_object co_lu;
253 /** per-object-layer operations */
254 const struct cl_object_operations *co_ops;
255 /** offset of page slice in cl_page buffer */
256 int co_slice_off;
257};
258
259/**
260 * Description of the client object configuration. This is used for the
261 * creation of a new client object that is identified by more state than
262 * a fid alone.
263 */
264struct cl_object_conf {
265 /** Super-class. */
266 struct lu_object_conf coc_lu;
267 union {
268 /**
269 * Object layout. This is consumed by lov.
270 */
271 struct lustre_md *coc_md;
272 /**
273 * Description of particular stripe location in the
274 * cluster. This is consumed by osc.
275 */
276 struct lov_oinfo *coc_oinfo;
277 } u;
278 /**
279 * VFS inode. This is consumed by vvp.
280 */
281 struct inode *coc_inode;
282 /**
283 * Layout lock handle.
284 */
285 struct ldlm_lock *coc_lock;
286 /**
287 * Operation to handle layout, OBJECT_CONF_XYZ.
288 */
289 int coc_opc;
290};
291
292enum {
293 /** configure layout, set up a new stripe, must be called while
294 * holding layout lock.
295 */
296 OBJECT_CONF_SET = 0,
297 /** invalidate the current stripe configuration due to losing
298 * layout lock.
299 */
300 OBJECT_CONF_INVALIDATE = 1,
301 /** wait for old layout to go away so that new layout can be set up. */
302 OBJECT_CONF_WAIT = 2
303};
304
305/**
306 * Operations implemented for each cl object layer.
307 *
308 * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
309 */
310struct cl_object_operations {
311 /**
312 * Initialize page slice for this layer. Called top-to-bottom through
313 * every object layer when a new cl_page is instantiated. Layer
314 * keeping private per-page data, or requiring its own page operations
315 * vector should allocate these data here, and attach them to the page
316 * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
317 * sense). Optional.
318 *
319 * \retval NULL success.
320 *
321 * \retval ERR_PTR(errno) failure code.
322 *
323 * \retval valid-pointer pointer to already existing referenced page
324 * to be used instead of newly created.
325 */
326 int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
327 struct cl_page *page, pgoff_t index);
328 /**
329 * Initialize lock slice for this layer. Called top-to-bottom through
330 * every object layer when a new cl_lock is instantiated. Layer
331 * keeping private per-lock data, or requiring its own lock operations
332 * vector should allocate these data here, and attach them to the lock
333 * by calling cl_lock_slice_add(). Mandatory.
334 */
335 int (*coo_lock_init)(const struct lu_env *env,
336 struct cl_object *obj, struct cl_lock *lock,
337 const struct cl_io *io);
338 /**
339 * Initialize io state for a given layer.
340 *
341 * called top-to-bottom once per io existence to initialize io
342 * state. If layer wants to keep some state for this type of io, it
343 * has to embed struct cl_io_slice in lu_env::le_ses, and register
344 * slice with cl_io_slice_add(). It is guaranteed that all threads
345 * participating in this io share the same session.
346 */
347 int (*coo_io_init)(const struct lu_env *env,
348 struct cl_object *obj, struct cl_io *io);
349 /**
350 * Fill portion of \a attr that this layer controls. This method is
351 * called top-to-bottom through all object layers.
352 *
353 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
354 *
355 * \return 0: to continue
356 * \return +ve: to stop iterating through layers (but 0 is returned
357 * from enclosing cl_object_attr_get())
358 * \return -ve: to signal error
359 */
360 int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
361 struct cl_attr *attr);
362 /**
363 * Update attributes.
364 *
365 * \a valid is a bitmask composed from enum #cl_attr_valid, and
366 * indicating what attributes are to be set.
367 *
368 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
369 *
370 * \return the same convention as for
371 * cl_object_operations::coo_attr_get() is used.
372 */
373 int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
374 const struct cl_attr *attr, unsigned valid);
375 /**
376 * Update object configuration. Called top-to-bottom to modify object
377 * configuration.
378 *
379 * XXX error conditions and handling.
380 */
381 int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
382 const struct cl_object_conf *conf);
383 /**
384 * Glimpse ast. Executed when glimpse ast arrives for a lock on this
385 * object. Layers are supposed to fill parts of \a lvb that will be
386 * shipped to the glimpse originator as a glimpse result.
387 *
388 * \see vvp_object_glimpse(), lovsub_object_glimpse(),
389 * \see osc_object_glimpse()
390 */
391 int (*coo_glimpse)(const struct lu_env *env,
392 const struct cl_object *obj, struct ost_lvb *lvb);
393 /**
394 * Object prune method. Called when the layout is going to change on
395 * this object, therefore each layer has to clean up its cache,
396 * mainly pages and locks.
397 */
398 int (*coo_prune)(const struct lu_env *env, struct cl_object *obj);
399};
400
401/**
402 * Extended header for client object.
403 */
404struct cl_object_header {
405 /** Standard lu_object_header. cl_object::co_lu::lo_header points
406 * here.
407 */
408 struct lu_object_header coh_lu;
409
410 /**
411 * Parent object. It is assumed that an object has a well-defined
412 * parent, but not a well-defined child (there may be multiple
413 * sub-objects, for the same top-object). cl_object_header::coh_parent
414 * field allows certain code to be written generically, without
415 * limiting possible cl_object layouts unduly.
416 */
417 struct cl_object_header *coh_parent;
418 /**
419 * Protects consistency between cl_attr of parent object and
420 * attributes of sub-objects, that the former is calculated ("merged")
421 * from.
422 *
423 * \todo XXX this can be read/write lock if needed.
424 */
425 spinlock_t coh_attr_guard;
426 /**
427 * Size of cl_page + page slices
428 */
429 unsigned short coh_page_bufsize;
430 /**
431 * Number of objects above this one: 0 for a top-object, 1 for its
432 * sub-object, etc.
433 */
434 unsigned char coh_nesting;
435};
436
437/**
438 * Helper macro: iterate over all layers of the object \a obj, assigning every
439 * layer top-to-bottom to \a slice.
440 */
441#define cl_object_for_each(slice, obj) \
442 list_for_each_entry((slice), \
443 &(obj)->co_lu.lo_header->loh_layers, \
444 co_lu.lo_linkage)
445/**
446 * Helper macro: iterate over all layers of the object \a obj, assigning every
447 * layer bottom-to-top to \a slice.
448 */
449#define cl_object_for_each_reverse(slice, obj) \
450 list_for_each_entry_reverse((slice), \
451 &(obj)->co_lu.lo_header->loh_layers, \
452 co_lu.lo_linkage)
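/*
 * Illustrative sketch only: generic cl-code typically uses these macros to
 * dispatch a method across the layers, e.g. a top-to-bottom walk that stops
 * at the first non-zero result (the loop body below is hypothetical, not a
 * verbatim copy of any function in cl_object.c):
 *
 *	int result = 0;
 *
 *	cl_object_for_each(slice, obj) {
 *		if (slice->co_ops->coo_glimpse) {
 *			result = slice->co_ops->coo_glimpse(env, slice, lvb);
 *			if (result != 0)
 *				break;
 *		}
 *	}
 */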
453/** @} cl_object */
454
455#define CL_PAGE_EOF ((pgoff_t)~0ull)
456
457/** \addtogroup cl_page cl_page
458 * @{
459 */
460
461/** \struct cl_page
462 * Layered client page.
463 *
464 * cl_page: represents a portion of a file, cached in the memory. All pages
465 * of the given file are of the same size, and are kept in the radix tree
466 * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
467 * of the top-level file object are first class cl_objects, they have their
468 * own radix trees of pages and hence page is implemented as a sequence of
469 * struct cl_page's, linked into a double-linked list through
470 * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
471 * corresponding radix tree at the corresponding logical offset.
472 *
473 * cl_page is associated with VM page of the hosting environment (struct
474 * page in the Linux kernel, for example). It is assumed that this
475 * association is implemented by one of cl_page layers (top layer in the
476 * current design) that
477 *
478 * - intercepts per-VM-page call-backs made by the environment (e.g.,
479 * memory pressure),
480 *
481 * - translates state (page flag bits) and locking between lustre and
482 * environment.
483 *
484 * The association between cl_page and struct page is immutable and
485 * established when cl_page is created.
486 *
487 * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
488 * this io an exclusive access to this page w.r.t. other io attempts and
489 * various events changing page state (such as transfer completion, or
490 * eviction of the page from the memory). Note, that in general cl_io
491 * cannot be identified with a particular thread, and page ownership is not
492 * exactly equal to the current thread holding a lock on the page. Layer
493 * implementing association between cl_page and struct page has to implement
494 * ownership on top of available synchronization mechanisms.
495 *
496 * While lustre client maintains the notion of page ownership by io,
497 * hosting MM/VM usually has its own page concurrency control
498 * mechanisms. For example, in Linux, page access is synchronized by the
499 * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
500 * takes care to acquire and release such locks as necessary around the
501 * calls to the file system methods (->readpage(), ->prepare_write(),
502 * ->commit_write(), etc.). This leads to the situation when there are two
503 * different ways to own a page in the client:
504 *
505 * - client code explicitly and voluntary owns the page (cl_page_own());
506 *
507 * - VM locks a page and then calls the client, that has "to assume"
508 * the ownership from the VM (cl_page_assume()).
509 *
510 * Dual methods to release ownership are cl_page_disown() and
511 * cl_page_unassume().
512 *
513 * cl_page is reference counted (cl_page::cp_ref). When reference counter
514 * drops to 0, the page is returned to the cache, unless it is in
515 * cl_page_state::CPS_FREEING state, in which case it is immediately
516 * destroyed.
517 *
518 * The general logic guaranteeing the absence of "existential races" for
519 * pages is the following:
520 *
521 * - there are fixed known ways for a thread to obtain a new reference
522 * to a page:
523 *
524 * - by doing a lookup in the cl_object radix tree, protected by the
525 * spin-lock;
526 *
527 * - by starting from VM-locked struct page and following some
528 * hosting environment method (e.g., following ->private pointer in
529 * the case of Linux kernel), see cl_vmpage_page();
530 *
531 * - when the page enters cl_page_state::CPS_FREEING state, all these
532 * ways are severed with the proper synchronization
533 * (cl_page_delete());
534 *
535 * - entry into cl_page_state::CPS_FREEING is serialized by the VM page
536 * lock;
537 *
538 * - no new references to the page in cl_page_state::CPS_FREEING state
539 * are allowed (checked in cl_page_get()).
540 *
541 * Together this guarantees that when last reference to a
542 * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
543 * page, as neither references to it can be acquired at that point, nor
544 * ones exist.
545 *
546 * cl_page is a state machine. States are enumerated in enum
547 * cl_page_state. Possible state transitions are enumerated in
548 * cl_page_state_set(). State transition process (i.e., actual changing of
549 * cl_page::cp_state field) is protected by the lock on the underlying VM
550 * page.
551 *
552 * Linux Kernel implementation.
553 *
554 * Binding between cl_page and struct page (which is a typedef for
555 * struct page) is implemented in the vvp layer. cl_page is attached to the
556 * ->private pointer of the struct page, together with the setting of
557 * PG_private bit in page->flags, and acquiring additional reference on the
558 * struct page (much like struct buffer_head, or any similar file system
559 * private data structures).
560 *
561 * PG_locked lock is used to implement both ownership and transfer
562 * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
563 * states. No additional references are acquired for the duration of the
564 * transfer.
565 *
566 * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
567 * write-out is "protected" by the special PG_writeback bit.
568 */
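/*
 * Illustrative sketch of the two ownership paths described above (error
 * handling omitted; not a verbatim copy of any caller):
 *
 *	- io-initiated ownership:
 *
 *		if (cl_page_own(env, io, page) == 0) {
 *			... work on the page ...
 *			cl_page_disown(env, io, page);
 *		}
 *
 *	- the VM has already locked the vmpage and the client only assumes
 *	  the ownership:
 *
 *		cl_page_assume(env, io, page);
 *		... work on the page ...
 *		cl_page_unassume(env, io, page);
 */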
569
570/**
571 * States of cl_page. cl_page.c assumes particular order here.
572 *
573 * The page state machine is rather crude, as it doesn't recognize finer page
574 * states like "dirty" or "up to date". This is because such states are not
575 * always well defined for the whole stack (see, for example, the
576 * implementation of the read-ahead, that hides page up-to-dateness to track
577 * cache hits accurately). Such sub-states are maintained by the layers that
578 * are interested in them.
579 */
580enum cl_page_state {
581 /**
582 * Page is in the cache, un-owned. Page leaves cached state in the
583 * following cases:
584 *
585 * - [cl_page_state::CPS_OWNED] io comes across the page and
586 * owns it;
587 *
588 * - [cl_page_state::CPS_PAGEOUT] page is dirty, the
589 * req-formation engine decides that it wants to include this page
590 * into an cl_req being constructed, and yanks it from the cache;
591 *
592 * - [cl_page_state::CPS_FREEING] VM callback is executed to
593 * evict the page form the memory;
594 *
595 * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
596 */
597 CPS_CACHED,
598 /**
599 * Page is exclusively owned by some cl_io. Page may end up in this
600 * state as a result of
601 *
602 * - io creating new page and immediately owning it;
603 *
604 * - [cl_page_state::CPS_CACHED] io finding existing cached page
605 * and owning it;
606 *
607 * - [cl_page_state::CPS_OWNED] io finding existing owned page
608 * and waiting for owner to release the page;
609 *
610 * Page leaves owned state in the following cases:
611 *
612 * - [cl_page_state::CPS_CACHED] io decides to leave the page in
613 * the cache, doing nothing;
614 *
615 * - [cl_page_state::CPS_PAGEIN] io starts read transfer for
616 * this page;
617 *
618 * - [cl_page_state::CPS_PAGEOUT] io starts immediate write
619 * transfer for this page;
620 *
621 * - [cl_page_state::CPS_FREEING] io decides to destroy this
622 * page (e.g., as part of truncate or extent lock cancellation).
623 *
624 * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
625 */
626 CPS_OWNED,
627 /**
628 * Page is being written out, as a part of a transfer. This state is
629 * entered when req-formation logic decided that it wants this page to
630 * be sent through the wire _now_. Specifically, it means that once
631 * this state is achieved, transfer completion handler (with either
632 * success or failure indication) is guaranteed to be executed against
633 * this page independently of any locks and any scheduling decisions
634 * made by the hosting environment (that effectively means that the
635 * page is never put into cl_page_state::CPS_PAGEOUT state "in
636 * advance". This property is mentioned, because it is important when
637 * reasoning about possible dead-locks in the system). The page can
638 * enter this state as a result of
639 *
640 * - [cl_page_state::CPS_OWNED] an io requesting an immediate
641 * write-out of this page, or
642 *
643 * - [cl_page_state::CPS_CACHED] req-forming engine deciding
644 * that it has enough dirty pages cached to issue a "good"
645 * transfer.
646 *
647 * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
648 * is completed---it is moved into cl_page_state::CPS_CACHED state.
649 *
650 * Underlying VM page is locked for the duration of transfer.
651 *
652 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
653 */
654 CPS_PAGEOUT,
655 /**
656 * Page is being read in, as a part of a transfer. This is quite
657 * similar to the cl_page_state::CPS_PAGEOUT state, except that
658 * read-in is always "immediate"---there is no such thing a sudden
659 * construction of read cl_req from cached, presumably not up to date,
660 * pages.
661 *
662 * Underlying VM page is locked for the duration of transfer.
663 *
664 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
665 */
666 CPS_PAGEIN,
667 /**
668 * Page is being destroyed. This state is entered when client decides
669 * that page has to be deleted from its host object, as, e.g., a part
670 * of truncate.
671 *
672 * Once this state is reached, there is no way to escape it.
673 *
674 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
675 */
676 CPS_FREEING,
677 CPS_NR
678};
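/*
 * Informative summary of a typical write-back walk through the states
 * above (derived from the per-state comments; not an exhaustive transition
 * list, see cl_page_state_set() for the authoritative table):
 *
 *	CPS_CACHED  -> CPS_OWNED    io finds or creates the page and owns it
 *	CPS_OWNED   -> CPS_CACHED   io leaves the dirty page in the cache
 *	CPS_CACHED  -> CPS_PAGEOUT  req-forming engine picks the page up
 *	CPS_PAGEOUT -> CPS_CACHED   transfer completion
 *	CPS_CACHED  -> CPS_FREEING  page is truncated or evicted
 */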
679
680enum cl_page_type {
681 /** Host page, the page is from the host inode which the cl_page
682 * belongs to.
683 */
684 CPT_CACHEABLE = 1,
685
686 /** Transient page, the transient cl_page is used to bind a cl_page
687 * to a vmpage which does not belong to the same object as the cl_page.
688 * It is used in DirectIO and lockless IO.
689 */
690 CPT_TRANSIENT,
691};
692
693/**
694 * Flags maintained for every cl_page.
695 */
696enum cl_page_flags {
697 /**
698 * Set when pagein completes. Used for debugging (read completes at
699 * most once for a page).
700 */
701 CPF_READ_COMPLETED = 1 << 0
702};
703
704/**
705 * Fields are protected by the lock on struct page, except for atomics and
706 * immutables.
707 *
708 * \invariant Data type invariants are in cl_page_invariant(). Basically:
709 * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
710 * list, consistent with the parent/child pointers in the cl_page::cp_obj and
711 * cl_page::cp_owner (when set).
712 */
713struct cl_page {
714 /** Reference counter. */
715 atomic_t cp_ref;
716 /** An object this page is a part of. Immutable after creation. */
717 struct cl_object *cp_obj;
718 /** List of slices. Immutable after creation. */
719 struct list_head cp_layers;
720 /** vmpage */
721 struct page *cp_vmpage;
722 /**
723 * Page state. This field is const to avoid accidental update, it is
724 * modified only internally within cl_page.c. Protected by a VM lock.
725 */
726 const enum cl_page_state cp_state;
727 /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
728 struct list_head cp_batch;
729 /** Mutex serializing membership of a page in a batch. */
730 struct mutex cp_mutex;
731 /** Linkage of pages within cl_req. */
732 struct list_head cp_flight;
733 /** Transfer error. */
734 int cp_error;
735
736 /**
737 * Page type. Only CPT_TRANSIENT is used so far. Immutable after
738 * creation.
739 */
740 enum cl_page_type cp_type;
741
742 /**
743 * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
744 * by sub-io. Protected by a VM lock.
745 */
746 struct cl_io *cp_owner;
747 /**
748 * Debug information, the task is owning the page.
749 */
Greg Kroah-Hartman68b636b2013-08-04 08:56:42 +0800750 struct task_struct *cp_task;
Peng Taod7e09d02013-05-02 16:46:55 +0800751 /**
752 * Owning IO request in cl_page_state::CPS_PAGEOUT and
753 * cl_page_state::CPS_PAGEIN states. This field is maintained only in
754 * the top-level pages. Protected by a VM lock.
755 */
756 struct cl_req *cp_req;
757 /** List of references to this page, for debugging. */
758 struct lu_ref cp_reference;
759 /** Link to an object, for debugging. */
760 struct lu_ref_link cp_obj_ref;
761 /** Link to a queue, for debugging. */
762 struct lu_ref_link cp_queue_ref;
763 /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
764 unsigned cp_flags;
765 /** Assigned if doing a sync_io */
766 struct cl_sync_io *cp_sync_io;
767};
768
769/**
770 * Per-layer part of cl_page.
771 *
772 * \see vvp_page, lov_page, osc_page
773 */
774struct cl_page_slice {
775 struct cl_page *cpl_page;
776 pgoff_t cpl_index;
777 /**
778 * Object slice corresponding to this page slice. Immutable after
779 * creation.
780 */
781 struct cl_object *cpl_obj;
782 const struct cl_page_operations *cpl_ops;
783 /** Linkage into cl_page::cp_layers. Immutable after creation. */
784 struct list_head cpl_linkage;
785};
786
787/**
788 * Lock mode. For the client extent locks.
789 *
790 * \ingroup cl_lock
791 */
792enum cl_lock_mode {
793 CLM_READ,
794 CLM_WRITE,
795 CLM_GROUP
796};
797
798/**
799 * Requested transfer type.
800 * \ingroup cl_req
801 */
802enum cl_req_type {
803 CRT_READ,
804 CRT_WRITE,
805 CRT_NR
806};
807
808/**
809 * Per-layer page operations.
810 *
811 * Methods taking an \a io argument are for the activity happening in the
812 * context of given \a io. Page is assumed to be owned by that io, except for
813 * the obvious cases (like cl_page_operations::cpo_own()).
814 *
815 * \see vvp_page_ops, lov_page_ops, osc_page_ops
816 */
817struct cl_page_operations {
818 /**
819 * cl_page<->struct page methods. Only one layer in the stack has to
820 * implement these. Current code assumes that this functionality is
821 * provided by the topmost layer, see cl_page_disown0() as an example.
822 */
823
824 /**
825 * Called when \a io acquires this page into the exclusive
826 * ownership. When this method returns, it is guaranteed that the page is
827 * not owned by another io, and no transfer is going on against
828 * it. Optional.
829 *
830 * \see cl_page_own()
831 * \see vvp_page_own(), lov_page_own()
832 */
833 int (*cpo_own)(const struct lu_env *env,
834 const struct cl_page_slice *slice,
835 struct cl_io *io, int nonblock);
836 /** Called when ownership is yielded. Optional.
837 *
838 * \see cl_page_disown()
839 * \see vvp_page_disown()
840 */
841 void (*cpo_disown)(const struct lu_env *env,
842 const struct cl_page_slice *slice, struct cl_io *io);
843 /**
844 * Called for a page that is already "owned" by \a io from VM point of
845 * view. Optional.
846 *
847 * \see cl_page_assume()
848 * \see vvp_page_assume(), lov_page_assume()
849 */
850 void (*cpo_assume)(const struct lu_env *env,
851 const struct cl_page_slice *slice, struct cl_io *io);
852 /** Dual to cl_page_operations::cpo_assume(). Optional. Called
853 * bottom-to-top when IO releases a page without actually unlocking
854 * it.
855 *
856 * \see cl_page_unassume()
857 * \see vvp_page_unassume()
858 */
859 void (*cpo_unassume)(const struct lu_env *env,
860 const struct cl_page_slice *slice,
861 struct cl_io *io);
862 /**
863 * Announces whether the page contains valid data or not by \a uptodate.
864 *
865 * \see cl_page_export()
866 * \see vvp_page_export()
867 */
868 void (*cpo_export)(const struct lu_env *env,
869 const struct cl_page_slice *slice, int uptodate);
870 /**
871 * Checks whether underlying VM page is locked (in the suitable
872 * sense). Used for assertions.
873 *
874 * \retval -EBUSY: page is protected by a lock of a given mode;
875 * \retval -ENODATA: page is not protected by a lock;
876 * \retval 0: this layer cannot decide. (Should never happen.)
877 */
878 int (*cpo_is_vmlocked)(const struct lu_env *env,
879 const struct cl_page_slice *slice);
880 /**
881 * Page destruction.
882 */
883
884 /**
885 * Called when page is truncated from the object. Optional.
886 *
887 * \see cl_page_discard()
888 * \see vvp_page_discard(), osc_page_discard()
889 */
890 void (*cpo_discard)(const struct lu_env *env,
891 const struct cl_page_slice *slice,
892 struct cl_io *io);
893 /**
894 * Called when page is removed from the cache, and is about to be
895 * destroyed. Optional.
896 *
897 * \see cl_page_delete()
898 * \see vvp_page_delete(), osc_page_delete()
899 */
900 void (*cpo_delete)(const struct lu_env *env,
901 const struct cl_page_slice *slice);
902 /** Destructor. Frees resources and slice itself. */
903 void (*cpo_fini)(const struct lu_env *env,
904 struct cl_page_slice *slice);
905
906 /**
907 * Checks whether the page is protected by a cl_lock. This is a
908 * per-layer method, because certain layers have ways to check for the
909 * lock much more efficiently than through the generic locks scan, or
910 * implement locking mechanisms separate from cl_lock, e.g.,
911 * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
912 * being canceled, or scheduled for cancellation as soon as the last
913 * user goes away, too.
914 *
915 * \retval -EBUSY: page is protected by a lock of a given mode;
916 * \retval -ENODATA: page is not protected by a lock;
917 * \retval 0: this layer cannot decide.
918 *
919 * \see cl_page_is_under_lock()
920 */
921 int (*cpo_is_under_lock)(const struct lu_env *env,
922 const struct cl_page_slice *slice,
923 struct cl_io *io, pgoff_t *max);
924
925 /**
926 * Optional debugging helper. Prints given page slice.
927 *
928 * \see cl_page_print()
929 */
930 int (*cpo_print)(const struct lu_env *env,
931 const struct cl_page_slice *slice,
932 void *cookie, lu_printer_t p);
933 /**
934 * \name transfer
935 *
936 * Transfer methods. See comment on cl_req for a description of
937 * transfer formation and life-cycle.
938 *
939 * @{
940 */
941 /**
942 * Request type dependent vector of operations.
943 *
944 * Transfer operations depend on transfer mode (cl_req_type). To avoid
945 * passing transfer mode to each and every of these methods, and to
946 * avoid branching on request type inside of the methods, separate
947 * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
948 * provided. That is, method invocation usually looks like
949 *
950 * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
951 */
952 struct {
953 /**
954 * Called when a page is submitted for a transfer as a part of
955 * cl_page_list.
956 *
957 * \return 0 : page is eligible for submission;
958 * \return -EALREADY : skip this page;
959 * \return -ve : error.
960 *
961 * \see cl_page_prep()
962 */
963 int (*cpo_prep)(const struct lu_env *env,
964 const struct cl_page_slice *slice,
965 struct cl_io *io);
966 /**
967 * Completion handler. This is guaranteed to be eventually
968 * fired after cl_page_operations::cpo_prep() or
969 * cl_page_operations::cpo_make_ready() call.
970 *
971 * This method can be called in a non-blocking context. It is
972 * guaranteed however, that the page involved and its object
973 * are pinned in memory (and, hence, calling cl_page_put() is
974 * safe).
975 *
976 * \see cl_page_completion()
977 */
978 void (*cpo_completion)(const struct lu_env *env,
979 const struct cl_page_slice *slice,
980 int ioret);
981 /**
982 * Called when cached page is about to be added to the
983 * cl_req as a part of req formation.
984 *
985 * \return 0 : proceed with this page;
986 * \return -EAGAIN : skip this page;
987 * \return -ve : error.
988 *
989 * \see cl_page_make_ready()
990 */
991 int (*cpo_make_ready)(const struct lu_env *env,
992 const struct cl_page_slice *slice);
993 } io[CRT_NR];
994 /**
995 * Tell transfer engine that only the [from, to] part of a page should be
996 * transmitted.
997 *
998 * This is used for immediate transfers.
999 *
1000 * \todo XXX this is not very good interface. It would be much better
1001 * if all transfer parameters were supplied as arguments to
1002 * cl_io_operations::cio_submit() call, but it is not clear how to do
1003 * this for page queues.
1004 *
1005 * \see cl_page_clip()
1006 */
1007 void (*cpo_clip)(const struct lu_env *env,
1008 const struct cl_page_slice *slice,
1009 int from, int to);
1010 /**
1011 * \pre the page was queued for transferring.
1012 * \post page is removed from client's pending list, or -EBUSY
1013 * is returned if it has already been in transferring.
1014 *
1015 * This is one of the few page operations that is:
1016 * 0. called from the top level;
1017 * 1. called without the vmpage locked;
1018 * 2. required to synchronize execution of its ->cpo_cancel() with
1019 * completion handlers in every layer. Osc uses the client obd lock for
1020 * this purpose. Since there is no vvp_page_cancel() and no
1021 * lov_page_cancel(), cpo_cancel is de facto protected by the client lock.
1022 *
1023 * \see osc_page_cancel().
1024 */
1025 int (*cpo_cancel)(const struct lu_env *env,
1026 const struct cl_page_slice *slice);
1027 /**
1028 * Write out a page by kernel. This is only called by ll_writepage
1029 * right now.
1030 *
1031 * \see cl_page_flush()
1032 */
1033 int (*cpo_flush)(const struct lu_env *env,
1034 const struct cl_page_slice *slice,
1035 struct cl_io *io);
1036 /** @} transfer */
1037};
1038
1039/**
1040 * Helper macro, dumping detailed information about \a page into a log.
1041 */
1042#define CL_PAGE_DEBUG(mask, env, page, format, ...) \
1043do { \
1044 if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
1045 LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
1046 cl_page_print(env, &msgdata, lu_cdebug_printer, page); \
1047 CDEBUG(mask, format, ## __VA_ARGS__); \
1048 } \
1049} while (0)
1050
1051/**
1052 * Helper macro, dumping shorter information about \a page into a log.
1053 */
1054#define CL_PAGE_HEADER(mask, env, page, format, ...) \
1055do { \
1056 if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
1057 LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
1058 cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
1059 CDEBUG(mask, format, ## __VA_ARGS__); \
1060 } \
1061} while (0)
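/*
 * Usage example (illustrative): dump a page when an unexpected condition
 * is observed, e.g.
 *
 *	CL_PAGE_DEBUG(D_ERROR, env, page, "unexpected state %d\n",
 *		      page->cp_state);
 */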
1062
1063static inline int __page_in_use(const struct cl_page *page, int refc)
1064{
1065 if (page->cp_type == CPT_CACHEABLE)
1066 ++refc;
1067 LASSERT(atomic_read(&page->cp_ref) > 0);
1068 return (atomic_read(&page->cp_ref) > refc);
1069}
1070
1071#define cl_page_in_use(pg) __page_in_use(pg, 1)
1072#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
1073
1074static inline struct page *cl_page_vmpage(struct cl_page *page)
1075{
1076 LASSERT(page->cp_vmpage);
1077 return page->cp_vmpage;
1078}
1079
1080/** @} cl_page */
1081
1082/** \addtogroup cl_lock cl_lock
1083 * @{
1084 */
1085/** \struct cl_lock
1086 *
1087 * Extent locking on the client.
1088 *
1089 * LAYERING
1090 *
1091 * The locking model of the new client code is built around
1092 *
1093 * struct cl_lock
1094 *
1095 * data-type representing an extent lock on a regular file. cl_lock is a
1096 * layered object (much like cl_object and cl_page), it consists of a header
1097 * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
1098 * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
1099 *
1100 * A typical cl_lock consists of two layers:
1101 *
1102 * - vvp_lock (vvp specific data), and
1103 * - lov_lock (lov specific data).
1104 *
1105 * lov_lock contains an array of sub-locks. Each of these sub-locks is a
1106 * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
1107 *
1108 * - lovsub_lock, and
1109 * - osc_lock
1110 *
1111 * Each sub-lock is associated with a cl_object (representing stripe
1112 * sub-object or the file to which top-level cl_lock is associated to), and is
1113 * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
1114 * cl_object (that at lov layer also fans out into multiple sub-objects), and
1115 * is different from cl_page, that doesn't fan out (there is usually exactly
1116 * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
1117 * a "top-lock" and its lovsub-osc portion a "sub-lock".
1118 *
1119 * LIFE CYCLE
1120 *
1121 * cl_lock is a cacheless data container for the requirements of locks to
1122 * complete the IO. cl_lock is created before I/O starts and destroyed when the
1123 * I/O is complete.
1124 *
1125 * cl_lock depends on LDLM lock to fulfill lock semantics. LDLM lock is attached
1126 * to cl_lock at OSC layer. LDLM lock is still cacheable.
1127 *
1128 * INTERFACE AND USAGE
1129 *
1130 * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A
1131 * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue()
1132 * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock
1133 * consists of multiple sub cl_locks, each sub-lock will be enqueued
1134 * correspondingly. At OSC layer, the lock enqueue request will tend to reuse
1135 * cached LDLM lock; otherwise a new LDLM lock will have to be requested from
1136 * OST side.
1137 *
1138 * cl_lock_cancel() must be called to release a cl_lock after use. clo_cancel()
1139 * method will be called for each layer to release the resource held by this
1140 * lock. At OSC layer, the reference count of LDLM lock, which is held at
1141 * clo_enqueue time, is released.
1142 *
1143 * LDLM lock can only be canceled if there is no cl_lock using it.
1144 *
1145 * Overall process of the locking during IO operation is as following:
1146 *
1147 * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock()
1148 * is called on each layer. Responsibility of this method is to add locks,
1149 * needed by a given layer into cl_io.ci_lockset.
1150 *
1151 * - once locks for all layers were collected, they are sorted to avoid
1152 * dead-locks (cl_io_locks_sort()), and enqueued.
1153 *
1154 * - when all locks are acquired, IO is performed;
1155 *
1156 *   - locks are released after IO is complete.
1157 *
1158 * Striping introduces major additional complexity into locking. The
1159 * fundamental problem is that it is generally unsafe to actively use (hold)
1160 * two locks on the different OST servers at the same time, as this introduces
1161 * inter-server dependency and can lead to cascading evictions.
1162 *
1163 * Basic solution is to sub-divide large read/write IOs into smaller pieces so
1164 * that no multi-stripe locks are taken (note that this design abandons POSIX
1165 * read/write semantics). Such pieces ideally can be executed concurrently. At
1166 * the same time, certain types of IO cannot be sub-divided without
1167 * sacrificing correctness. This includes:
1168 *
1169 * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
1170 * atomicity;
1171 *
1172 * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
1173 *
1174 * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
1175 * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
1176 * has to be held together with the usual lock on [offset, offset + count].
1177 *
1178 * Interaction with DLM
1179 *
1180 * In the expected setup, cl_lock is ultimately backed up by a collection of
1181 * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
1182 * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
1183 * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
1184 * description of interaction with DLM.
1185 */
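/*
 * Illustrative sketch of the enqueue/cancel cycle described above
 * (simplified, error handling omitted; descriptor set-up is sketched after
 * struct cl_lock_descr below):
 *
 *	result = cl_lock_request(env, io, lock);
 *	if (result == 0) {
 *		... perform the IO covered by lock->cll_descr ...
 *		cl_lock_cancel(env, lock);
 *	}
 */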
1186
1187/**
1188 * Lock description.
1189 */
1190struct cl_lock_descr {
1191 /** Object this lock is granted for. */
1192 struct cl_object *cld_obj;
1193 /** Index of the first page protected by this lock. */
1194 pgoff_t cld_start;
1195 /** Index of the last page (inclusive) protected by this lock. */
1196 pgoff_t cld_end;
1197 /** Group ID, for group lock */
1198 __u64 cld_gid;
1199 /** Lock mode. */
1200 enum cl_lock_mode cld_mode;
1201 /**
1202 * flags to enqueue lock. A combination of bit-flags from
1203 * enum cl_enq_flags.
1204 */
1205 __u32 cld_enq_flags;
1206};
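/*
 * Illustrative only: a whole-file shared lock request could be described
 * as
 *
 *	descr->cld_obj       = obj;
 *	descr->cld_start     = 0;
 *	descr->cld_end       = CL_PAGE_EOF;
 *	descr->cld_mode      = CLM_READ;
 *	descr->cld_gid       = 0;
 *	descr->cld_enq_flags = 0;
 *
 * with cld_start/cld_end expressed in page indices, as documented above.
 */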
1207
1208#define DDESCR "%s(%d):[%lu, %lu]:%x"
1209#define PDESCR(descr) \
1210 cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \
1211 (descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags
1212
1213const char *cl_lock_mode_name(const enum cl_lock_mode mode);
1214
1215/**
1216 * Layered client lock.
1217 */
1218struct cl_lock {
1219 /** List of slices. Immutable after creation. */
1220 struct list_head cll_layers;
1221 /** lock attribute, extent, cl_object, etc. */
1222 struct cl_lock_descr cll_descr;
1223};
1224
1225/**
1226 * Per-layer part of cl_lock
1227 *
1228 * \see vvp_lock, lov_lock, lovsub_lock, osc_lock
1229 */
1230struct cl_lock_slice {
1231 struct cl_lock *cls_lock;
1232 /** Object slice corresponding to this lock slice. Immutable after
1233 * creation.
1234 */
1235 struct cl_object *cls_obj;
1236 const struct cl_lock_operations *cls_ops;
1237 /** Linkage into cl_lock::cll_layers. Immutable after creation. */
1238 struct list_head cls_linkage;
1239};
1240
1241/**
1242 *
1243 * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
1244 */
1245struct cl_lock_operations {
1246 /** @{ */
1247 /**
1248 * Attempts to enqueue the lock. Called top-to-bottom.
1249 *
1250 * \retval 0 this layer has enqueued the lock successfully
1251 * \retval >0 this layer has enqueued the lock, but needs to wait on
1252 * @anchor for resources
1253 * \retval -ve failure
1254 *
1255 * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
1256 * \see osc_lock_enqueue()
1257 */
1258 int (*clo_enqueue)(const struct lu_env *env,
1259 const struct cl_lock_slice *slice,
1260 struct cl_io *io, struct cl_sync_io *anchor);
1261 /**
1262 * Cancel a lock, releasing its DLM lock ref, while not cancelling the
1263 * DLM lock itself.
1264 */
1265 void (*clo_cancel)(const struct lu_env *env,
1266 const struct cl_lock_slice *slice);
1267 /** @} */
1268 /**
1269 * Destructor. Frees resources and the slice.
1270 *
1271 * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
1272 * \see osc_lock_fini()
1273 */
1274 void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
1275 /**
1276 * Optional debugging helper. Prints given lock slice.
1277 */
1278 int (*clo_print)(const struct lu_env *env,
1279 void *cookie, lu_printer_t p,
1280 const struct cl_lock_slice *slice);
1281};
1282
1283#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \
1284do { \
1285 LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
1286 \
1287 if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
1288 cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \
1289 CDEBUG(mask, format, ## __VA_ARGS__); \
1290 } \
1291} while (0)
1292
1293#define CL_LOCK_ASSERT(expr, env, lock) do { \
1294 if (likely(expr)) \
1295 break; \
1296 \
1297 CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \
1298 LBUG(); \
1299} while (0)
1300
1301/** @} cl_lock */
1302
1303/** \addtogroup cl_page_list cl_page_list
1304 * Page list used to perform collective operations on a group of pages.
1305 *
1306 * Pages are added to the list one by one. cl_page_list acquires a reference
1307 * for every page in it. Page list is used to perform collective operations on
1308 * pages:
1309 *
1310 * - submit pages for an immediate transfer,
1311 *
1312 * - own pages on behalf of certain io (waiting for each page in turn),
1313 *
1314 * - discard pages.
1315 *
1316 * When list is finalized, it releases references on all pages it still has.
1317 *
1318 * \todo XXX concurrency control.
1319 *
1320 * @{
1321 */
1322struct cl_page_list {
1323 unsigned pl_nr;
1324 struct list_head pl_pages;
1325 struct task_struct *pl_owner;
1326};
1327
1328/**
1329 * A 2-queue of pages. A convenience data-type for common use case, 2-queue
1330 * contains an incoming page list and an outgoing page list.
1331 */
1332struct cl_2queue {
1333 struct cl_page_list c2_qin;
1334 struct cl_page_list c2_qout;
1335};
1336
1337/** @} cl_page_list */
1338
1339/** \addtogroup cl_io cl_io
1340 * @{
1341 */
1342/** \struct cl_io
1343 * I/O
1344 *
1345 * cl_io represents a high level I/O activity like
1346 * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
1347 * lock.
1348 *
1349 * cl_io is a layered object, much like cl_{object,page,lock} but with one
1350 * important distinction. We want to minimize number of calls to the allocator
1351 * in the fast path, e.g., in the case of read(2) when everything is cached:
1352 * client already owns the lock over region being read, and data are cached
1353 * due to read-ahead. To avoid allocation of cl_io layers in such situations,
1354 * per-layer io state is stored in the session, associated with the io, see
1355 * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized
1356 * by using free-lists, see cl_env_get().
1357 *
1358 * There is a small predefined number of possible io types, enumerated in enum
1359 * cl_io_type.
1360 *
1361 * cl_io is a state machine, that can be advanced concurrently by the multiple
1362 * threads. It is up to these threads to control the concurrency and,
1363 * specifically, to detect when io is done, and its state can be safely
1364 * released.
1365 *
1366 * For read/write io overall execution plan is as following:
1367 *
1368 * (0) initialize io state through all layers;
1369 *
1370 * (1) loop: prepare chunk of work to do
1371 *
1372 * (2) call all layers to collect locks they need to process current chunk
1373 *
1374 * (3) sort all locks to avoid dead-locks, and acquire them
1375 *
1376 * (4) process the chunk: call per-page methods
1377 * (cl_io_operations::cio_read_page() for read,
1378 * cl_io_operations::cio_prepare_write(),
1379 * cl_io_operations::cio_commit_write() for write)
1380 *
1381 * (5) release locks
1382 *
1383 * (6) repeat loop.
1384 *
1385 * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
1386 * address allocation efficiency issues mentioned above), and returns with the
1387 * special error condition from per-page method when current sub-io has to
1388 * block. This causes io loop to be repeated, and lov switches to the next
1389 * sub-io in its cl_io_operations::cio_iter_init() implementation.
1390 */
1391
1392/** IO types */
1393enum cl_io_type {
1394 /** read system call */
1395 CIT_READ,
1396 /** write system call */
1397 CIT_WRITE,
1398 /** truncate, utime system calls */
1399 CIT_SETATTR,
1400 /**
1401 * page fault handling
1402 */
1403 CIT_FAULT,
1404 /**
1405 * fsync system call handling
1406 * To write out a range of file
1407 */
1408 CIT_FSYNC,
1409 /**
1410 * Miscellaneous io. This is used for occasional io activity that
1411 * doesn't fit into other types. Currently this is used for:
1412 *
1413 * - cancellation of an extent lock. This io exists as a context
1414 * to write dirty pages from under the lock being canceled back
1415 * to the server;
1416 *
1417 * - VM induced page write-out. An io context for writing page out
1418 * for memory cleansing;
1419 *
1420 * - glimpse. An io context to acquire glimpse lock.
1421 *
1422 * - grouplock. An io context to acquire group lock.
1423 *
1424 * CIT_MISC io is used simply as a context in which locks and pages
1425 * are manipulated. Such io has no internal "process", that is,
1426 * cl_io_loop() is never called for it.
1427 */
1428 CIT_MISC,
1429 CIT_OP_NR
1430};
1431
1432/**
1433 * States of cl_io state machine
1434 */
1435enum cl_io_state {
1436 /** Not initialized. */
1437 CIS_ZERO,
1438 /** Initialized. */
1439 CIS_INIT,
1440 /** IO iteration started. */
1441 CIS_IT_STARTED,
1442 /** Locks taken. */
1443 CIS_LOCKED,
1444 /** Actual IO is in progress. */
1445 CIS_IO_GOING,
1446 /** IO for the current iteration finished. */
1447 CIS_IO_FINISHED,
1448 /** Locks released. */
1449 CIS_UNLOCKED,
1450 /** Iteration completed. */
1451 CIS_IT_ENDED,
1452 /** cl_io finalized. */
1453 CIS_FINI
1454};
1455
1456/**
1457 * IO state private for a layer.
1458 *
1459 * This is usually embedded into layer session data, rather than allocated
1460 * dynamically.
1461 *
1462 * \see vvp_io, lov_io, osc_io
1463 */
1464struct cl_io_slice {
1465 struct cl_io *cis_io;
1466 /** corresponding object slice. Immutable after creation. */
1467 struct cl_object *cis_obj;
1468 /** io operations. Immutable after creation. */
1469 const struct cl_io_operations *cis_iop;
1470 /**
1471 * linkage into a list of all slices for a given cl_io, hanging off
1472 * cl_io::ci_layers. Immutable after creation.
1473 */
1474 struct list_head cis_linkage;
1475};
1476
1477typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *,
1478 struct cl_page *);
1479/**
1480 * Per-layer io operations.
1481 * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
1482 */
1483struct cl_io_operations {
1484 /**
1485 * Vector of io state transition methods for every io type.
1486 *
1487 * \see cl_page_operations::io
1488 */
1489 struct {
1490 /**
1491 * Prepare io iteration at a given layer.
1492 *
1493 * Called top-to-bottom at the beginning of each iteration of
1494 * "io loop" (if it makes sense for this type of io). Here
1495 * layer selects what work it will do during this iteration.
1496 *
1497 * \see cl_io_operations::cio_iter_fini()
1498 */
Oleg Drokin10457d42016-02-26 01:49:50 -05001499 int (*cio_iter_init)(const struct lu_env *env,
1500 const struct cl_io_slice *slice);
Peng Taod7e09d02013-05-02 16:46:55 +08001501 /**
1502 * Finalize io iteration.
1503 *
1504 * Called bottom-to-top at the end of each iteration of "io
1505 * loop". Here layers can decide whether IO has to be
1506 * continued.
1507 *
1508 * \see cl_io_operations::cio_iter_init()
1509 */
Oleg Drokin10457d42016-02-26 01:49:50 -05001510 void (*cio_iter_fini)(const struct lu_env *env,
1511 const struct cl_io_slice *slice);
Peng Taod7e09d02013-05-02 16:46:55 +08001512 /**
1513 * Collect locks for the current iteration of io.
1514 *
1515 * Called top-to-bottom to collect all locks necessary for
 1516 * this iteration. This method shouldn't actually enqueue
1517 * anything, instead it should post a lock through
1518 * cl_io_lock_add(). Once all locks are collected, they are
1519 * sorted and enqueued in the proper order.
1520 */
Oleg Drokin10457d42016-02-26 01:49:50 -05001521 int (*cio_lock)(const struct lu_env *env,
1522 const struct cl_io_slice *slice);
Peng Taod7e09d02013-05-02 16:46:55 +08001523 /**
1524 * Finalize unlocking.
1525 *
1526 * Called bottom-to-top to finish layer specific unlocking
1527 * functionality, after generic code released all locks
1528 * acquired by cl_io_operations::cio_lock().
1529 */
1530 void (*cio_unlock)(const struct lu_env *env,
1531 const struct cl_io_slice *slice);
1532 /**
1533 * Start io iteration.
1534 *
1535 * Once all locks are acquired, called top-to-bottom to
1536 * commence actual IO. In the current implementation,
1537 * top-level vvp_io_{read,write}_start() does all the work
1538 * synchronously by calling generic_file_*(), so other layers
1539 * are called when everything is done.
1540 */
1541 int (*cio_start)(const struct lu_env *env,
1542 const struct cl_io_slice *slice);
1543 /**
1544 * Called top-to-bottom at the end of io loop. Here layer
1545 * might wait for an unfinished asynchronous io.
1546 */
Oleg Drokin10457d42016-02-26 01:49:50 -05001547 void (*cio_end)(const struct lu_env *env,
1548 const struct cl_io_slice *slice);
Peng Taod7e09d02013-05-02 16:46:55 +08001549 /**
1550 * Called bottom-to-top to notify layers that read/write IO
1551 * iteration finished, with \a nob bytes transferred.
1552 */
1553 void (*cio_advance)(const struct lu_env *env,
1554 const struct cl_io_slice *slice,
1555 size_t nob);
1556 /**
1557 * Called once per io, bottom-to-top to release io resources.
1558 */
Oleg Drokin10457d42016-02-26 01:49:50 -05001559 void (*cio_fini)(const struct lu_env *env,
1560 const struct cl_io_slice *slice);
Peng Taod7e09d02013-05-02 16:46:55 +08001561 } op[CIT_OP_NR];
Jinshan Xiong77605e42016-03-30 19:48:30 -04001562
Peng Taod7e09d02013-05-02 16:46:55 +08001563 /**
1564 * Submit pages from \a queue->c2_qin for IO, and move
1565 * successfully submitted pages into \a queue->c2_qout. Return
1566 * non-zero if failed to submit even the single page. If
1567 * submission failed after some pages were moved into \a
1568 * queue->c2_qout, completion callback with non-zero ioret is
1569 * executed on them.
1570 */
1571 int (*cio_submit)(const struct lu_env *env,
1572 const struct cl_io_slice *slice,
1573 enum cl_req_type crt,
1574 struct cl_2queue *queue);
Jinshan Xiong77605e42016-03-30 19:48:30 -04001575 /**
1576 * Queue async page for write.
 1577 * The difference between cio_submit and cio_commit_async is
 1578 * that cio_submit is for urgent requests.
1579 */
1580 int (*cio_commit_async)(const struct lu_env *env,
1581 const struct cl_io_slice *slice,
1582 struct cl_page_list *queue, int from, int to,
1583 cl_commit_cbt cb);
Peng Taod7e09d02013-05-02 16:46:55 +08001584 /**
1585 * Read missing page.
1586 *
1587 * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
 1588 * method, when it hits a not-up-to-date page in the range. Optional.
1589 *
1590 * \pre io->ci_type == CIT_READ
1591 */
1592 int (*cio_read_page)(const struct lu_env *env,
1593 const struct cl_io_slice *slice,
1594 const struct cl_page_slice *page);
1595 /**
Peng Taod7e09d02013-05-02 16:46:55 +08001596 * Optional debugging helper. Print given io slice.
1597 */
1598 int (*cio_print)(const struct lu_env *env, void *cookie,
1599 lu_printer_t p, const struct cl_io_slice *slice);
1600};
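/*
 * A hedged sketch of how a layer might populate this operations vector; the
 * "foo" layer, its io structure and its methods are hypothetical and only
 * illustrate the op[] indexing by io type:
 *
 *	static const struct cl_io_operations foo_io_ops = {
 *		.op = {
 *			[CIT_READ] = {
 *				.cio_iter_init = foo_io_iter_init,
 *				.cio_lock      = foo_io_read_lock,
 *				.cio_start     = foo_io_read_start,
 *				.cio_fini      = foo_io_fini
 *			}
 *		},
 *		.cio_submit = foo_io_submit,
 *	};
 *
 * The layer attaches these operations to a particular io by calling
 * cl_io_slice_add(io, &foo_io->fio_cl, obj, &foo_io_ops) from its
 * io-initialization hook.
 */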
1601
1602/**
1603 * Flags to lock enqueue procedure.
1604 * \ingroup cl_lock
1605 */
1606enum cl_enq_flags {
1607 /**
 1608 * instruct the server not to block if a conflicting lock is found.
 1609 * Instead, -EWOULDBLOCK is returned immediately.
1610 */
1611 CEF_NONBLOCK = 0x00000001,
1612 /**
1613 * take lock asynchronously (out of order), as it cannot
1614 * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
1615 */
1616 CEF_ASYNC = 0x00000002,
1617 /**
 1618 * tell the server to instruct (through a flag in the blocking ast) the
 1619 * owner of the conflicting lock that it can drop dirty pages
 1620 * protected by this lock without sending them to the server.
1621 */
1622 CEF_DISCARD_DATA = 0x00000004,
1623 /**
1624 * tell the sub layers that it must be a `real' lock. This is used for
 1625 * mmapped-buffer locks and glimpse locks that must never be converted
1626 * into lockless mode.
1627 *
1628 * \see vvp_mmap_locks(), cl_glimpse_lock().
1629 */
1630 CEF_MUST = 0x00000008,
1631 /**
 1632 * tell the sub layers to never request a `real' lock. This flag is
1633 * not used currently.
1634 *
1635 * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
 1636 * conversion policy: ci_lockreq describes the generic lock
 1637 * requirement for this IO, especially for locks which belong to the
 1638 * object doing IO; however, the lock itself may have precise requirements
1639 * that are described by the enqueue flags.
1640 */
1641 CEF_NEVER = 0x00000010,
1642 /**
1643 * for async glimpse lock.
1644 */
1645 CEF_AGL = 0x00000020,
1646 /**
Jinshan Xiong06563b52016-03-30 19:48:40 -04001647 * enqueue a lock to test DLM lock existence.
1648 */
1649 CEF_PEEK = 0x00000040,
1650 /**
Peng Taod7e09d02013-05-02 16:46:55 +08001651 * mask of enq_flags.
1652 */
Jinshan Xiong06563b52016-03-30 19:48:40 -04001653 CEF_MASK = 0x0000007f,
Peng Taod7e09d02013-05-02 16:46:55 +08001654};
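/*
 * The flags above are or-ed together when a lock is requested.  A hedged
 * example: a glimpse-style request that must remain a "real" lock and must
 * not wait for conflicts could be described as
 *
 *	__u32 enq_flags = CEF_ASYNC | CEF_MUST | CEF_NONBLOCK;
 *
 * CEF_MASK lets callers and assertions check that only defined bits are
 * used, e.g. LASSERT(!(enq_flags & ~CEF_MASK)).
 */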
1655
1656/**
1657 * Link between lock and io. Intermediate structure is needed, because the
1658 * same lock can be part of multiple io's simultaneously.
1659 */
1660struct cl_io_lock_link {
1661 /** linkage into one of cl_lockset lists. */
1662 struct list_head cill_linkage;
Jinshan Xiong06563b52016-03-30 19:48:40 -04001663 struct cl_lock cill_lock;
Peng Taod7e09d02013-05-02 16:46:55 +08001664 /** optional destructor */
1665 void (*cill_fini)(const struct lu_env *env,
Oleg Drokin10457d42016-02-26 01:49:50 -05001666 struct cl_io_lock_link *link);
Peng Taod7e09d02013-05-02 16:46:55 +08001667};
Jinshan Xiong06563b52016-03-30 19:48:40 -04001668#define cill_descr cill_lock.cll_descr
Peng Taod7e09d02013-05-02 16:46:55 +08001669
1670/**
 1671 * Lock-set represents a collection of locks that an io needs at a
 1672 * time. Generally speaking, the client tries to avoid holding multiple locks
 1673 * when possible, because
1674 *
1675 * - holding extent locks over multiple ost's introduces the danger of
1676 * "cascading timeouts";
1677 *
1678 * - holding multiple locks over the same ost is still dead-lock prone,
1679 * see comment in osc_lock_enqueue(),
1680 *
1681 * but there are certain situations where this is unavoidable:
1682 *
1683 * - O_APPEND writes have to take [0, EOF] lock for correctness;
1684 *
1685 * - truncate has to take [new-size, EOF] lock for correctness;
1686 *
1687 * - SNS has to take locks across full stripe for correctness;
1688 *
 1689 * - in the case when a user-level buffer, supplied to {read,write}(file0),
 1690 * is a part of a memory-mapped lustre file, the client has to take dlm
 1691 * locks on file0 and on all files that back up the buffer (or the part
 1692 * of the buffer that is being processed in the current chunk); in any
 1693 * case, there are situations where at least 2 locks are necessary.
1694 *
1695 * In such cases we at least try to take locks in the same consistent
1696 * order. To this end, all locks are first collected, then sorted, and then
1697 * enqueued.
1698 */
1699struct cl_lockset {
1700 /** locks to be acquired. */
1701 struct list_head cls_todo;
Peng Taod7e09d02013-05-02 16:46:55 +08001702 /** locks acquired. */
1703 struct list_head cls_done;
1704};
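/*
 * A hedged sketch of a layer's cio_lock() method feeding this set.  Per the
 * comment on cio_lock() above, the method only describes and posts the locks
 * it needs; generic code sorts cls_todo, enqueues each lock and moves it to
 * cls_done.  The descriptor initializer is elided because cl_lock_descr is
 * defined elsewhere in this header; cl_io_lock_alloc_add() is assumed to
 * copy it into a link it allocates.
 *
 *	static int foo_io_lock(const struct lu_env *env,
 *			       const struct cl_io_slice *ios)
 *	{
 *		struct cl_io *io = ios->cis_io;
 *		struct cl_lock_descr descr = { ... };	// extent + lock mode
 *
 *		return cl_io_lock_alloc_add(env, io, &descr);
 *	}
 */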
1705
1706/**
 1707 * Lock requirements (demand) for IO. It should be cl_io_lock_req,
 1708 * but 'req' is always to be thought of as 'request' :-)
1709 */
1710enum cl_io_lock_dmd {
1711 /** Always lock data (e.g., O_APPEND). */
1712 CILR_MANDATORY = 0,
1713 /** Layers are free to decide between local and global locking. */
1714 CILR_MAYBE,
Oleg Drokin85f552d2016-02-26 01:49:56 -05001715 /** Never lock: there is no cache (e.g., lockless IO). */
Peng Taod7e09d02013-05-02 16:46:55 +08001716 CILR_NEVER
1717};
1718
1719enum cl_fsync_mode {
1720 /** start writeback, do not wait for them to finish */
1721 CL_FSYNC_NONE = 0,
1722 /** start writeback and wait for them to finish */
1723 CL_FSYNC_LOCAL = 1,
1724 /** discard all of dirty pages in a specific file range */
1725 CL_FSYNC_DISCARD = 2,
 1726 /** start writeback and make sure the pages have reached storage before
Oleg Drokinc56e2562016-02-24 22:00:25 -05001727 * returning. An OST_SYNC RPC must be issued and finished
1728 */
Peng Taod7e09d02013-05-02 16:46:55 +08001729 CL_FSYNC_ALL = 3
1730};
1731
1732struct cl_io_rw_common {
1733 loff_t crw_pos;
1734 size_t crw_count;
1735 int crw_nonblock;
1736};
1737
Peng Taod7e09d02013-05-02 16:46:55 +08001738/**
1739 * State for io.
1740 *
 1741 * cl_io is shared by all threads participating in this IO (in the current
 1742 * implementation only one thread advances IO, but the parallel IO design and
 1743 * concurrent copy_*_user() require multiple threads acting on the same IO).
 1744 * It is up to these threads to serialize their activities, including updates
 1745 * to mutable cl_io fields.
1746 */
1747struct cl_io {
1748 /** type of this IO. Immutable after creation. */
1749 enum cl_io_type ci_type;
1750 /** current state of cl_io state machine. */
1751 enum cl_io_state ci_state;
1752 /** main object this io is against. Immutable after creation. */
1753 struct cl_object *ci_obj;
1754 /**
1755 * Upper layer io, of which this io is a part of. Immutable after
1756 * creation.
1757 */
1758 struct cl_io *ci_parent;
1759 /** List of slices. Immutable after creation. */
1760 struct list_head ci_layers;
1761 /** list of locks (to be) acquired by this io. */
1762 struct cl_lockset ci_lockset;
1763 /** lock requirements, this is just a help info for sublayers. */
1764 enum cl_io_lock_dmd ci_lockreq;
1765 union {
1766 struct cl_rd_io {
1767 struct cl_io_rw_common rd;
1768 } ci_rd;
1769 struct cl_wr_io {
1770 struct cl_io_rw_common wr;
1771 int wr_append;
1772 int wr_sync;
1773 } ci_wr;
1774 struct cl_io_rw_common ci_rw;
1775 struct cl_setattr_io {
1776 struct ost_lvb sa_attr;
1777 unsigned int sa_valid;
Peng Taod7e09d02013-05-02 16:46:55 +08001778 } ci_setattr;
1779 struct cl_fault_io {
1780 /** page index within file. */
1781 pgoff_t ft_index;
 1782 /** number of valid bytes on a faulted page. */
1783 int ft_nob;
1784 /** writable page? for nopage() only */
1785 int ft_writable;
1786 /** page of an executable? */
1787 int ft_executable;
1788 /** page_mkwrite() */
1789 int ft_mkwrite;
1790 /** resulting page */
1791 struct cl_page *ft_page;
1792 } ci_fault;
1793 struct cl_fsync_io {
1794 loff_t fi_start;
1795 loff_t fi_end;
Peng Taod7e09d02013-05-02 16:46:55 +08001796 /** file system level fid */
1797 struct lu_fid *fi_fid;
1798 enum cl_fsync_mode fi_mode;
1799 /* how many pages were written/discarded */
1800 unsigned int fi_nr_written;
1801 } ci_fsync;
1802 } u;
1803 struct cl_2queue ci_queue;
1804 size_t ci_nob;
1805 int ci_result;
1806 unsigned int ci_continue:1,
1807 /**
1808 * This io has held grouplock, to inform sublayers that
 1809 * they should not do lockless i/o.
1810 */
1811 ci_no_srvlock:1,
1812 /**
 1813 * The whole IO needs to be restarted because the layout has been changed
1814 */
1815 ci_need_restart:1,
1816 /**
1817 * to not refresh layout - the IO issuer knows that the layout won't
 1818 * change (page operations; a layout change causes all pages to be
 1819 * discarded), or it doesn't matter if it changes (sync).
1820 */
1821 ci_ignore_layout:1,
1822 /**
1823 * Check if layout changed after the IO finishes. Mainly for HSM
 1824 * requirement. If IO occurs to open files, it doesn't need to
 1825 * verify layout because HSM won't release open files.
Masanari Iidabd9070c2014-03-08 22:58:34 +09001826 * Right now, only two operations need to verify layout: glimpse
Peng Taod7e09d02013-05-02 16:46:55 +08001827 * and setattr.
1828 */
JC Lafoucriere5ea17d62013-11-21 22:24:48 +08001829 ci_verify_layout:1,
1830 /**
 1831 * file is released, restore has to be triggered by the vvp layer
1832 */
John L. Hammondec9bca92014-02-28 21:16:35 -05001833 ci_restore_needed:1,
1834 /**
1835 * O_NOATIME
1836 */
1837 ci_noatime:1;
Peng Taod7e09d02013-05-02 16:46:55 +08001838 /**
1839 * Number of pages owned by this IO. For invariant checking.
1840 */
1841 unsigned ci_owned_nr;
1842};
1843
1844/** @} cl_io */
1845
1846/** \addtogroup cl_req cl_req
Oleg Drokinc56e2562016-02-24 22:00:25 -05001847 * @{
1848 */
Peng Taod7e09d02013-05-02 16:46:55 +08001849/** \struct cl_req
1850 * Transfer.
1851 *
1852 * There are two possible modes of transfer initiation on the client:
1853 *
1854 * - immediate transfer: this is started when a high level io wants a page
1855 * or a collection of pages to be transferred right away. Examples:
1856 * read-ahead, synchronous read in the case of non-page aligned write,
1857 * page write-out as a part of extent lock cancellation, page write-out
1858 * as a part of memory cleansing. Immediate transfer can be both
1859 * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
1860 *
1861 * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
1862 * when io wants to transfer a page to the server some time later, when
1863 * it can be done efficiently. Example: pages dirtied by the write(2)
1864 * path.
1865 *
1866 * In any case, transfer takes place in the form of a cl_req, which is a
1867 * representation for a network RPC.
1868 *
1869 * Pages queued for an opportunistic transfer are cached until it is decided
1870 * that efficient RPC can be composed of them. This decision is made by "a
1871 * req-formation engine", currently implemented as a part of osc
1872 * layer. Req-formation depends on many factors: the size of the resulting
1873 * RPC, whether or not multi-object RPCs are supported by the server,
1874 * max-rpc-in-flight limitations, size of the dirty cache, etc.
1875 *
1876 * For the immediate transfer io submits a cl_page_list, that req-formation
1877 * engine slices into cl_req's, possibly adding cached pages to some of
1878 * the resulting req's.
1879 *
1880 * Whenever a page from cl_page_list is added to a newly constructed req, its
1881 * cl_page_operations::cpo_prep() layer methods are called. At that moment,
1882 * page state is atomically changed from cl_page_state::CPS_OWNED to
1883 * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
1884 * is zeroed, and cl_page::cp_req is set to the
1885 * req. cl_page_operations::cpo_prep() method at the particular layer might
1886 * return -EALREADY to indicate that it does not need to submit this page
 1887 * at all. This is possible, for example, if a page submitted for read
 1888 * became up-to-date in the meantime; and for write, if the page doesn't
 1889 * have the dirty bit marked. \see cl_io_submit_rw()
1890 *
1891 * Whenever a cached page is added to a newly constructed req, its
1892 * cl_page_operations::cpo_make_ready() layer methods are called. At that
1893 * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
1894 * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
1895 * req. cl_page_operations::cpo_make_ready() method at the particular layer
1896 * might return -EAGAIN to indicate that this page is not eligible for the
1897 * transfer right now.
1898 *
1899 * FUTURE
1900 *
1901 * Plan is to divide transfers into "priority bands" (indicated when
1902 * submitting cl_page_list, and queuing a page for the opportunistic transfer)
1903 * and allow glueing of cached pages to immediate transfers only within single
1904 * band. This would make high priority transfers (like lock cancellation or
1905 * memory pressure induced write-out) really high priority.
1906 *
1907 */
1908
1909/**
1910 * Per-transfer attributes.
1911 */
1912struct cl_req_attr {
1913 /** Generic attributes for the server consumption. */
1914 struct obdo *cra_oa;
Peng Taod7e09d02013-05-02 16:46:55 +08001915 /** Jobid */
1916 char cra_jobid[JOBSTATS_JOBID_SIZE];
1917};
1918
1919/**
1920 * Transfer request operations definable at every layer.
1921 *
1922 * Concurrency: transfer formation engine synchronizes calls to all transfer
1923 * methods.
1924 */
1925struct cl_req_operations {
1926 /**
1927 * Invoked top-to-bottom by cl_req_prep() when transfer formation is
1928 * complete (all pages are added).
1929 *
1930 * \see osc_req_prep()
1931 */
1932 int (*cro_prep)(const struct lu_env *env,
1933 const struct cl_req_slice *slice);
1934 /**
1935 * Called top-to-bottom to fill in \a oa fields. This is called twice
1936 * with different flags, see bug 10150 and osc_build_req().
1937 *
1938 * \param obj an object from cl_req which attributes are to be set in
1939 * \a oa.
1940 *
1941 * \param oa struct obdo where attributes are placed
1942 *
1943 * \param flags \a oa fields to be filled.
1944 */
1945 void (*cro_attr_set)(const struct lu_env *env,
1946 const struct cl_req_slice *slice,
1947 const struct cl_object *obj,
Oleg Drokin21aef7d2014-08-15 12:55:56 -04001948 struct cl_req_attr *attr, u64 flags);
Peng Taod7e09d02013-05-02 16:46:55 +08001949 /**
1950 * Called top-to-bottom from cl_req_completion() to notify layers that
1951 * transfer completed. Has to free all state allocated by
1952 * cl_device_operations::cdo_req_init().
1953 */
1954 void (*cro_completion)(const struct lu_env *env,
1955 const struct cl_req_slice *slice, int ioret);
1956};
1957
1958/**
1959 * A per-object state that (potentially multi-object) transfer request keeps.
1960 */
1961struct cl_req_obj {
1962 /** object itself */
1963 struct cl_object *ro_obj;
1964 /** reference to cl_req_obj::ro_obj. For debugging. */
John L. Hammond631abc62013-07-25 01:17:30 +08001965 struct lu_ref_link ro_obj_ref;
Peng Taod7e09d02013-05-02 16:46:55 +08001966 /* something else? Number of pages for a given object? */
1967};
1968
1969/**
1970 * Transfer request.
1971 *
1972 * Transfer requests are not reference counted, because IO sub-system owns
1973 * them exclusively and knows when to free them.
1974 *
1975 * Life cycle.
1976 *
1977 * cl_req is created by cl_req_alloc() that calls
1978 * cl_device_operations::cdo_req_init() device methods to allocate per-req
1979 * state in every layer.
1980 *
1981 * Then pages are added (cl_req_page_add()), req keeps track of all objects it
1982 * contains pages for.
1983 *
1984 * Once all pages were collected, cl_page_operations::cpo_prep() method is
1985 * called top-to-bottom. At that point layers can modify req, let it pass, or
1986 * deny it completely. This is to support things like SNS that have transfer
1987 * ordering requirements invisible to the individual req-formation engine.
1988 *
1989 * On transfer completion (or transfer timeout, or failure to initiate the
1990 * transfer of an allocated req), cl_req_operations::cro_completion() method
1991 * is called, after execution of cl_page_operations::cpo_completion() of all
1992 * req's pages.
1993 */
1994struct cl_req {
1995 enum cl_req_type crq_type;
Masanari Iidabd9070c2014-03-08 22:58:34 +09001996 /** A list of pages being transferred */
Peng Taod7e09d02013-05-02 16:46:55 +08001997 struct list_head crq_pages;
1998 /** Number of pages in cl_req::crq_pages */
1999 unsigned crq_nrpages;
2000 /** An array of objects which pages are in ->crq_pages */
2001 struct cl_req_obj *crq_o;
2002 /** Number of elements in cl_req::crq_objs[] */
2003 unsigned crq_nrobjs;
2004 struct list_head crq_layers;
2005};
2006
2007/**
2008 * Per-layer state for request.
2009 */
2010struct cl_req_slice {
2011 struct cl_req *crs_req;
2012 struct cl_device *crs_dev;
2013 struct list_head crs_linkage;
2014 const struct cl_req_operations *crs_ops;
2015};
2016
2017/* @} cl_req */
2018
2019enum cache_stats_item {
2020 /** how many cache lookups were performed */
2021 CS_lookup = 0,
2022 /** how many times cache lookup resulted in a hit */
2023 CS_hit,
2024 /** how many entities are in the cache right now */
2025 CS_total,
2026 /** how many entities in the cache are actively used (and cannot be
Oleg Drokinc56e2562016-02-24 22:00:25 -05002027 * evicted) right now
2028 */
Peng Taod7e09d02013-05-02 16:46:55 +08002029 CS_busy,
2030 /** how many entities were created at all */
2031 CS_create,
2032 CS_NR
2033};
2034
2035#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
2036
2037/**
2038 * Stats for a generic cache (similar to inode, lu_object, etc. caches).
2039 */
2040struct cache_stats {
2041 const char *cs_name;
2042 atomic_t cs_stats[CS_NR];
2043};
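/*
 * A hedged usage sketch: cache code is expected to bump these counters
 * directly with the atomic helpers ("site" and "hit" below are illustrative
 * variables, not part of this interface):
 *
 *	struct cache_stats *cs = &site->cs_pages;
 *
 *	atomic_inc(&cs->cs_stats[CS_lookup]);
 *	if (hit)
 *		atomic_inc(&cs->cs_stats[CS_hit]);
 */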
2044
2045/** These are not exported so far */
Oleg Drokin10457d42016-02-26 01:49:50 -05002046void cache_stats_init(struct cache_stats *cs, const char *name);
Peng Taod7e09d02013-05-02 16:46:55 +08002047
2048/**
2049 * Client-side site. This represents particular client stack. "Global"
2050 * variables should (directly or indirectly) be added here to allow multiple
2051 * clients to co-exist in the single address space.
2052 */
2053struct cl_site {
2054 struct lu_site cs_lu;
2055 /**
2056 * Statistical counters. Atomics do not scale, something better like
2057 * per-cpu counters is needed.
2058 *
Oleg Drokin406c1c72016-02-16 00:46:34 -05002059 * These are exported as /sys/kernel/debug/lustre/llite/.../site
Peng Taod7e09d02013-05-02 16:46:55 +08002060 *
2061 * When interpreting keep in mind that both sub-locks (and sub-pages)
2062 * and top-locks (and top-pages) are accounted here.
2063 */
2064 struct cache_stats cs_pages;
Peng Taod7e09d02013-05-02 16:46:55 +08002065 atomic_t cs_pages_state[CPS_NR];
Peng Taod7e09d02013-05-02 16:46:55 +08002066};
2067
Oleg Drokin10457d42016-02-26 01:49:50 -05002068int cl_site_init(struct cl_site *s, struct cl_device *top);
2069void cl_site_fini(struct cl_site *s);
Peng Taod7e09d02013-05-02 16:46:55 +08002070void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
2071
2072/**
2073 * Output client site statistical counters into a buffer. Suitable for
2074 * ll_rd_*()-style functions.
2075 */
Peng Tao73bb1da2013-05-29 21:40:55 +08002076int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);
Peng Taod7e09d02013-05-02 16:46:55 +08002077
2078/**
2079 * \name helpers
2080 *
2081 * Type conversion and accessory functions.
2082 */
2083/** @{ */
2084
2085static inline struct cl_site *lu2cl_site(const struct lu_site *site)
2086{
2087 return container_of(site, struct cl_site, cs_lu);
2088}
2089
2090static inline int lu_device_is_cl(const struct lu_device *d)
2091{
2092 return d->ld_type->ldt_tags & LU_DEVICE_CL;
2093}
2094
2095static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
2096{
Oleg Drokind2a13982016-02-16 00:46:52 -05002097 LASSERT(!d || IS_ERR(d) || lu_device_is_cl(d));
Peng Taod7e09d02013-05-02 16:46:55 +08002098 return container_of0(d, struct cl_device, cd_lu_dev);
2099}
2100
2101static inline struct lu_device *cl2lu_dev(struct cl_device *d)
2102{
2103 return &d->cd_lu_dev;
2104}
2105
2106static inline struct cl_object *lu2cl(const struct lu_object *o)
2107{
Oleg Drokind2a13982016-02-16 00:46:52 -05002108 LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
Peng Taod7e09d02013-05-02 16:46:55 +08002109 return container_of0(o, struct cl_object, co_lu);
2110}
2111
2112static inline const struct cl_object_conf *
2113lu2cl_conf(const struct lu_object_conf *conf)
2114{
2115 return container_of0(conf, struct cl_object_conf, coc_lu);
2116}
2117
2118static inline struct cl_object *cl_object_next(const struct cl_object *obj)
2119{
2120 return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
2121}
2122
2123static inline struct cl_device *cl_object_device(const struct cl_object *o)
2124{
Oleg Drokind2a13982016-02-16 00:46:52 -05002125 LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
Peng Taod7e09d02013-05-02 16:46:55 +08002126 return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
2127}
2128
2129static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
2130{
2131 return container_of0(h, struct cl_object_header, coh_lu);
2132}
2133
2134static inline struct cl_site *cl_object_site(const struct cl_object *obj)
2135{
2136 return lu2cl_site(obj->co_lu.lo_dev->ld_site);
2137}
2138
2139static inline
2140struct cl_object_header *cl_object_header(const struct cl_object *obj)
2141{
2142 return luh2coh(obj->co_lu.lo_header);
2143}
2144
2145static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
2146{
2147 return lu_device_init(&d->cd_lu_dev, t);
2148}
2149
2150static inline void cl_device_fini(struct cl_device *d)
2151{
2152 lu_device_fini(&d->cd_lu_dev);
2153}
2154
2155void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
Jinshan Xiongfd7444f2016-03-30 19:48:33 -04002156 struct cl_object *obj, pgoff_t index,
Peng Taod7e09d02013-05-02 16:46:55 +08002157 const struct cl_page_operations *ops);
2158void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
2159 struct cl_object *obj,
2160 const struct cl_lock_operations *ops);
2161void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
2162 struct cl_object *obj, const struct cl_io_operations *ops);
2163void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
2164 struct cl_device *dev,
2165 const struct cl_req_operations *ops);
2166/** @} helpers */
2167
2168/** \defgroup cl_object cl_object
Oleg Drokinc56e2562016-02-24 22:00:25 -05002169 * @{
2170 */
Oleg Drokin10457d42016-02-26 01:49:50 -05002171struct cl_object *cl_object_top(struct cl_object *o);
Peng Taod7e09d02013-05-02 16:46:55 +08002172struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
2173 const struct lu_fid *fid,
2174 const struct cl_object_conf *c);
2175
2176int cl_object_header_init(struct cl_object_header *h);
Oleg Drokin10457d42016-02-26 01:49:50 -05002177void cl_object_put(const struct lu_env *env, struct cl_object *o);
2178void cl_object_get(struct cl_object *o);
2179void cl_object_attr_lock(struct cl_object *o);
Peng Taod7e09d02013-05-02 16:46:55 +08002180void cl_object_attr_unlock(struct cl_object *o);
Oleg Drokin10457d42016-02-26 01:49:50 -05002181int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
2182 struct cl_attr *attr);
2183int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
2184 const struct cl_attr *attr, unsigned valid);
2185int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
2186 struct ost_lvb *lvb);
2187int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
2188 const struct cl_object_conf *conf);
Jinshan Xiong06563b52016-03-30 19:48:40 -04002189int cl_object_prune(const struct lu_env *env, struct cl_object *obj);
Oleg Drokin10457d42016-02-26 01:49:50 -05002190void cl_object_kill(const struct lu_env *env, struct cl_object *obj);
Peng Taod7e09d02013-05-02 16:46:55 +08002191
2192/**
2193 * Returns true, iff \a o0 and \a o1 are slices of the same object.
2194 */
2195static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
2196{
2197 return cl_object_header(o0) == cl_object_header(o1);
2198}
2199
2200static inline void cl_object_page_init(struct cl_object *clob, int size)
2201{
2202 clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
Jinshan Xiong7addf402016-03-30 19:48:32 -04002203 cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size);
Peng Taod7e09d02013-05-02 16:46:55 +08002204}
2205
2206static inline void *cl_object_page_slice(struct cl_object *clob,
2207 struct cl_page *page)
2208{
2209 return (void *)((char *)page + clob->co_slice_off);
2210}
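/*
 * Together cl_object_page_init() and cl_object_page_slice() implement the
 * single-allocation scheme for cl_page: each layer reserves room for its page
 * slice when the object stack is set up and later locates that slice at a
 * fixed offset.  A hedged sketch with a hypothetical foo layer (foo_page,
 * fp_cl and foo_page_ops are illustrative names only):
 *
 *	// while initializing the layer's object
 *	cl_object_page_init(clob, sizeof(struct foo_page));
 *
 *	// in the layer's page-initialization hook
 *	struct foo_page *fp = cl_object_page_slice(clob, page);
 *
 *	cl_page_slice_add(page, &fp->fp_cl, clob, index, &foo_page_ops);
 */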
2211
Jinshan Xiong3c361c12016-03-30 19:48:29 -04002212/**
2213 * Return refcount of cl_object.
2214 */
2215static inline int cl_object_refc(struct cl_object *clob)
2216{
2217 struct lu_object_header *header = clob->co_lu.lo_header;
2218
2219 return atomic_read(&header->loh_ref);
2220}
2221
Peng Taod7e09d02013-05-02 16:46:55 +08002222/** @} cl_object */
2223
2224/** \defgroup cl_page cl_page
Oleg Drokinc56e2562016-02-24 22:00:25 -05002225 * @{
2226 */
Peng Taod7e09d02013-05-02 16:46:55 +08002227enum {
2228 CLP_GANG_OKAY = 0,
2229 CLP_GANG_RESCHED,
2230 CLP_GANG_AGAIN,
2231 CLP_GANG_ABORT
2232};
2233
2234/* callback of cl_page_gang_lookup() */
Oleg Drokin10457d42016-02-26 01:49:50 -05002235struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *obj,
2236 pgoff_t idx, struct page *vmpage,
2237 enum cl_page_type type);
Jinshan Xiongd9d47902016-03-30 19:48:28 -04002238struct cl_page *cl_page_alloc(const struct lu_env *env,
2239 struct cl_object *o, pgoff_t ind,
2240 struct page *vmpage,
2241 enum cl_page_type type);
Oleg Drokin10457d42016-02-26 01:49:50 -05002242void cl_page_get(struct cl_page *page);
2243void cl_page_put(const struct lu_env *env, struct cl_page *page);
2244void cl_page_print(const struct lu_env *env, void *cookie, lu_printer_t printer,
2245 const struct cl_page *pg);
2246void cl_page_header_print(const struct lu_env *env, void *cookie,
2247 lu_printer_t printer, const struct cl_page *pg);
Oleg Drokin10457d42016-02-26 01:49:50 -05002248struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj);
Peng Taod7e09d02013-05-02 16:46:55 +08002249
2250const struct cl_page_slice *cl_page_at(const struct cl_page *page,
2251 const struct lu_device_type *dtype);
2252
2253/**
2254 * \name ownership
2255 *
2256 * Functions dealing with the ownership of page by io.
2257 */
2258/** @{ */
2259
Oleg Drokin10457d42016-02-26 01:49:50 -05002260int cl_page_own(const struct lu_env *env,
2261 struct cl_io *io, struct cl_page *page);
2262int cl_page_own_try(const struct lu_env *env,
2263 struct cl_io *io, struct cl_page *page);
2264void cl_page_assume(const struct lu_env *env,
2265 struct cl_io *io, struct cl_page *page);
2266void cl_page_unassume(const struct lu_env *env,
2267 struct cl_io *io, struct cl_page *pg);
2268void cl_page_disown(const struct lu_env *env,
2269 struct cl_io *io, struct cl_page *page);
2270int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io);
Peng Taod7e09d02013-05-02 16:46:55 +08002271
2272/** @} ownership */
2273
2274/**
2275 * \name transfer
2276 *
2277 * Functions dealing with the preparation of a page for a transfer, and
2278 * tracking transfer state.
2279 */
2280/** @{ */
Oleg Drokin10457d42016-02-26 01:49:50 -05002281int cl_page_prep(const struct lu_env *env, struct cl_io *io,
2282 struct cl_page *pg, enum cl_req_type crt);
2283void cl_page_completion(const struct lu_env *env,
2284 struct cl_page *pg, enum cl_req_type crt, int ioret);
2285int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
2286 enum cl_req_type crt);
2287int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
2288 struct cl_page *pg, enum cl_req_type crt);
2289void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
2290 int from, int to);
2291int cl_page_cancel(const struct lu_env *env, struct cl_page *page);
2292int cl_page_flush(const struct lu_env *env, struct cl_io *io,
2293 struct cl_page *pg);
Peng Taod7e09d02013-05-02 16:46:55 +08002294
2295/** @} transfer */
2296
Peng Taod7e09d02013-05-02 16:46:55 +08002297/**
2298 * \name helper routines
2299 * Functions to discard, delete and export a cl_page.
2300 */
2301/** @{ */
Oleg Drokin10457d42016-02-26 01:49:50 -05002302void cl_page_discard(const struct lu_env *env, struct cl_io *io,
2303 struct cl_page *pg);
2304void cl_page_delete(const struct lu_env *env, struct cl_page *pg);
Oleg Drokin10457d42016-02-26 01:49:50 -05002305int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg);
2306void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate);
2307int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
Jinshan Xiongfd7444f2016-03-30 19:48:33 -04002308 struct cl_page *page, pgoff_t *max_index);
Oleg Drokin10457d42016-02-26 01:49:50 -05002309loff_t cl_offset(const struct cl_object *obj, pgoff_t idx);
2310pgoff_t cl_index(const struct cl_object *obj, loff_t offset);
2311int cl_page_size(const struct cl_object *obj);
2312int cl_pages_prune(const struct lu_env *env, struct cl_object *obj);
Peng Taod7e09d02013-05-02 16:46:55 +08002313
Oleg Drokin10457d42016-02-26 01:49:50 -05002314void cl_lock_print(const struct lu_env *env, void *cookie,
2315 lu_printer_t printer, const struct cl_lock *lock);
Peng Taod7e09d02013-05-02 16:46:55 +08002316void cl_lock_descr_print(const struct lu_env *env, void *cookie,
2317 lu_printer_t printer,
2318 const struct cl_lock_descr *descr);
2319/* @} helper */
2320
John L. Hammond0d345652016-03-30 19:48:45 -04002321/**
2322 * Data structure managing a client's cached pages. A count of
2323 * "unstable" pages is maintained, and an LRU of clean pages is
2324 * maintained. "unstable" pages are pages pinned by the ptlrpc
2325 * layer for recovery purposes.
2326 */
2327struct cl_client_cache {
2328 /**
2329 * # of users (OSCs)
2330 */
2331 atomic_t ccc_users;
2332 /**
2333 * # of threads are doing shrinking
2334 */
2335 unsigned int ccc_lru_shrinkers;
2336 /**
2337 * # of LRU entries available
2338 */
2339 atomic_t ccc_lru_left;
2340 /**
2341 * List of entities(OSCs) for this LRU cache
2342 */
2343 struct list_head ccc_lru;
2344 /**
2345 * Max # of LRU entries
2346 */
2347 unsigned long ccc_lru_max;
2348 /**
2349 * Lock to protect ccc_lru list
2350 */
2351 spinlock_t ccc_lru_lock;
Prakash Suryaac5b1482016-04-27 18:21:04 -04002352 /**
2353 * # of unstable pages for this mount point
2354 */
2355 atomic_t ccc_unstable_nr;
2356 /**
2357 * Waitq for awaiting unstable pages to reach zero.
2358 * Used at umounting time and signaled on BRW commit
2359 */
2360 wait_queue_head_t ccc_unstable_waitq;
2361
John L. Hammond0d345652016-03-30 19:48:45 -04002362};
2363
Peng Taod7e09d02013-05-02 16:46:55 +08002364/** @} cl_page */
2365
2366/** \defgroup cl_lock cl_lock
Oleg Drokinc56e2562016-02-24 22:00:25 -05002367 * @{
2368 */
Peng Taod7e09d02013-05-02 16:46:55 +08002369
Jinshan Xiong06563b52016-03-30 19:48:40 -04002370int cl_lock_request(const struct lu_env *env, struct cl_io *io,
2371 struct cl_lock *lock);
2372int cl_lock_init(const struct lu_env *env, struct cl_lock *lock,
2373 const struct cl_io *io);
2374void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock);
Peng Taod7e09d02013-05-02 16:46:55 +08002375const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
2376 const struct lu_device_type *dtype);
Jinshan Xiong06563b52016-03-30 19:48:40 -04002377void cl_lock_release(const struct lu_env *env, struct cl_lock *lock);
2378int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io,
2379 struct cl_lock *lock, struct cl_sync_io *anchor);
Peng Taod7e09d02013-05-02 16:46:55 +08002380void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
Peng Taod7e09d02013-05-02 16:46:55 +08002381
2382/** @} cl_lock */
2383
2384/** \defgroup cl_io cl_io
Oleg Drokinc56e2562016-02-24 22:00:25 -05002385 * @{
2386 */
Peng Taod7e09d02013-05-02 16:46:55 +08002387
Oleg Drokin10457d42016-02-26 01:49:50 -05002388int cl_io_init(const struct lu_env *env, struct cl_io *io,
2389 enum cl_io_type iot, struct cl_object *obj);
2390int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
2391 enum cl_io_type iot, struct cl_object *obj);
2392int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
2393 enum cl_io_type iot, loff_t pos, size_t count);
2394int cl_io_loop(const struct lu_env *env, struct cl_io *io);
Peng Taod7e09d02013-05-02 16:46:55 +08002395
Oleg Drokin10457d42016-02-26 01:49:50 -05002396void cl_io_fini(const struct lu_env *env, struct cl_io *io);
2397int cl_io_iter_init(const struct lu_env *env, struct cl_io *io);
2398void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io);
2399int cl_io_lock(const struct lu_env *env, struct cl_io *io);
2400void cl_io_unlock(const struct lu_env *env, struct cl_io *io);
2401int cl_io_start(const struct lu_env *env, struct cl_io *io);
2402void cl_io_end(const struct lu_env *env, struct cl_io *io);
2403int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
2404 struct cl_io_lock_link *link);
2405int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
2406 struct cl_lock_descr *descr);
2407int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
2408 struct cl_page *page);
Oleg Drokin10457d42016-02-26 01:49:50 -05002409int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
2410 enum cl_req_type iot, struct cl_2queue *queue);
2411int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
2412 enum cl_req_type iot, struct cl_2queue *queue,
2413 long timeout);
Jinshan Xiong77605e42016-03-30 19:48:30 -04002414int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
2415 struct cl_page_list *queue, int from, int to,
2416 cl_commit_cbt cb);
Oleg Drokin10457d42016-02-26 01:49:50 -05002417int cl_io_is_going(const struct lu_env *env);
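/*
 * A hedged sketch of the typical top-level use of the helpers above for a
 * read(2)-style io.  Error handling, the cl_env setup and the association of
 * the io with per-thread storage are omitted; setting ci_obj before the init
 * call is an assumption of this sketch.
 *
 *	io->ci_obj = clob;
 *	if (cl_io_rw_init(env, io, CIT_READ, pos, count) == 0) {
 *		// layers have attached their slices; drive the state machine
 *		result = cl_io_loop(env, io);
 *	}
 *	cl_io_fini(env, io);
 *
 * If io->ci_need_restart is set after cl_io_fini(), the caller is expected
 * to repeat the whole sequence (e.g. after a layout change).
 */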
Peng Taod7e09d02013-05-02 16:46:55 +08002418
2419/**
2420 * True, iff \a io is an O_APPEND write(2).
2421 */
2422static inline int cl_io_is_append(const struct cl_io *io)
2423{
2424 return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
2425}
2426
2427static inline int cl_io_is_sync_write(const struct cl_io *io)
2428{
2429 return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
2430}
2431
2432static inline int cl_io_is_mkwrite(const struct cl_io *io)
2433{
2434 return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
2435}
2436
2437/**
2438 * True, iff \a io is a truncate(2).
2439 */
2440static inline int cl_io_is_trunc(const struct cl_io *io)
2441{
2442 return io->ci_type == CIT_SETATTR &&
2443 (io->u.ci_setattr.sa_valid & ATTR_SIZE);
2444}
2445
2446struct cl_io *cl_io_top(struct cl_io *io);
2447
Joe Perchesec83e612013-10-13 20:22:03 -07002448#define CL_IO_SLICE_CLEAN(foo_io, base) \
2449do { \
2450 typeof(foo_io) __foo_io = (foo_io); \
Peng Taod7e09d02013-05-02 16:46:55 +08002451 \
Joe Perchesec83e612013-10-13 20:22:03 -07002452 CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \
2453 memset(&__foo_io->base + 1, 0, \
2454 sizeof(*__foo_io) - sizeof(__foo_io->base)); \
Peng Taod7e09d02013-05-02 16:46:55 +08002455} while (0)
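/*
 * CL_IO_SLICE_CLEAN() zeroes everything in a layer's io structure except the
 * embedded slice itself, which therefore must be the first member.  A hedged
 * usage sketch with a hypothetical layer type:
 *
 *	struct foo_io {
 *		struct cl_io_slice fio_cl;	// must be at offset 0
 *		int		   fio_state;	// cleared by the macro
 *	};
 *
 *	struct foo_io *fio = ...;
 *
 *	CL_IO_SLICE_CLEAN(fio, fio_cl);
 */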
2456
2457/** @} cl_io */
2458
2459/** \defgroup cl_page_list cl_page_list
Oleg Drokinc56e2562016-02-24 22:00:25 -05002460 * @{
2461 */
Peng Taod7e09d02013-05-02 16:46:55 +08002462
2463/**
2464 * Last page in the page list.
2465 */
2466static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
2467{
2468 LASSERT(plist->pl_nr > 0);
2469 return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
2470}
2471
Jinshan Xiong77605e42016-03-30 19:48:30 -04002472static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist)
2473{
2474 LASSERT(plist->pl_nr > 0);
2475 return list_entry(plist->pl_pages.next, struct cl_page, cp_batch);
2476}
2477
Peng Taod7e09d02013-05-02 16:46:55 +08002478/**
2479 * Iterate over pages in a page list.
2480 */
2481#define cl_page_list_for_each(page, list) \
2482 list_for_each_entry((page), &(list)->pl_pages, cp_batch)
2483
2484/**
2485 * Iterate over pages in a page list, taking possible removals into account.
2486 */
2487#define cl_page_list_for_each_safe(page, temp, list) \
2488 list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
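/*
 * A hedged example of walking a page list, here the pages that cio_submit()
 * moved into \a queue->c2_qout, marking each one up to date:
 *
 *	struct cl_page *page;
 *
 *	cl_page_list_for_each(page, &queue->c2_qout)
 *		cl_page_export(env, page, 1);
 *
 * Whenever the loop body may remove the current page from the list (e.g. via
 * cl_page_list_del() or cl_page_list_move()), the _safe variant must be used
 * instead.
 */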
2489
Oleg Drokin10457d42016-02-26 01:49:50 -05002490void cl_page_list_init(struct cl_page_list *plist);
2491void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page);
2492void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
2493 struct cl_page *page);
Jinshan Xiong77605e42016-03-30 19:48:30 -04002494void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src,
2495 struct cl_page *page);
Oleg Drokin10457d42016-02-26 01:49:50 -05002496void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head);
Jinshan Xiong77605e42016-03-30 19:48:30 -04002497void cl_page_list_del(const struct lu_env *env, struct cl_page_list *plist,
2498 struct cl_page *page);
Oleg Drokin10457d42016-02-26 01:49:50 -05002499void cl_page_list_disown(const struct lu_env *env,
2500 struct cl_io *io, struct cl_page_list *plist);
Jinshan Xiong77605e42016-03-30 19:48:30 -04002501void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist);
Peng Taod7e09d02013-05-02 16:46:55 +08002502
Oleg Drokin10457d42016-02-26 01:49:50 -05002503void cl_2queue_init(struct cl_2queue *queue);
2504void cl_2queue_disown(const struct lu_env *env,
2505 struct cl_io *io, struct cl_2queue *queue);
2506void cl_2queue_discard(const struct lu_env *env,
2507 struct cl_io *io, struct cl_2queue *queue);
2508void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue);
Peng Taod7e09d02013-05-02 16:46:55 +08002509void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
2510
2511/** @} cl_page_list */
2512
2513/** \defgroup cl_req cl_req
Oleg Drokinc56e2562016-02-24 22:00:25 -05002514 * @{
2515 */
Peng Taod7e09d02013-05-02 16:46:55 +08002516struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
2517 enum cl_req_type crt, int nr_objects);
2518
Oleg Drokin10457d42016-02-26 01:49:50 -05002519void cl_req_page_add(const struct lu_env *env, struct cl_req *req,
2520 struct cl_page *page);
2521void cl_req_page_done(const struct lu_env *env, struct cl_page *page);
2522int cl_req_prep(const struct lu_env *env, struct cl_req *req);
2523void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
2524 struct cl_req_attr *attr, u64 flags);
Peng Taod7e09d02013-05-02 16:46:55 +08002525void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
2526
2527/** \defgroup cl_sync_io cl_sync_io
Oleg Drokin10457d42016-02-26 01:49:50 -05002528 * @{
2529 */
Peng Taod7e09d02013-05-02 16:46:55 +08002530
2531/**
 2532 * Anchor for synchronous transfer. This is allocated on the stack by the
 2533 * thread doing a synchronous transfer, and a pointer to this structure is set
 2534 * up in every page submitted for transfer. The transfer completion routine
 2535 * updates the anchor and wakes up the waiting thread when transfer completes.
2536 */
2537struct cl_sync_io {
2538 /** number of pages yet to be transferred. */
2539 atomic_t csi_sync_nr;
2540 /** error code. */
2541 int csi_sync_rc;
2542 /** barrier of destroy this structure */
2543 atomic_t csi_barrier;
2544 /** completion to be signaled when transfer is complete. */
2545 wait_queue_head_t csi_waitq;
Jinshan Xionge5c4e632016-03-30 19:48:39 -04002546 /** callback to invoke when this IO is finished */
2547 void (*csi_end_io)(const struct lu_env *,
2548 struct cl_sync_io *);
Peng Taod7e09d02013-05-02 16:46:55 +08002549};
2550
Jinshan Xionge5c4e632016-03-30 19:48:39 -04002551void cl_sync_io_init(struct cl_sync_io *anchor, int nr,
2552 void (*end)(const struct lu_env *, struct cl_sync_io *));
2553int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor,
Peng Taod7e09d02013-05-02 16:46:55 +08002554 long timeout);
Jinshan Xionge5c4e632016-03-30 19:48:39 -04002555void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor,
2556 int ioret);
2557void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor);
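/*
 * A hedged sketch of a synchronous submission built on the anchor above (this
 * is roughly the pattern that cl_io_submit_sync() is assumed to wrap).  The
 * step that points every submitted page at the anchor is omitted here.
 *
 *	struct cl_sync_io anchor;
 *	int rc;
 *
 *	cl_sync_io_init(&anchor, queue->c2_qin.pl_nr, cl_sync_io_end);
 *	rc = cl_io_submit_rw(env, io, CRT_READ, queue);
 *	if (rc == 0)
 *		rc = cl_sync_io_wait(env, &anchor, timeout);
 *
 * Completion handlers call cl_sync_io_note() for every finished page; the
 * last call wakes up the waiter.
 */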
Peng Taod7e09d02013-05-02 16:46:55 +08002558
2559/** @} cl_sync_io */
2560
2561/** @} cl_req */
2562
2563/** \defgroup cl_env cl_env
2564 *
2565 * lu_env handling for a client.
2566 *
2567 * lu_env is an environment within which lustre code executes. Its major part
2568 * is lu_context---a fast memory allocation mechanism that is used to conserve
2569 * precious kernel stack space. Originally lu_env was designed for a server,
2570 * where
2571 *
2572 * - there is a (mostly) fixed number of threads, and
2573 *
2574 * - call chains have no non-lustre portions inserted between lustre code.
2575 *
Masanari Iidabd9070c2014-03-08 22:58:34 +09002576 * On a client both of these assumptions fail, because every user thread can
Peng Taod7e09d02013-05-02 16:46:55 +08002577 * potentially execute lustre code as part of a system call, and lustre calls
2578 * into VFS or MM that call back into lustre.
2579 *
2580 * To deal with that, cl_env wrapper functions implement the following
2581 * optimizations:
2582 *
2583 * - allocation and destruction of environment is amortized by caching no
2584 * longer used environments instead of destroying them;
2585 *
2586 * - there is a notion of "current" environment, attached to the kernel
 2587 * data structure representing the current thread. Top-level lustre code
2588 * allocates an environment and makes it current, then calls into
2589 * non-lustre code, that in turn calls lustre back. Low-level lustre
2590 * code thus called can fetch environment created by the top-level code
2591 * and reuse it, avoiding additional environment allocation.
2592 * Right now, three interfaces can attach the cl_env to running thread:
2593 * - cl_env_get
2594 * - cl_env_implant
 2595 * - cl_env_reexit() (cl_env_reenter() must have been called beforehand)
2596 *
2597 * \see lu_env, lu_context, lu_context_key
Oleg Drokinc56e2562016-02-24 22:00:25 -05002598 * @{
2599 */
Peng Taod7e09d02013-05-02 16:46:55 +08002600
2601struct cl_env_nest {
2602 int cen_refcheck;
2603 void *cen_cookie;
2604};
2605
Oleg Drokin10457d42016-02-26 01:49:50 -05002606struct lu_env *cl_env_get(int *refcheck);
2607struct lu_env *cl_env_alloc(int *refcheck, __u32 tags);
2608struct lu_env *cl_env_nested_get(struct cl_env_nest *nest);
2609void cl_env_put(struct lu_env *env, int *refcheck);
2610void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env);
2611void *cl_env_reenter(void);
2612void cl_env_reexit(void *cookie);
2613void cl_env_implant(struct lu_env *env, int *refcheck);
2614void cl_env_unplant(struct lu_env *env, int *refcheck);
Jinshan Xiong26f98e82016-03-30 19:48:25 -04002615unsigned int cl_env_cache_purge(unsigned int nr);
Jinshan Xiong3c361c12016-03-30 19:48:29 -04002616struct lu_env *cl_env_percpu_get(void);
2617void cl_env_percpu_put(struct lu_env *env);
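/*
 * The common pattern for the functions above is a get/put pair bracketing
 * work that needs an environment; a hedged sketch:
 *
 *	struct lu_env *env;
 *	int refcheck;
 *
 *	env = cl_env_get(&refcheck);
 *	if (IS_ERR(env))
 *		return PTR_ERR(env);
 *	// ... use env for cl_io/cl_page/cl_lock calls ...
 *	cl_env_put(env, &refcheck);
 *
 * cl_env_get() either reuses an environment already attached to the current
 * thread or takes one from the cache (allocating if needed); the refcheck
 * cookie lets cl_env_put() verify balanced usage.
 */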
Peng Taod7e09d02013-05-02 16:46:55 +08002618
2619/** @} cl_env */
2620
2621/*
2622 * Misc
2623 */
Peng Taod7e09d02013-05-02 16:46:55 +08002624void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
2625
2626struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
2627 struct lu_device_type *ldt,
2628 struct lu_device *next);
2629/** @} clio */
2630
2631int cl_global_init(void);
2632void cl_global_fini(void);
2633
2634#endif /* _LINUX_CL_OBJECT_H */