Blame - arch/ia64/kernel/perfmon.c - kernel/msm-4.9

blob: 18c51c37a9a35e825427519ebd90e51dd25c308c [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* This file implements the perfmon-2 subsystem which is used
				3	* to program the IA-64 Performance Monitoring Unit (PMU).
				4	*
				5	* The initial version of perfmon.c was written by
				6	* Ganesh Venkitachalam, IBM Corp.
				7	*
				8	* Then it was modified for perfmon-1.x by Stephane Eranian and
				9	* David Mosberger, Hewlett Packard Co.
				10	*
				11	* Version Perfmon-2.x is a rewrite of perfmon-1.x
				12	* by Stephane Eranian, Hewlett Packard Co.
				13	*
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	14	* Copyright (C) 1999-2005 Hewlett Packard Co
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	15	* Stephane Eranian <eranian@hpl.hp.com>
				16	* David Mosberger-Tang <davidm@hpl.hp.com>
				17	*
				18	* More information about perfmon available at:
				19	* http://www.hpl.hp.com/research/linux/perfmon
				20	*/
				21
				22	#include <linux/config.h>
				23	#include <linux/module.h>
				24	#include <linux/kernel.h>
				25	#include <linux/sched.h>
				26	#include <linux/interrupt.h>
				27	#include <linux/smp_lock.h>
				28	#include <linux/proc_fs.h>
				29	#include <linux/seq_file.h>
				30	#include <linux/init.h>
				31	#include <linux/vmalloc.h>
				32	#include <linux/mm.h>
				33	#include <linux/sysctl.h>
				34	#include <linux/list.h>
				35	#include <linux/file.h>
				36	#include <linux/poll.h>
				37	#include <linux/vfs.h>
				38	#include <linux/pagemap.h>
				39	#include <linux/mount.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	#include <linux/bitops.h>
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame]	41	#include <linux/rcupdate.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	42
				43	#include <asm/errno.h>
				44	#include <asm/intrinsics.h>
				45	#include <asm/page.h>
				46	#include <asm/perfmon.h>
				47	#include <asm/processor.h>
				48	#include <asm/signal.h>
				49	#include <asm/system.h>
				50	#include <asm/uaccess.h>
				51	#include <asm/delay.h>
				52
				53	#ifdef CONFIG_PERFMON
				/*
				 * perfmon context state
				 */
				#define PFM_CTX_UNLOADED	1	/* context is not loaded onto any task */
				#define PFM_CTX_LOADED		2	/* context is loaded onto a task */
				#define PFM_CTX_MASKED		3	/* context is loaded but monitoring is masked due to overflow */
				#define PFM_CTX_ZOMBIE		4	/* owner of the context is closing it */

				#define PFM_INVALID_ACTIVATION	(~0UL)

				/*
				 * depth of message queue
				 */
				#define PFM_MAX_MSGS		32
				#define PFM_CTXQ_EMPTY(g)	((g)->ctx_msgq_head == (g)->ctx_msgq_tail)

				/*
				 * type of a PMU register (bitmask).
				 * bitmask structure:
				 * 	bit0   : register implemented
				 * 	bit1   : end marker
				 * 	bit2-3 : reserved
				 * 	bit4   : pmc has pmc.pm
				 * 	bit5   : pmc controls a counter (has pmc.oi), pmd is used as counter
				 * 	bit6-7 : register type
				 * 	bit8-31: reserved
				 */
				#define PFM_REG_NOTIMPL		0x0 /* not implemented at all */
				#define PFM_REG_IMPL		0x1 /* register implemented */
				#define PFM_REG_END		0x2 /* end marker */
				#define PFM_REG_MONITOR		(0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
				#define PFM_REG_COUNTING	(0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */
				#define PFM_REG_CONTROL		(0x4<<4|PFM_REG_IMPL) /* PMU control register */
				#define PFM_REG_CONFIG		(0x8<<4|PFM_REG_IMPL) /* configuration register */
				#define PFM_REG_BUFFER		(0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */
				89
				/* true when descriptor i carries the end marker of its descriptor table */
				#define PMC_IS_LAST(i)	(pmu_conf->pmc_desc[i].type & PFM_REG_END)
				#define PMD_IS_LAST(i)	(pmu_conf->pmd_desc[i].type & PFM_REG_END)

				#define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)

				/* i assumed unsigned; the argument is parenthesized so expressions can be passed safely */
				#define PMC_IS_IMPL(i)	  ((i) < PMU_MAX_PMCS && (pmu_conf->pmc_desc[(i)].type & PFM_REG_IMPL))
				#define PMD_IS_IMPL(i)	  ((i) < PMU_MAX_PMDS && (pmu_conf->pmd_desc[(i)].type & PFM_REG_IMPL))

				/* XXX: these assume that register i is implemented */
				#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
				#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
				#define PMC_IS_MONITOR(i)  ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR)  == PFM_REG_MONITOR)
				#define PMC_IS_CONTROL(i)  ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL)  == PFM_REG_CONTROL)

				#define PMC_DFL_VAL(i)     pmu_conf->pmc_desc[i].default_value
				#define PMC_RSVD_MASK(i)   pmu_conf->pmc_desc[i].reserved_mask
				#define PMD_PMD_DEP(i)	   pmu_conf->pmd_desc[i].dep_pmd[0]
				#define PMC_PMD_DEP(i)	   pmu_conf->pmc_desc[i].dep_pmd[0]

				#define PFM_NUM_IBRS	  IA64_NUM_DBG_REGS
				#define PFM_NUM_DBRS	  IA64_NUM_DBG_REGS

				#define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_block == 0)
				#define CTX_HAS_SMPL(c)		((c)->ctx_fl_is_sampling)
				#define PFM_CTX_TASK(h)		(h)->ctx_task

				#define PMU_PMC_OI		5 /* position of pmc.oi bit */

				/* XXX: does not support more than 64 PMDs */
				#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
				#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)

				#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask)

				#define CTX_USED_IBR(ctx,n) 	(ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
				#define CTX_USED_DBR(ctx,n) 	(ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
				#define CTX_USES_DBREGS(ctx)	(((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
				#define PFM_CODE_RR	0	/* requesting code range restriction */
				#define PFM_DATA_RR	1	/* requesting data range restriction */

				/* accessors for the per-CPU pfm_syst_info word */
				#define PFM_CPUINFO_CLEAR(v)	pfm_get_cpu_var(pfm_syst_info) &= ~(v)
				#define PFM_CPUINFO_SET(v)	pfm_get_cpu_var(pfm_syst_info) |= (v)
				#define PFM_CPUINFO_GET()	pfm_get_cpu_var(pfm_syst_info)

				/* single-bit mask for register number x */
				#define RDEP(x)	(1UL<<(x))
				136
				/*
				 * context protection macros
				 * in SMP:
				 * 	- we need to protect against CPU concurrency (spin_lock)
				 * 	- we need to protect against PMU overflow interrupts (local_irq_disable)
				 * in UP:
				 * 	- we need to protect against PMU overflow interrupts (local_irq_disable)
				 *
				 * spin_lock_irqsave()/spin_unlock_irqrestore():
				 * 	in SMP: local_irq_disable + spin_lock
				 * 	in UP : local_irq_disable
				 *
				 * spin_lock()/spin_unlock():
				 * 	in UP : removed automatically
				 * 	in SMP: protect against context accesses from other CPU. interrupts
				 * 	        are not masked. This is useful for the PMU interrupt handler
				 * 	        because we know we will not get PMU concurrency in that code.
				 */

				/* lock ctx with interrupts saved in f; traces before and after taking the lock */
				#define PROTECT_CTX(c, f) \
					do {  \
						DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \
						spin_lock_irqsave(&(c)->ctx_lock, f); \
						DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \
					} while(0)

				/* counterpart of PROTECT_CTX(): traces, then unlocks and restores f */
				#define UNPROTECT_CTX(c, f) \
					do { \
						DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \
						spin_unlock_irqrestore(&(c)->ctx_lock, f); \
					} while(0)

				/* same locking as PROTECT_CTX()/UNPROTECT_CTX() without the debug traces */
				#define PROTECT_CTX_NOPRINT(c, f) \
					do {  \
						spin_lock_irqsave(&(c)->ctx_lock, f); \
					} while(0)

				#define UNPROTECT_CTX_NOPRINT(c, f) \
					do { \
						spin_unlock_irqrestore(&(c)->ctx_lock, f); \
					} while(0)

				/* plain spin_lock()/spin_unlock() on the context lock; interrupt state is untouched */
				#define PROTECT_CTX_NOIRQ(c) \
					do {  \
						spin_lock(&(c)->ctx_lock); \
					} while(0)

				#define UNPROTECT_CTX_NOIRQ(c) \
					do { \
						spin_unlock(&(c)->ctx_lock); \
					} while(0)
				189
				190
				#ifdef CONFIG_SMP

				/* per-CPU activation counter accessors; SET_ACTIVATION() records it in the context */
				#define GET_ACTIVATION()	pfm_get_cpu_var(pmu_activation_number)
				#define INC_ACTIVATION()	pfm_get_cpu_var(pmu_activation_number)++
				#define SET_ACTIVATION(c)	(c)->ctx_last_activation = GET_ACTIVATION()

				#else /* !CONFIG_SMP */
				#define SET_ACTIVATION(t) 	do {} while(0)
				#define GET_ACTIVATION(t) 	do {} while(0)
				#define INC_ACTIVATION(t) 	do {} while(0)
				#endif /* CONFIG_SMP */

				/* per-CPU record of the task and context currently owning the PMU */
				#define SET_PMU_OWNER(t, c)	do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0)
				#define GET_PMU_OWNER()		pfm_get_cpu_var(pmu_owner)
				#define GET_PMU_CTX()		pfm_get_cpu_var(pmu_ctx)

				/* lock/unlock the global session table; g holds the saved interrupt flags */
				#define LOCK_PFS(g)	    	spin_lock_irqsave(&pfm_sessions.pfs_lock, g)
				#define UNLOCK_PFS(g)	    	spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g)

				/* replace the return-flag bits of flags with val */
				#define PFM_REG_RETFLAG_SET(flags, val)	do { (flags) &= ~PFM_REG_RETFL_MASK; (flags) |= (val); } while(0)

				/*
				 * cmp0 must be the value of pmc0
				 * (bit 0 is masked out; any other set bit makes the result non-zero)
				 */
				#define PMC0_HAS_OVFL(cmp0)  ((cmp0) & ~0x1UL)

				#define PFMFS_MAGIC 0xa0b4d889
				218
				/*
				 * debugging
				 *
				 * DPRINT() and DPRINT_ovfl() take a doubly parenthesized printk argument
				 * list, e.g. DPRINT(("x=%d\n", x)). Output is gated at run time by
				 * pfm_sysctl.debug (and pfm_sysctl.debug_ovfl for the overflow variant).
				 * When PFM_DEBUGGING is not defined both macros expand to nothing, so
				 * callers still compile.
				 */
				#define PFM_DEBUGGING 1
				#ifdef PFM_DEBUGGING
				#define DPRINT(a) \
					do { \
						if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
					} while (0)

				#define DPRINT_ovfl(a) \
					do { \
						if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
					} while (0)
				#else
				#define DPRINT(a)	do { } while (0)
				#define DPRINT_ovfl(a)	do { } while (0)
				#endif
				234
				/*
				 * 64-bit software counter structure
				 *
				 * the next_reset_type is applied to the next call to pfm_reset_regs()
				 */
				typedef struct {
					unsigned long	val;		/* virtual 64bit counter value */
					unsigned long	lval;		/* last reset value */
					unsigned long	long_reset;	/* reset value on sampling overflow */
					unsigned long	short_reset;    /* reset value on overflow */
					unsigned long	reset_pmds[4];  /* which other pmds to reset when this counter overflows */
					unsigned long	smpl_pmds[4];   /* which pmds are accessed when counter overflow */
					unsigned long	seed;		/* seed for random-number generator */
					unsigned long	mask;		/* mask for random-number generator */
					unsigned int 	flags;		/* notify/do not notify */
					unsigned long	eventid;	/* overflow event identifier */
				} pfm_counter_t;

				/*
				 * context flags (the bit-fields add up to 32 bits)
				 */
				typedef struct {
					unsigned int block:1;		/* when 1, task will blocked on user notifications */
					unsigned int system:1;		/* do system wide monitoring */
					unsigned int using_dbreg:1;	/* using range restrictions (debug registers) */
					unsigned int is_sampling:1;	/* true if using a custom format */
					unsigned int excl_idle:1;	/* exclude idle task in system wide session */
					unsigned int going_zombie:1;	/* context is zombie (MASKED+blocking) */
					unsigned int trap_reason:2;	/* reason for going into pfm_handle_work() */
					unsigned int no_msg:1;		/* no message sent on overflow */
					unsigned int can_restart:1;	/* allowed to issue a PFM_RESTART */
					unsigned int reserved:22;
				} pfm_context_flags_t;

				/* values stored in pfm_context_flags_t.trap_reason */
				#define PFM_TRAP_REASON_NONE		0x0	/* default value */
				#define PFM_TRAP_REASON_BLOCK		0x1	/* we need to block on overflow */
				#define PFM_TRAP_REASON_RESET		0x2	/* we need to reset PMDs */
				272
				273
				274	/*
				275	* perfmon context: encapsulates all the state of a monitoring session
				276	*/
				277
				278	typedef struct pfm_context {
				279	spinlock_t ctx_lock; /* context protection */
				280
				281	pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */
				282	unsigned int ctx_state; /* state: active/inactive (no bitfield) */
				283
				284	struct task_struct ctx_task; / task to which context is attached */
				285
				286	unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */
				287
				288	struct semaphore ctx_restart_sem; /* use for blocking notification mode */
				289
				290	unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */
				291	unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */
				292	unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */
				293
				294	unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */
				295	unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */
				296	unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */
				297
				298	unsigned long ctx_pmcs[IA64_NUM_PMC_REGS]; /* saved copies of PMC values */
				299
				300	unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */
				301	unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */
				302	unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */
				303	unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */
				304
				305	pfm_counter_t ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */
				306
				307	u64 ctx_saved_psr_up; /* only contains psr.up value */
				308
				309	unsigned long ctx_last_activation; /* context last activation number for last_cpu */
				310	unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */
				311	unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */
				312
				313	int ctx_fd; /* file descriptor used my this context */
				314	pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */
				315
				316	pfm_buffer_fmt_t ctx_buf_fmt; / buffer format callbacks */
				317	void ctx_smpl_hdr; / points to sampling buffer header kernel vaddr */
				318	unsigned long ctx_smpl_size; /* size of sampling buffer */
				319	void ctx_smpl_vaddr; / user level virtual address of smpl buffer */
				320
				321	wait_queue_head_t ctx_msgq_wait;
				322	pfm_msg_t ctx_msgq[PFM_MAX_MSGS];
				323	int ctx_msgq_head;
				324	int ctx_msgq_tail;
				325	struct fasync_struct *ctx_async_queue;
				326
				327	wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */
				328	} pfm_context_t;
				329
				/*
				 * a file belongs to perfmon when its file operations are
				 * the perfmon ones
				 */
				#define PFM_IS_FILE(f)		((f)->f_op == &pfm_file_ops)

				/* perfmon context attached to task t, taken from its thread structure */
				#define PFM_GET_CTX(t)	 	((pfm_context_t *)(t)->thread.pfm_context)

				#ifdef CONFIG_SMP
				#define SET_LAST_CPU(ctx, v)	(ctx)->ctx_last_cpu = (v)
				#define GET_LAST_CPU(ctx)	(ctx)->ctx_last_cpu
				#else
				#define SET_LAST_CPU(ctx, v)	do {} while(0)
				#define GET_LAST_CPU(ctx)	do {} while(0)
				#endif


				/* shorthands for the bit-fields of pfm_context_t.ctx_flags */
				#define ctx_fl_block		ctx_flags.block
				#define ctx_fl_system		ctx_flags.system
				#define ctx_fl_using_dbreg	ctx_flags.using_dbreg
				#define ctx_fl_is_sampling	ctx_flags.is_sampling
				#define ctx_fl_excl_idle	ctx_flags.excl_idle
				#define ctx_fl_going_zombie	ctx_flags.going_zombie
				#define ctx_fl_trap_reason	ctx_flags.trap_reason
				#define ctx_fl_no_msg		ctx_flags.no_msg
				#define ctx_fl_can_restart	ctx_flags.can_restart

				/*
				 * no trailing semicolon after while(0): the caller supplies it, which
				 * keeps the macro usable as the body of an if/else
				 */
				#define PFM_SET_WORK_PENDING(t, v)	do { (t)->thread.pfm_needs_checking = (v); } while(0)
				#define PFM_GET_WORK_PENDING(t)		(t)->thread.pfm_needs_checking
				359
				360	/*
				361	* global information about all sessions
				362	* mostly used to synchronize between system wide and per-process
				363	*/
				364	typedef struct {
				365	spinlock_t pfs_lock; /* lock the structure */
				366
				367	unsigned int pfs_task_sessions; /* number of per task sessions */
				368	unsigned int pfs_sys_sessions; /* number of per system wide sessions */
				369	unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */
				370	unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */
				371	struct task_struct pfs_sys_session[NR_CPUS]; / point to task owning a system-wide session */
				372	} pfm_session_t;
				373
				374	/*
				375	* information about a PMC or PMD.
				376	* dep_pmd[]: a bitmask of dependent PMD registers
				377	* dep_pmc[]: a bitmask of dependent PMC registers
				378	*/
				379	typedef int (pfm_reg_check_t)(struct task_struct task, pfm_context_t ctx, unsigned int cnum, unsigned long val, struct pt_regs *regs);
				380	typedef struct {
				381	unsigned int type;
				382	int pm_pos;
				383	unsigned long default_value; /* power-on default value */
				384	unsigned long reserved_mask; /* bitmask of reserved bits */
				385	pfm_reg_check_t read_check;
				386	pfm_reg_check_t write_check;
				387	unsigned long dep_pmd[4];
				388	unsigned long dep_pmc[4];
				389	} pfm_reg_desc_t;
				390
				391	/* assume cnum is a valid monitor */
				392	#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1)
				393
				394	/*
				395	* This structure is initialized at boot time and contains
				396	* a description of the PMU main characteristics.
				397	*
				398	* If the probe function is defined, detection is based
				399	* on its return value:
				400	* - 0 means recognized PMU
				401	* - anything else means not supported
				402	* When the probe function is not defined, then the pmu_family field
				403	* is used and it must match the host CPU family such that:
				404	* - cpu->family & config->pmu_family != 0
				405	*/
				406	typedef struct {
				407	unsigned long ovfl_val; /* overflow value for counters */
				408
				409	pfm_reg_desc_t pmc_desc; / detailed PMC register dependencies descriptions */
				410	pfm_reg_desc_t pmd_desc; / detailed PMD register dependencies descriptions */
				411
				412	unsigned int num_pmcs; /* number of PMCS: computed at init time */
				413	unsigned int num_pmds; /* number of PMDS: computed at init time */
				414	unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */
				415	unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */
				416
				417	char pmu_name; / PMU family name */
				418	unsigned int pmu_family; /* cpuid family pattern used to identify pmu */
				419	unsigned int flags; /* pmu specific flags */
				420	unsigned int num_ibrs; /* number of IBRS: computed at init time */
				421	unsigned int num_dbrs; /* number of DBRS: computed at init time */
				422	unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */
				423	int (probe)(void); / customized probe routine */
				424	unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */
				425	} pmu_config_t;
				426	/*
				427	* PMU specific flags
				428	*/
				429	#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */
				430
				/*
				 * debug register related type definitions
				 *
				 * both layouts describe the same 64-bit word: a 56-bit mask, a 4-bit
				 * plm field, then control bits up to bit 63
				 */
				typedef struct {
					unsigned long ibr_mask:56;
					unsigned long ibr_plm:4;
					unsigned long ibr_ig:3;
					unsigned long ibr_x:1;
				} ibr_mask_reg_t;

				typedef struct {
					unsigned long dbr_mask:56;
					unsigned long dbr_plm:4;
					unsigned long dbr_ig:2;
					unsigned long dbr_w:1;
					unsigned long dbr_r:1;
				} dbr_mask_reg_t;

				/* raw value overlaid with the instruction (ibr) and data (dbr) views */
				typedef union {
					unsigned long  val;
					ibr_mask_reg_t ibr;
					dbr_mask_reg_t dbr;
				} dbreg_t;
				454
				455
				456	/*
				457	* perfmon command descriptions
				458	*/
				459	typedef struct {
				460	int (cmd_func)(pfm_context_t ctx, void arg, int count, struct pt_regs regs);
				461	char *cmd_name;
				462	int cmd_flags;
				463	unsigned int cmd_narg;
				464	size_t cmd_argsize;
				465	int (cmd_getsize)(void arg, size_t *sz);
				466	} pfm_cmd_desc_t;
				467
				#define PFM_CMD_FD		0x01	/* command requires a file descriptor */
				#define PFM_CMD_ARG_READ	0x02	/* command must read argument(s) */
				#define PFM_CMD_ARG_RW		0x04	/* command must read/write argument(s) */
				#define PFM_CMD_STOP		0x08	/* command does not work on zombie context */


				/* lookups into pfm_cmd_tab[]; no range check is done on cmd here */
				#define PFM_CMD_NAME(cmd)	pfm_cmd_tab[(cmd)].cmd_name
				#define PFM_CMD_READ_ARG(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ)
				#define PFM_CMD_RW_ARG(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW)
				#define PFM_CMD_USE_FD(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD)
				#define PFM_CMD_STOPPED(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP)

				#define PFM_CMD_ARG_MANY	(-1) /* cannot be zero */
				481
				482	typedef struct {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	483	unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
				484	unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */
				485	unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
				486	unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */
				487	unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */
				488	unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */
				489	unsigned long pfm_smpl_handler_calls;
				490	unsigned long pfm_smpl_handler_cycles;
				491	char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
				492	} pfm_stats_t;
				493
				494	/*
				495	* perfmon internal variables
				496	*/
				497	static pfm_stats_t pfm_stats[NR_CPUS];
				498	static pfm_session_t pfm_sessions; /* global sessions information */
				499
Ingo Molnar	a9f6a0d	2005-09-09 13:10:41 -0700	[diff] [blame]	500	static DEFINE_SPINLOCK(pfm_alt_install_check);
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	501	static pfm_intr_handler_desc_t *pfm_alt_intr_handler;
				502
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	503	static struct proc_dir_entry *perfmon_dir;
				504	static pfm_uuid_t pfm_null_uuid = {0,};
				505
				506	static spinlock_t pfm_buffer_fmt_lock;
				507	static LIST_HEAD(pfm_buffer_fmt_list);
				508
				509	static pmu_config_t *pmu_conf;
				510
				511	/* sysctl() controls */
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	512	pfm_sysctl_t pfm_sysctl;
				513	EXPORT_SYMBOL(pfm_sysctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	514
				515	static ctl_table pfm_ctl_table[]={
				516	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
				517	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
				518	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
				519	{4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
				520	{ 0, },
				521	};
				522	static ctl_table pfm_sysctl_dir[] = {
				523	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
				524	{0,},
				525	};
				526	static ctl_table pfm_sysctl_root[] = {
				527	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
				528	{0,},
				529	};
				530	static struct ctl_table_header *pfm_sysctl_header;
				531
				532	static int pfm_context_unload(pfm_context_t ctx, void arg, int count, struct pt_regs *regs);
				533	static int pfm_flush(struct file *filp);
				534
				535	#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v)
				536	#define pfm_get_cpu_data(a,b) per_cpu(a, b)
				537
				538	static inline void
				539	pfm_put_task(struct task_struct *task)
				540	{
				541	if (task != current) put_task_struct(task);
				542	}
				543
				544	static inline void
				545	pfm_set_task_notify(struct task_struct *task)
				546	{
				547	struct thread_info *info;
				548
				549	info = (struct thread_info ) ((char ) task + IA64_TASK_SIZE);
				550	set_bit(TIF_NOTIFY_RESUME, &info->flags);
				551	}
				552
				553	static inline void
				554	pfm_clear_task_notify(void)
				555	{
				556	clear_thread_flag(TIF_NOTIFY_RESUME);
				557	}
				558
				/*
				 * Mark as reserved the page backing vmalloc address a.
				 */
				static inline void
				pfm_reserve_page(unsigned long a)
				{
					struct page *pg = vmalloc_to_page((void *)a);

					SetPageReserved(pg);
				}
				/*
				 * Clear the reserved mark on the page backing vmalloc address a.
				 */
				static inline void
				pfm_unreserve_page(unsigned long a)
				{
					struct page *pg = vmalloc_to_page((void*)a);

					ClearPageReserved(pg);
				}
				569
				570	static inline unsigned long
				571	pfm_protect_ctx_ctxsw(pfm_context_t *x)
				572	{
				573	spin_lock(&(x)->ctx_lock);
				574	return 0UL;
				575	}
				576
Peter Chubb	24b8e0c	2005-09-15 15:36:35 +1000	[diff] [blame]	577	static inline void
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	578	pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
				579	{
				580	spin_unlock(&(x)->ctx_lock);
				581	}
				582
				/*
				 * Thin wrapper around do_munmap(); the acct argument is ignored.
				 */
				static inline unsigned int
				pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
				{
					unsigned int ret = do_munmap(mm, addr, len);

					return ret;
				}
				588
				/*
				 * Thin wrapper around get_unmapped_area(); the exec argument is ignored.
				 */
				static inline unsigned long
				pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
				{
					unsigned long area = get_unmapped_area(file, addr, len, pgoff, flags);

					return area;
				}
				594
				595
				596	static struct super_block *
				597	pfmfs_get_sb(struct file_system_type fs_type, int flags, const char dev_name, void *data)
				598	{
				599	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
				600	}
				601
				602	static struct file_system_type pfm_fs_type = {
				603	.name = "pfmfs",
				604	.get_sb = pfmfs_get_sb,
				605	.kill_sb = kill_anon_super,
				606	};
				607
				608	DEFINE_PER_CPU(unsigned long, pfm_syst_info);
				609	DEFINE_PER_CPU(struct task_struct *, pmu_owner);
				610	DEFINE_PER_CPU(pfm_context_t *, pmu_ctx);
				611	DEFINE_PER_CPU(unsigned long, pmu_activation_number);
Tony Luck	fffcc15	2005-05-31 10:38:32 -0700	[diff] [blame]	612	EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	613
				614
				615	/* forward declaration */
				616	static struct file_operations pfm_file_ops;
				617
				618	/*
				619	* forward declarations
				620	*/
				621	#ifndef CONFIG_SMP
				622	static void pfm_lazy_save_regs (struct task_struct *ta);
				623	#endif
				624
				625	void dump_pmu_state(const char *);
				626	static int pfm_write_ibr_dbr(int mode, pfm_context_t ctx, void arg, int count, struct pt_regs *regs);
				627
				628	#include "perfmon_itanium.h"
				629	#include "perfmon_mckinley.h"
				630	#include "perfmon_generic.h"
				631
				632	static pmu_config_t *pmu_confs[]={
				633	&pmu_conf_mck,
				634	&pmu_conf_ita,
				635	&pmu_conf_gen, /* must be last */
				636	NULL
				637	};
				638
				639
				640	static int pfm_end_notify_user(pfm_context_t *ctx);
				641
				642	static inline void
				643	pfm_clear_psr_pp(void)
				644	{
				645	ia64_rsm(IA64_PSR_PP);
				646	ia64_srlz_i();
				647	}
				648
				649	static inline void
				650	pfm_set_psr_pp(void)
				651	{
				652	ia64_ssm(IA64_PSR_PP);
				653	ia64_srlz_i();
				654	}
				655
				656	static inline void
				657	pfm_clear_psr_up(void)
				658	{
				659	ia64_rsm(IA64_PSR_UP);
				660	ia64_srlz_i();
				661	}
				662
				663	static inline void
				664	pfm_set_psr_up(void)
				665	{
				666	ia64_ssm(IA64_PSR_UP);
				667	ia64_srlz_i();
				668	}
				669
				670	static inline unsigned long
				671	pfm_get_psr(void)
				672	{
				673	unsigned long tmp;
				674	tmp = ia64_getreg(_IA64_REG_PSR);
				675	ia64_srlz_i();
				676	return tmp;
				677	}
				678
				679	static inline void
				680	pfm_set_psr_l(unsigned long val)
				681	{
				682	ia64_setreg(_IA64_REG_PSR_L, val);
				683	ia64_srlz_i();
				684	}
				685
				/*
				 * Freeze the PMU: write 1 to pmc0, then data serialization.
				 */
				static inline void
				pfm_freeze_pmu(void)
				{
					ia64_set_pmc(0,1UL);

					ia64_srlz_d();
				}
				692
				/*
				 * Unfreeze the PMU: write 0 to pmc0, then data serialization.
				 */
				static inline void
				pfm_unfreeze_pmu(void)
				{
					ia64_set_pmc(0,0UL);

					ia64_srlz_d();
				}
				699
				/*
				 * Load the first nibrs values of ibrs[] into the instruction
				 * breakpoint registers, in index order, then serialize.
				 *
				 * The index is unsigned to match the type of nibrs and avoid a
				 * signed/unsigned comparison.
				 */
				static inline void
				pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs)
				{
					unsigned int i;

					for (i = 0; i < nibrs; i++) {
						ia64_set_ibr(i, ibrs[i]);
						ia64_dv_serialize_instruction();
					}
					ia64_srlz_i();
				}
				711
				/*
				 * Load the first ndbrs values of dbrs[] into the data breakpoint
				 * registers, in index order, then serialize.
				 *
				 * The index is unsigned to match the type of ndbrs and avoid a
				 * signed/unsigned comparison.
				 */
				static inline void
				pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs)
				{
					unsigned int i;

					for (i = 0; i < ndbrs; i++) {
						ia64_set_dbr(i, dbrs[i]);
						ia64_dv_serialize_data();
					}
					ia64_srlz_d();
				}
				723
				724	/*
				725	* PMD[i] must be a counter. no check is made
				726	*/
				727	static inline unsigned long
				728	pfm_read_soft_counter(pfm_context_t *ctx, int i)
				729	{
				730	return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val);
				731	}
				732
				733	/*
				734	* PMD[i] must be a counter. no check is made
				735	*/
				736	static inline void
				737	pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
				738	{
				739	unsigned long ovfl_val = pmu_conf->ovfl_val;
				740
				741	ctx->ctx_pmds[i].val = val & ~ovfl_val;
				742	/*
				743	* writing to unimplemented part is ignore, so we do not need to
				744	* mask off top part
				745	*/
				746	ia64_set_pmd(i, val & ovfl_val);
				747	}
				748
				749	static pfm_msg_t *
				750	pfm_get_new_msg(pfm_context_t *ctx)
				751	{
				752	int idx, next;
				753
				754	next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS;
				755
				756	DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
				757	if (next == ctx->ctx_msgq_head) return NULL;
				758
				759	idx = ctx->ctx_msgq_tail;
				760	ctx->ctx_msgq_tail = next;
				761
				762	DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx));
				763
				764	return ctx->ctx_msgq+idx;
				765	}
				766
				767	static pfm_msg_t *
				768	pfm_get_next_msg(pfm_context_t *ctx)
				769	{
				770	pfm_msg_t *msg;
				771
				772	DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
				773
				774	if (PFM_CTXQ_EMPTY(ctx)) return NULL;
				775
				776	/*
				777	* get oldest message
				778	*/
				779	msg = ctx->ctx_msgq+ctx->ctx_msgq_head;
				780
				781	/*
				782	* and move forward
				783	*/
				784	ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS;
				785
				786	DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type));
				787
				788	return msg;
				789	}
				790
				791	static void
				792	pfm_reset_msgq(pfm_context_t *ctx)
				793	{
				794	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
				795	DPRINT(("ctx=%p msgq reset\n", ctx));
				796	}
				797
				798	static void *
				799	pfm_rvmalloc(unsigned long size)
				800	{
				801	void *mem;
				802	unsigned long addr;
				803
				804	size = PAGE_ALIGN(size);
				805	mem = vmalloc(size);
				806	if (mem) {
				807	//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
				808	memset(mem, 0, size);
				809	addr = (unsigned long)mem;
				810	while (size > 0) {
				811	pfm_reserve_page(addr);
				812	addr+=PAGE_SIZE;
				813	size-=PAGE_SIZE;
				814	}
				815	}
				816	return mem;
				817	}
				818
				819	static void
				820	pfm_rvfree(void *mem, unsigned long size)
				821	{
				822	unsigned long addr;
				823
				824	if (mem) {
				825	DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
				826	addr = (unsigned long) mem;
				827	while ((long) size > 0) {
				828	pfm_unreserve_page(addr);
				829	addr+=PAGE_SIZE;
				830	size-=PAGE_SIZE;
				831	}
				832	vfree(mem);
				833	}
				834	return;
				835	}
				836
				837	static pfm_context_t *
				838	pfm_context_alloc(void)
				839	{
				840	pfm_context_t *ctx;
				841
				842	/*
				843	* allocate context descriptor
				844	* must be able to free with interrupts disabled
				845	*/
				846	ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
				847	if (ctx) {
				848	memset(ctx, 0, sizeof(pfm_context_t));
				849	DPRINT(("alloc ctx @%p\n", ctx));
				850	}
				851	return ctx;
				852	}
				853
				854	static void
				855	pfm_context_free(pfm_context_t *ctx)
				856	{
				857	if (ctx) {
				858	DPRINT(("free ctx @%p\n", ctx));
				859	kfree(ctx);
				860	}
				861	}
				862
				863	static void
				864	pfm_mask_monitoring(struct task_struct *task)
				865	{
				866	pfm_context_t *ctx = PFM_GET_CTX(task);
				867	struct thread_struct *th = &task->thread;
				868	unsigned long mask, val, ovfl_mask;
				869	int i;
				870
				871	DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid));
				872
				873	ovfl_mask = pmu_conf->ovfl_val;
				874	/*
				875	* monitoring can only be masked as a result of a valid
				876	* counter overflow. In UP, it means that the PMU still
				877	* has an owner. Note that the owner can be different
				878	* from the current task. However the PMU state belongs
				879	* to the owner.
				880	* In SMP, a valid overflow only happens when task is
				881	* current. Therefore if we come here, we know that
				882	* the PMU state belongs to the current task, therefore
				883	* we can access the live registers.
				884	*
				885	* So in both cases, the live register contains the owner's
				886	* state. We can ONLY touch the PMU registers and NOT the PSR.
				887	*
				888	* As a consequence to this call, the thread->pmds[] array
				889	* contains stale information which must be ignored
				890	* when context is reloaded AND monitoring is active (see
				891	* pfm_restart).
				892	*/
				893	mask = ctx->ctx_used_pmds[0];
				894	for (i = 0; mask; i++, mask>>=1) {
				895	/* skip non used pmds */
				896	if ((mask & 0x1) == 0) continue;
				897	val = ia64_get_pmd(i);
				898
				899	if (PMD_IS_COUNTING(i)) {
				900	/*
				901	* we rebuild the full 64 bit value of the counter
				902	*/
				903	ctx->ctx_pmds[i].val += (val & ovfl_mask);
				904	} else {
				905	ctx->ctx_pmds[i].val = val;
				906	}
				907	DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
				908	i,
				909	ctx->ctx_pmds[i].val,
				910	val & ovfl_mask));
				911	}
				912	/*
				913	* mask monitoring by setting the privilege level to 0
				914	* we cannot use psr.pp/psr.up for this, it is controlled by
				915	* the user
				916	*
				917	* if task is current, modify actual registers, otherwise modify
				918	* thread save state, i.e., what will be restored in pfm_load_regs()
				919	*/
				920	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
				921	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
				922	if ((mask & 0x1) == 0UL) continue;
				923	ia64_set_pmc(i, th->pmcs[i] & ~0xfUL);
				924	th->pmcs[i] &= ~0xfUL;
				925	DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i]));
				926	}
				927	/*
				928	* make all of this visible
				929	*/
				930	ia64_srlz_d();
				931	}
				932
				933	/*
				934	* must always be done with task == current
				935	*
				936	* context must be in MASKED state when calling
				937	*/
				938	static void
				939	pfm_restore_monitoring(struct task_struct *task)
				940	{
				941	pfm_context_t *ctx = PFM_GET_CTX(task);
				942	struct thread_struct *th = &task->thread;
				943	unsigned long mask, ovfl_mask;
				944	unsigned long psr, val;
				945	int i, is_system;
				946
				947	is_system = ctx->ctx_fl_system;
				948	ovfl_mask = pmu_conf->ovfl_val;
				949
				950	if (task != current) {
				951	printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid);
				952	return;
				953	}
				954	if (ctx->ctx_state != PFM_CTX_MASKED) {
				955	printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__,
				956	task->pid, current->pid, ctx->ctx_state);
				957	return;
				958	}
				959	psr = pfm_get_psr();
				960	/*
				961	* monitoring is masked via the PMC.
				962	* As we restore their value, we do not want each counter to
				963	* restart right away. We stop monitoring using the PSR,
				964	* restore the PMC (and PMD) and then re-establish the psr
				965	* as it was. Note that there can be no pending overflow at
				966	* this point, because monitoring was MASKED.
				967	*
				968	* system-wide session are pinned and self-monitoring
				969	*/
				970	if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
				971	/* disable dcr pp */
				972	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
				973	pfm_clear_psr_pp();
				974	} else {
				975	pfm_clear_psr_up();
				976	}
				977	/*
				978	* first, we restore the PMD
				979	*/
				980	mask = ctx->ctx_used_pmds[0];
				981	for (i = 0; mask; i++, mask>>=1) {
				982	/* skip non used pmds */
				983	if ((mask & 0x1) == 0) continue;
				984
				985	if (PMD_IS_COUNTING(i)) {
				986	/*
				987	* we split the 64bit value according to
				988	* counter width
				989	*/
				990	val = ctx->ctx_pmds[i].val & ovfl_mask;
				991	ctx->ctx_pmds[i].val &= ~ovfl_mask;
				992	} else {
				993	val = ctx->ctx_pmds[i].val;
				994	}
				995	ia64_set_pmd(i, val);
				996
				997	DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
				998	i,
				999	ctx->ctx_pmds[i].val,
				1000	val));
				1001	}
				1002	/*
				1003	* restore the PMCs
				1004	*/
				1005	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
				1006	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
				1007	if ((mask & 0x1) == 0UL) continue;
				1008	th->pmcs[i] = ctx->ctx_pmcs[i];
				1009	ia64_set_pmc(i, th->pmcs[i]);
				1010	DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i]));
				1011	}
				1012	ia64_srlz_d();
				1013
				1014	/*
				1015	* must restore DBR/IBR because could be modified while masked
				1016	* XXX: need to optimize
				1017	*/
				1018	if (ctx->ctx_fl_using_dbreg) {
				1019	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				1020	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				1021	}
				1022
				1023	/*
				1024	* now restore PSR
				1025	*/
				1026	if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
				1027	/* enable dcr pp */
				1028	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) \| IA64_DCR_PP);
				1029	ia64_srlz_i();
				1030	}
				1031	pfm_set_psr_l(psr);
				1032	}
				1033
				1034	static inline void
				1035	pfm_save_pmds(unsigned long *pmds, unsigned long mask)
				1036	{
				1037	int i;
				1038
				1039	ia64_srlz_d();
				1040
				1041	for (i=0; mask; i++, mask>>=1) {
				1042	if (mask & 0x1) pmds[i] = ia64_get_pmd(i);
				1043	}
				1044	}
				1045
				1046	/*
				1047	* reload from thread state (used for ctxw only)
				1048	*/
				1049	static inline void
				1050	pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
				1051	{
				1052	int i;
				1053	unsigned long val, ovfl_val = pmu_conf->ovfl_val;
				1054
				1055	for (i=0; mask; i++, mask>>=1) {
				1056	if ((mask & 0x1) == 0) continue;
				1057	val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i];
				1058	ia64_set_pmd(i, val);
				1059	}
				1060	ia64_srlz_d();
				1061	}
				1062
				1063	/*
				1064	* propagate PMD from context to thread-state
				1065	*/
				1066	static inline void
				1067	pfm_copy_pmds(struct task_struct task, pfm_context_t ctx)
				1068	{
				1069	struct thread_struct *thread = &task->thread;
				1070	unsigned long ovfl_val = pmu_conf->ovfl_val;
				1071	unsigned long mask = ctx->ctx_all_pmds[0];
				1072	unsigned long val;
				1073	int i;
				1074
				1075	DPRINT(("mask=0x%lx\n", mask));
				1076
				1077	for (i=0; mask; i++, mask>>=1) {
				1078
				1079	val = ctx->ctx_pmds[i].val;
				1080
				1081	/*
				1082	* We break up the 64 bit value into 2 pieces
				1083	* the lower bits go to the machine state in the
				1084	* thread (will be reloaded on ctxsw in).
				1085	* The upper part stays in the soft-counter.
				1086	*/
				1087	if (PMD_IS_COUNTING(i)) {
				1088	ctx->ctx_pmds[i].val = val & ~ovfl_val;
				1089	val &= ovfl_val;
				1090	}
				1091	thread->pmds[i] = val;
				1092
				1093	DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n",
				1094	i,
				1095	thread->pmds[i],
				1096	ctx->ctx_pmds[i].val));
				1097	}
				1098	}
				1099
				1100	/*
				1101	* propagate PMC from context to thread-state
				1102	*/
				1103	static inline void
				1104	pfm_copy_pmcs(struct task_struct task, pfm_context_t ctx)
				1105	{
				1106	struct thread_struct *thread = &task->thread;
				1107	unsigned long mask = ctx->ctx_all_pmcs[0];
				1108	int i;
				1109
				1110	DPRINT(("mask=0x%lx\n", mask));
				1111
				1112	for (i=0; mask; i++, mask>>=1) {
				1113	/* masking 0 with ovfl_val yields 0 */
				1114	thread->pmcs[i] = ctx->ctx_pmcs[i];
				1115	DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i]));
				1116	}
				1117	}
				1118
				1119
				1120
				1121	static inline void
				1122	pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask)
				1123	{
				1124	int i;
				1125
				1126	for (i=0; mask; i++, mask>>=1) {
				1127	if ((mask & 0x1) == 0) continue;
				1128	ia64_set_pmc(i, pmcs[i]);
				1129	}
				1130	ia64_srlz_d();
				1131	}
				1132
				1133	static inline int
				1134	pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b)
				1135	{
				1136	return memcmp(a, b, sizeof(pfm_uuid_t));
				1137	}
				1138
				1139	static inline int
				1140	pfm_buf_fmt_exit(pfm_buffer_fmt_t fmt, struct task_struct task, void buf, struct pt_regs regs)
				1141	{
				1142	int ret = 0;
				1143	if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs);
				1144	return ret;
				1145	}
				1146
				1147	static inline int
				1148	pfm_buf_fmt_getsize(pfm_buffer_fmt_t fmt, struct task_struct task, unsigned int flags, int cpu, void arg, unsigned long size)
				1149	{
				1150	int ret = 0;
				1151	if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size);
				1152	return ret;
				1153	}
				1154
				1155
				1156	static inline int
				1157	pfm_buf_fmt_validate(pfm_buffer_fmt_t fmt, struct task_struct task, unsigned int flags,
				1158	int cpu, void *arg)
				1159	{
				1160	int ret = 0;
				1161	if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg);
				1162	return ret;
				1163	}
				1164
				1165	static inline int
				1166	pfm_buf_fmt_init(pfm_buffer_fmt_t fmt, struct task_struct task, void *buf, unsigned int flags,
				1167	int cpu, void *arg)
				1168	{
				1169	int ret = 0;
				1170	if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg);
				1171	return ret;
				1172	}
				1173
				1174	static inline int
				1175	pfm_buf_fmt_restart(pfm_buffer_fmt_t fmt, struct task_struct task, pfm_ovfl_ctrl_t ctrl, void buf, struct pt_regs *regs)
				1176	{
				1177	int ret = 0;
				1178	if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs);
				1179	return ret;
				1180	}
				1181
				1182	static inline int
				1183	pfm_buf_fmt_restart_active(pfm_buffer_fmt_t fmt, struct task_struct task, pfm_ovfl_ctrl_t ctrl, void buf, struct pt_regs *regs)
				1184	{
				1185	int ret = 0;
				1186	if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs);
				1187	return ret;
				1188	}
				1189
				1190	static pfm_buffer_fmt_t *
				1191	__pfm_find_buffer_fmt(pfm_uuid_t uuid)
				1192	{
				1193	struct list_head * pos;
				1194	pfm_buffer_fmt_t * entry;
				1195
				1196	list_for_each(pos, &pfm_buffer_fmt_list) {
				1197	entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
				1198	if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0)
				1199	return entry;
				1200	}
				1201	return NULL;
				1202	}
				1203
				1204	/*
				1205	* find a buffer format based on its uuid
				1206	*/
				1207	static pfm_buffer_fmt_t *
				1208	pfm_find_buffer_fmt(pfm_uuid_t uuid)
				1209	{
				1210	pfm_buffer_fmt_t * fmt;
				1211	spin_lock(&pfm_buffer_fmt_lock);
				1212	fmt = __pfm_find_buffer_fmt(uuid);
				1213	spin_unlock(&pfm_buffer_fmt_lock);
				1214	return fmt;
				1215	}
				1216
				1217	int
				1218	pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt)
				1219	{
				1220	int ret = 0;
				1221
				1222	/* some sanity checks */
				1223	if (fmt == NULL \|\| fmt->fmt_name == NULL) return -EINVAL;
				1224
				1225	/* we need at least a handler */
				1226	if (fmt->fmt_handler == NULL) return -EINVAL;
				1227
				1228	/*
				1229	* XXX: need check validity of fmt_arg_size
				1230	*/
				1231
				1232	spin_lock(&pfm_buffer_fmt_lock);
				1233
				1234	if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) {
				1235	printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name);
				1236	ret = -EBUSY;
				1237	goto out;
				1238	}
				1239	list_add(&fmt->fmt_list, &pfm_buffer_fmt_list);
				1240	printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name);
				1241
				1242	out:
				1243	spin_unlock(&pfm_buffer_fmt_lock);
				1244	return ret;
				1245	}
				1246	EXPORT_SYMBOL(pfm_register_buffer_fmt);
				1247
				1248	int
				1249	pfm_unregister_buffer_fmt(pfm_uuid_t uuid)
				1250	{
				1251	pfm_buffer_fmt_t *fmt;
				1252	int ret = 0;
				1253
				1254	spin_lock(&pfm_buffer_fmt_lock);
				1255
				1256	fmt = __pfm_find_buffer_fmt(uuid);
				1257	if (!fmt) {
				1258	printk(KERN_ERR "perfmon: cannot unregister format, not found\n");
				1259	ret = -EINVAL;
				1260	goto out;
				1261	}
				1262	list_del_init(&fmt->fmt_list);
				1263	printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name);
				1264
				1265	out:
				1266	spin_unlock(&pfm_buffer_fmt_lock);
				1267	return ret;
				1268
				1269	}
				1270	EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
				1271
Stephane Eranian	8df5a50	2005-04-11 13:45:00 -0700	[diff] [blame]	1272	extern void update_pal_halt_status(int);
				1273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1274	static int
				1275	pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
				1276	{
				1277	unsigned long flags;
				1278	/*
				1279	* validy checks on cpu_mask have been done upstream
				1280	*/
				1281	LOCK_PFS(flags);
				1282
				1283	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1284	pfm_sessions.pfs_sys_sessions,
				1285	pfm_sessions.pfs_task_sessions,
				1286	pfm_sessions.pfs_sys_use_dbregs,
				1287	is_syswide,
				1288	cpu));
				1289
				1290	if (is_syswide) {
				1291	/*
				1292	* cannot mix system wide and per-task sessions
				1293	*/
				1294	if (pfm_sessions.pfs_task_sessions > 0UL) {
				1295	DPRINT(("system wide not possible, %u conflicting task_sessions\n",
				1296	pfm_sessions.pfs_task_sessions));
				1297	goto abort;
				1298	}
				1299
				1300	if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict;
				1301
				1302	DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id()));
				1303
				1304	pfm_sessions.pfs_sys_session[cpu] = task;
				1305
				1306	pfm_sessions.pfs_sys_sessions++ ;
				1307
				1308	} else {
				1309	if (pfm_sessions.pfs_sys_sessions) goto abort;
				1310	pfm_sessions.pfs_task_sessions++;
				1311	}
				1312
				1313	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1314	pfm_sessions.pfs_sys_sessions,
				1315	pfm_sessions.pfs_task_sessions,
				1316	pfm_sessions.pfs_sys_use_dbregs,
				1317	is_syswide,
				1318	cpu));
				1319
Stephane Eranian	8df5a50	2005-04-11 13:45:00 -0700	[diff] [blame]	1320	/*
				1321	* disable default_idle() to go to PAL_HALT
				1322	*/
				1323	update_pal_halt_status(0);
				1324
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1325	UNLOCK_PFS(flags);
				1326
				1327	return 0;
				1328
				1329	error_conflict:
				1330	DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
				1331	pfm_sessions.pfs_sys_session[cpu]->pid,
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	1332	cpu));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1333	abort:
				1334	UNLOCK_PFS(flags);
				1335
				1336	return -EBUSY;
				1337
				1338	}
				1339
				1340	static int
				1341	pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
				1342	{
				1343	unsigned long flags;
				1344	/*
				1345	* validy checks on cpu_mask have been done upstream
				1346	*/
				1347	LOCK_PFS(flags);
				1348
				1349	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1350	pfm_sessions.pfs_sys_sessions,
				1351	pfm_sessions.pfs_task_sessions,
				1352	pfm_sessions.pfs_sys_use_dbregs,
				1353	is_syswide,
				1354	cpu));
				1355
				1356
				1357	if (is_syswide) {
				1358	pfm_sessions.pfs_sys_session[cpu] = NULL;
				1359	/*
				1360	* would not work with perfmon+more than one bit in cpu_mask
				1361	*/
				1362	if (ctx && ctx->ctx_fl_using_dbreg) {
				1363	if (pfm_sessions.pfs_sys_use_dbregs == 0) {
				1364	printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx);
				1365	} else {
				1366	pfm_sessions.pfs_sys_use_dbregs--;
				1367	}
				1368	}
				1369	pfm_sessions.pfs_sys_sessions--;
				1370	} else {
				1371	pfm_sessions.pfs_task_sessions--;
				1372	}
				1373	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1374	pfm_sessions.pfs_sys_sessions,
				1375	pfm_sessions.pfs_task_sessions,
				1376	pfm_sessions.pfs_sys_use_dbregs,
				1377	is_syswide,
				1378	cpu));
				1379
Stephane Eranian	8df5a50	2005-04-11 13:45:00 -0700	[diff] [blame]	1380	/*
				1381	* if possible, enable default_idle() to go into PAL_HALT
				1382	*/
				1383	if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0)
				1384	update_pal_halt_status(1);
				1385
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1386	UNLOCK_PFS(flags);
				1387
				1388	return 0;
				1389	}
				1390
				1391	/*
				1392	* removes virtual mapping of the sampling buffer.
				1393	* IMPORTANT: cannot be called with interrupts disable, e.g. inside
				1394	* a PROTECT_CTX() section.
				1395	*/
				1396	static int
				1397	pfm_remove_smpl_mapping(struct task_struct task, void vaddr, unsigned long size)
				1398	{
				1399	int r;
				1400
				1401	/* sanity checks */
				1402	if (task->mm == NULL \|\| size == 0UL \|\| vaddr == NULL) {
				1403	printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm);
				1404	return -EINVAL;
				1405	}
				1406
				1407	DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size));
				1408
				1409	/*
				1410	* does the actual unmapping
				1411	*/
				1412	down_write(&task->mm->mmap_sem);
				1413
				1414	DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size));
				1415
				1416	r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0);
				1417
				1418	up_write(&task->mm->mmap_sem);
				1419	if (r !=0) {
				1420	printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size);
				1421	}
				1422
				1423	DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r));
				1424
				1425	return 0;
				1426	}
				1427
				1428	/*
				1429	* free actual physical storage used by sampling buffer
				1430	*/
				1431	#if 0
				1432	static int
				1433	pfm_free_smpl_buffer(pfm_context_t *ctx)
				1434	{
				1435	pfm_buffer_fmt_t *fmt;
				1436
				1437	if (ctx->ctx_smpl_hdr == NULL) goto invalid_free;
				1438
				1439	/*
				1440	* we won't use the buffer format anymore
				1441	*/
				1442	fmt = ctx->ctx_buf_fmt;
				1443
				1444	DPRINT(("sampling buffer @%p size %lu vaddr=%p\n",
				1445	ctx->ctx_smpl_hdr,
				1446	ctx->ctx_smpl_size,
				1447	ctx->ctx_smpl_vaddr));
				1448
				1449	pfm_buf_fmt_exit(fmt, current, NULL, NULL);
				1450
				1451	/*
				1452	* free the buffer
				1453	*/
				1454	pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
				1455
				1456	ctx->ctx_smpl_hdr = NULL;
				1457	ctx->ctx_smpl_size = 0UL;
				1458
				1459	return 0;
				1460
				1461	invalid_free:
				1462	printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid);
				1463	return -EINVAL;
				1464	}
				1465	#endif
				1466
				1467	static inline void
				1468	pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
				1469	{
				1470	if (fmt == NULL) return;
				1471
				1472	pfm_buf_fmt_exit(fmt, current, NULL, NULL);
				1473
				1474	}
				1475
				1476	/*
				1477	* pfmfs should _never_ be mounted by userland - too much of security hassle,
				1478	* no real gain from having the whole whorehouse mounted. So we don't need
				1479	* any operations on the root directory. However, we need a non-trivial
				1480	* d_name - pfm: will go nicely and kill the special-casing in procfs.
				1481	*/
				1482	static struct vfsmount *pfmfs_mnt;
				1483
				1484	static int __init
				1485	init_pfm_fs(void)
				1486	{
				1487	int err = register_filesystem(&pfm_fs_type);
				1488	if (!err) {
				1489	pfmfs_mnt = kern_mount(&pfm_fs_type);
				1490	err = PTR_ERR(pfmfs_mnt);
				1491	if (IS_ERR(pfmfs_mnt))
				1492	unregister_filesystem(&pfm_fs_type);
				1493	else
				1494	err = 0;
				1495	}
				1496	return err;
				1497	}
				1498
				1499	static void __exit
				1500	exit_pfm_fs(void)
				1501	{
				1502	unregister_filesystem(&pfm_fs_type);
				1503	mntput(pfmfs_mnt);
				1504	}
				1505
				1506	static ssize_t
				1507	pfm_read(struct file filp, char __user buf, size_t size, loff_t *ppos)
				1508	{
				1509	pfm_context_t *ctx;
				1510	pfm_msg_t *msg;
				1511	ssize_t ret;
				1512	unsigned long flags;
				1513	DECLARE_WAITQUEUE(wait, current);
				1514	if (PFM_IS_FILE(filp) == 0) {
				1515	printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
				1516	return -EINVAL;
				1517	}
				1518
				1519	ctx = (pfm_context_t *)filp->private_data;
				1520	if (ctx == NULL) {
				1521	printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid);
				1522	return -EINVAL;
				1523	}
				1524
				1525	/*
				1526	* check even when there is no message
				1527	*/
				1528	if (size < sizeof(pfm_msg_t)) {
				1529	DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
				1530	return -EINVAL;
				1531	}
				1532
				1533	PROTECT_CTX(ctx, flags);
				1534
				1535	/*
				1536	* put ourselves on the wait queue
				1537	*/
				1538	add_wait_queue(&ctx->ctx_msgq_wait, &wait);
				1539
				1540
				1541	for(;;) {
				1542	/*
				1543	* check wait queue
				1544	*/
				1545
				1546	set_current_state(TASK_INTERRUPTIBLE);
				1547
				1548	DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
				1549
				1550	ret = 0;
				1551	if(PFM_CTXQ_EMPTY(ctx) == 0) break;
				1552
				1553	UNPROTECT_CTX(ctx, flags);
				1554
				1555	/*
				1556	* check non-blocking read
				1557	*/
				1558	ret = -EAGAIN;
				1559	if(filp->f_flags & O_NONBLOCK) break;
				1560
				1561	/*
				1562	* check pending signals
				1563	*/
				1564	if(signal_pending(current)) {
				1565	ret = -EINTR;
				1566	break;
				1567	}
				1568	/*
				1569	* no message, so wait
				1570	*/
				1571	schedule();
				1572
				1573	PROTECT_CTX(ctx, flags);
				1574	}
				1575	DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret));
				1576	set_current_state(TASK_RUNNING);
				1577	remove_wait_queue(&ctx->ctx_msgq_wait, &wait);
				1578
				1579	if (ret < 0) goto abort;
				1580
				1581	ret = -EINVAL;
				1582	msg = pfm_get_next_msg(ctx);
				1583	if (msg == NULL) {
				1584	printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid);
				1585	goto abort_locked;
				1586	}
				1587
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	1588	DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1589
				1590	ret = -EFAULT;
				1591	if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t);
				1592
				1593	abort_locked:
				1594	UNPROTECT_CTX(ctx, flags);
				1595	abort:
				1596	return ret;
				1597	}
				1598
				1599	static ssize_t
				1600	pfm_write(struct file file, const char __user ubuf,
				1601	size_t size, loff_t *ppos)
				1602	{
				1603	DPRINT(("pfm_write called\n"));
				1604	return -EINVAL;
				1605	}
				1606
				1607	static unsigned int
				1608	pfm_poll(struct file filp, poll_table wait)
				1609	{
				1610	pfm_context_t *ctx;
				1611	unsigned long flags;
				1612	unsigned int mask = 0;
				1613
				1614	if (PFM_IS_FILE(filp) == 0) {
				1615	printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
				1616	return 0;
				1617	}
				1618
				1619	ctx = (pfm_context_t *)filp->private_data;
				1620	if (ctx == NULL) {
				1621	printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid);
				1622	return 0;
				1623	}
				1624
				1625
				1626	DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd));
				1627
				1628	poll_wait(filp, &ctx->ctx_msgq_wait, wait);
				1629
				1630	PROTECT_CTX(ctx, flags);
				1631
				1632	if (PFM_CTXQ_EMPTY(ctx) == 0)
				1633	mask = POLLIN \| POLLRDNORM;
				1634
				1635	UNPROTECT_CTX(ctx, flags);
				1636
				1637	DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask));
				1638
				1639	return mask;
				1640	}
				1641
				1642	static int
				1643	pfm_ioctl(struct inode inode, struct file file, unsigned int cmd, unsigned long arg)
				1644	{
				1645	DPRINT(("pfm_ioctl called\n"));
				1646	return -EINVAL;
				1647	}
				1648
				1649	/*
				1650	* interrupt cannot be masked when coming here
				1651	*/
				1652	static inline int
				1653	pfm_do_fasync(int fd, struct file filp, pfm_context_t ctx, int on)
				1654	{
				1655	int ret;
				1656
				1657	ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);
				1658
				1659	DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
				1660	current->pid,
				1661	fd,
				1662	on,
				1663	ctx->ctx_async_queue, ret));
				1664
				1665	return ret;
				1666	}
				1667
				1668	static int
				1669	pfm_fasync(int fd, struct file *filp, int on)
				1670	{
				1671	pfm_context_t *ctx;
				1672	int ret;
				1673
				1674	if (PFM_IS_FILE(filp) == 0) {
				1675	printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid);
				1676	return -EBADF;
				1677	}
				1678
				1679	ctx = (pfm_context_t *)filp->private_data;
				1680	if (ctx == NULL) {
				1681	printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
				1682	return -EBADF;
				1683	}
				1684	/*
				1685	* we cannot mask interrupts during this call because this may
				1686	* may go to sleep if memory is not readily avalaible.
				1687	*
				1688	* We are protected from the conetxt disappearing by the get_fd()/put_fd()
				1689	* done in caller. Serialization of this function is ensured by caller.
				1690	*/
				1691	ret = pfm_do_fasync(fd, filp, ctx, on);
				1692
				1693
				1694	DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
				1695	fd,
				1696	on,
				1697	ctx->ctx_async_queue, ret));
				1698
				1699	return ret;
				1700	}
				1701
				1702	#ifdef CONFIG_SMP
				1703	/*
				1704	* this function is exclusively called from pfm_close().
				1705	* The context is not protected at that time, nor are interrupts
				1706	* on the remote CPU. That's necessary to avoid deadlocks.
				1707	*/
				1708	static void
				1709	pfm_syswide_force_stop(void *info)
				1710	{
				1711	pfm_context_t ctx = (pfm_context_t )info;
				1712	struct pt_regs *regs = ia64_task_regs(current);
				1713	struct task_struct *owner;
				1714	unsigned long flags;
				1715	int ret;
				1716
				1717	if (ctx->ctx_cpu != smp_processor_id()) {
				1718	printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n",
				1719	ctx->ctx_cpu,
				1720	smp_processor_id());
				1721	return;
				1722	}
				1723	owner = GET_PMU_OWNER();
				1724	if (owner != ctx->ctx_task) {
				1725	printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
				1726	smp_processor_id(),
				1727	owner->pid, ctx->ctx_task->pid);
				1728	return;
				1729	}
				1730	if (GET_PMU_CTX() != ctx) {
				1731	printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n",
				1732	smp_processor_id(),
				1733	GET_PMU_CTX(), ctx);
				1734	return;
				1735	}
				1736
				1737	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid));
				1738	/*
				1739	* the context is already protected in pfm_close(), we simply
				1740	* need to mask interrupts to avoid a PMU interrupt race on
				1741	* this CPU
				1742	*/
				1743	local_irq_save(flags);
				1744
				1745	ret = pfm_context_unload(ctx, NULL, 0, regs);
				1746	if (ret) {
				1747	DPRINT(("context_unload returned %d\n", ret));
				1748	}
				1749
				1750	/*
				1751	* unmask interrupts, PMU interrupts are now spurious here
				1752	*/
				1753	local_irq_restore(flags);
				1754	}
				1755
				1756	static void
				1757	pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
				1758	{
				1759	int ret;
				1760
				1761	DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));
				1762	ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);
				1763	DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret));
				1764	}
				1765	#endif /* CONFIG_SMP */
				1766
				1767	/*
				1768	* called for each close(). Partially free resources.
				1769	* When caller is self-monitoring, the context is unloaded.
				1770	*/
				1771	static int
				1772	pfm_flush(struct file *filp)
				1773	{
				1774	pfm_context_t *ctx;
				1775	struct task_struct *task;
				1776	struct pt_regs *regs;
				1777	unsigned long flags;
				1778	unsigned long smpl_buf_size = 0UL;
				1779	void *smpl_buf_vaddr = NULL;
				1780	int state, is_system;
				1781
				1782	if (PFM_IS_FILE(filp) == 0) {
				1783	DPRINT(("bad magic for\n"));
				1784	return -EBADF;
				1785	}
				1786
				1787	ctx = (pfm_context_t *)filp->private_data;
				1788	if (ctx == NULL) {
				1789	printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid);
				1790	return -EBADF;
				1791	}
				1792
				1793	/*
				1794	* remove our file from the async queue, if we use this mode.
				1795	* This can be done without the context being protected. We come
				1796	* here when the context has become unreacheable by other tasks.
				1797	*
				1798	* We may still have active monitoring at this point and we may
				1799	* end up in pfm_overflow_handler(). However, fasync_helper()
				1800	* operates with interrupts disabled and it cleans up the
				1801	* queue. If the PMU handler is called prior to entering
				1802	* fasync_helper() then it will send a signal. If it is
				1803	* invoked after, it will find an empty queue and no
				1804	* signal will be sent. In both case, we are safe
				1805	*/
				1806	if (filp->f_flags & FASYNC) {
				1807	DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
				1808	pfm_do_fasync (-1, filp, ctx, 0);
				1809	}
				1810
				1811	PROTECT_CTX(ctx, flags);
				1812
				1813	state = ctx->ctx_state;
				1814	is_system = ctx->ctx_fl_system;
				1815
				1816	task = PFM_CTX_TASK(ctx);
				1817	regs = ia64_task_regs(task);
				1818
				1819	DPRINT(("ctx_state=%d is_current=%d\n",
				1820	state,
				1821	task == current ? 1 : 0));
				1822
				1823	/*
				1824	* if state == UNLOADED, then task is NULL
				1825	*/
				1826
				1827	/*
				1828	* we must stop and unload because we are losing access to the context.
				1829	*/
				1830	if (task == current) {
				1831	#ifdef CONFIG_SMP
				1832	/*
				1833	* the task IS the owner but it migrated to another CPU: that's bad
				1834	* but we must handle this cleanly. Unfortunately, the kernel does
				1835	* not provide a mechanism to block migration (while the context is loaded).
				1836	*
				1837	* We need to release the resource on the ORIGINAL cpu.
				1838	*/
				1839	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				1840
				1841	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				1842	/*
				1843	* keep context protected but unmask interrupt for IPI
				1844	*/
				1845	local_irq_restore(flags);
				1846
				1847	pfm_syswide_cleanup_other_cpu(ctx);
				1848
				1849	/*
				1850	* restore interrupt masking
				1851	*/
				1852	local_irq_save(flags);
				1853
				1854	/*
				1855	* context is unloaded at this point
				1856	*/
				1857	} else
				1858	#endif /* CONFIG_SMP */
				1859	{
				1860
				1861	DPRINT(("forcing unload\n"));
				1862	/*
				1863	* stop and unload, returning with state UNLOADED
				1864	* and session unreserved.
				1865	*/
				1866	pfm_context_unload(ctx, NULL, 0, regs);
				1867
				1868	DPRINT(("ctx_state=%d\n", ctx->ctx_state));
				1869	}
				1870	}
				1871
				1872	/*
				1873	* remove virtual mapping, if any, for the calling task.
				1874	* cannot reset ctx field until last user is calling close().
				1875	*
				1876	* ctx_smpl_vaddr must never be cleared because it is needed
				1877	* by every task with access to the context
				1878	*
				1879	* When called from do_exit(), the mm context is gone already, therefore
				1880	* mm is NULL, i.e., the VMA is already gone and we do not have to
				1881	* do anything here
				1882	*/
				1883	if (ctx->ctx_smpl_vaddr && current->mm) {
				1884	smpl_buf_vaddr = ctx->ctx_smpl_vaddr;
				1885	smpl_buf_size = ctx->ctx_smpl_size;
				1886	}
				1887
				1888	UNPROTECT_CTX(ctx, flags);
				1889
				1890	/*
				1891	* if there was a mapping, then we systematically remove it
				1892	* at this point. Cannot be done inside critical section
				1893	* because some VM function reenables interrupts.
				1894	*
				1895	*/
				1896	if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size);
				1897
				1898	return 0;
				1899	}
				1900	/*
				1901	* called either on explicit close() or from exit_files().
				1902	* Only the LAST user of the file gets to this point, i.e., it is
				1903	* called only ONCE.
				1904	*
				1905	* IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
				1906	* (fput()),i.e, last task to access the file. Nobody else can access the
				1907	* file at this point.
				1908	*
				1909	* When called from exit_files(), the VMA has been freed because exit_mm()
				1910	* is executed before exit_files().
				1911	*
				1912	* When called from exit_files(), the current task is not yet ZOMBIE but we
				1913	* flush the PMU state to the context.
				1914	*/
				1915	static int
				1916	pfm_close(struct inode inode, struct file filp)
				1917	{
				1918	pfm_context_t *ctx;
				1919	struct task_struct *task;
				1920	struct pt_regs *regs;
				1921	DECLARE_WAITQUEUE(wait, current);
				1922	unsigned long flags;
				1923	unsigned long smpl_buf_size = 0UL;
				1924	void *smpl_buf_addr = NULL;
				1925	int free_possible = 1;
				1926	int state, is_system;
				1927
				1928	DPRINT(("pfm_close called private=%p\n", filp->private_data));
				1929
				1930	if (PFM_IS_FILE(filp) == 0) {
				1931	DPRINT(("bad magic\n"));
				1932	return -EBADF;
				1933	}
				1934
				1935	ctx = (pfm_context_t *)filp->private_data;
				1936	if (ctx == NULL) {
				1937	printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid);
				1938	return -EBADF;
				1939	}
				1940
				1941	PROTECT_CTX(ctx, flags);
				1942
				1943	state = ctx->ctx_state;
				1944	is_system = ctx->ctx_fl_system;
				1945
				1946	task = PFM_CTX_TASK(ctx);
				1947	regs = ia64_task_regs(task);
				1948
				1949	DPRINT(("ctx_state=%d is_current=%d\n",
				1950	state,
				1951	task == current ? 1 : 0));
				1952
				1953	/*
				1954	* if task == current, then pfm_flush() unloaded the context
				1955	*/
				1956	if (state == PFM_CTX_UNLOADED) goto doit;
				1957
				1958	/*
				1959	* context is loaded/masked and task != current, we need to
				1960	* either force an unload or go zombie
				1961	*/
				1962
				1963	/*
				1964	* The task is currently blocked or will block after an overflow.
				1965	* we must force it to wakeup to get out of the
				1966	* MASKED state and transition to the unloaded state by itself.
				1967	*
				1968	* This situation is only possible for per-task mode
				1969	*/
				1970	if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) {
				1971
				1972	/*
				1973	* set a "partial" zombie state to be checked
				1974	* upon return from down() in pfm_handle_work().
				1975	*
				1976	* We cannot use the ZOMBIE state, because it is checked
				1977	* by pfm_load_regs() which is called upon wakeup from down().
				1978	* In such case, it would free the context and then we would
				1979	* return to pfm_handle_work() which would access the
				1980	* stale context. Instead, we set a flag invisible to pfm_load_regs()
				1981	* but visible to pfm_handle_work().
				1982	*
				1983	* For some window of time, we have a zombie context with
				1984	* ctx_state = MASKED and not ZOMBIE
				1985	*/
				1986	ctx->ctx_fl_going_zombie = 1;
				1987
				1988	/*
				1989	* force task to wake up from MASKED state
				1990	*/
				1991	up(&ctx->ctx_restart_sem);
				1992
				1993	DPRINT(("waking up ctx_state=%d\n", state));
				1994
				1995	/*
				1996	* put ourself to sleep waiting for the other
				1997	* task to report completion
				1998	*
				1999	* the context is protected by mutex, therefore there
				2000	* is no risk of being notified of completion before
				2001	* begin actually on the waitq.
				2002	*/
				2003	set_current_state(TASK_INTERRUPTIBLE);
				2004	add_wait_queue(&ctx->ctx_zombieq, &wait);
				2005
				2006	UNPROTECT_CTX(ctx, flags);
				2007
				2008	/*
				2009	* XXX: check for signals :
				2010	* - ok for explicit close
				2011	* - not ok when coming from exit_files()
				2012	*/
				2013	schedule();
				2014
				2015
				2016	PROTECT_CTX(ctx, flags);
				2017
				2018
				2019	remove_wait_queue(&ctx->ctx_zombieq, &wait);
				2020	set_current_state(TASK_RUNNING);
				2021
				2022	/*
				2023	* context is unloaded at this point
				2024	*/
				2025	DPRINT(("after zombie wakeup ctx_state=%d for\n", state));
				2026	}
				2027	else if (task != current) {
				2028	#ifdef CONFIG_SMP
				2029	/*
				2030	* switch context to zombie state
				2031	*/
				2032	ctx->ctx_state = PFM_CTX_ZOMBIE;
				2033
				2034	DPRINT(("zombie ctx for [%d]\n", task->pid));
				2035	/*
				2036	* cannot free the context on the spot. deferred until
				2037	* the task notices the ZOMBIE state
				2038	*/
				2039	free_possible = 0;
				2040	#else
				2041	pfm_context_unload(ctx, NULL, 0, regs);
				2042	#endif
				2043	}
				2044
				2045	doit:
				2046	/* reload state, may have changed during opening of critical section */
				2047	state = ctx->ctx_state;
				2048
				2049	/*
				2050	* the context is still attached to a task (possibly current)
				2051	* we cannot destroy it right now
				2052	*/
				2053
				2054	/*
				2055	* we must free the sampling buffer right here because
				2056	* we cannot rely on it being cleaned up later by the
				2057	* monitored task. It is not possible to free vmalloc'ed
				2058	* memory in pfm_load_regs(). Instead, we remove the buffer
				2059	* now. should there be subsequent PMU overflow originally
				2060	* meant for sampling, the will be converted to spurious
				2061	* and that's fine because the monitoring tools is gone anyway.
				2062	*/
				2063	if (ctx->ctx_smpl_hdr) {
				2064	smpl_buf_addr = ctx->ctx_smpl_hdr;
				2065	smpl_buf_size = ctx->ctx_smpl_size;
				2066	/* no more sampling */
				2067	ctx->ctx_smpl_hdr = NULL;
				2068	ctx->ctx_fl_is_sampling = 0;
				2069	}
				2070
				2071	DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n",
				2072	state,
				2073	free_possible,
				2074	smpl_buf_addr,
				2075	smpl_buf_size));
				2076
				2077	if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt);
				2078
				2079	/*
				2080	* UNLOADED that the session has already been unreserved.
				2081	*/
				2082	if (state == PFM_CTX_ZOMBIE) {
				2083	pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu);
				2084	}
				2085
				2086	/*
				2087	* disconnect file descriptor from context must be done
				2088	* before we unlock.
				2089	*/
				2090	filp->private_data = NULL;
				2091
				2092	/*
				2093	* if we free on the spot, the context is now completely unreacheable
				2094	* from the callers side. The monitored task side is also cut, so we
				2095	* can freely cut.
				2096	*
				2097	* If we have a deferred free, only the caller side is disconnected.
				2098	*/
				2099	UNPROTECT_CTX(ctx, flags);
				2100
				2101	/*
				2102	* All memory free operations (especially for vmalloc'ed memory)
				2103	* MUST be done with interrupts ENABLED.
				2104	*/
				2105	if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
				2106
				2107	/*
				2108	* return the memory used by the context
				2109	*/
				2110	if (free_possible) pfm_context_free(ctx);
				2111
				2112	return 0;
				2113	}
				2114
				2115	static int
				2116	pfm_no_open(struct inode irrelevant, struct file dontcare)
				2117	{
				2118	DPRINT(("pfm_no_open called\n"));
				2119	return -ENXIO;
				2120	}
				2121
				2122
				2123
				2124	static struct file_operations pfm_file_ops = {
				2125	.llseek = no_llseek,
				2126	.read = pfm_read,
				2127	.write = pfm_write,
				2128	.poll = pfm_poll,
				2129	.ioctl = pfm_ioctl,
				2130	.open = pfm_no_open, /* special open code to disallow open via /proc */
				2131	.fasync = pfm_fasync,
				2132	.release = pfm_close,
				2133	.flush = pfm_flush
				2134	};
				2135
				2136	static int
				2137	pfmfs_delete_dentry(struct dentry *dentry)
				2138	{
				2139	return 1;
				2140	}
				2141
				2142	static struct dentry_operations pfmfs_dentry_operations = {
				2143	.d_delete = pfmfs_delete_dentry,
				2144	};
				2145
				2146
				2147	static int
				2148	pfm_alloc_fd(struct file **cfile)
				2149	{
				2150	int fd, ret = 0;
				2151	struct file *file = NULL;
				2152	struct inode * inode;
				2153	char name[32];
				2154	struct qstr this;
				2155
				2156	fd = get_unused_fd();
				2157	if (fd < 0) return -ENFILE;
				2158
				2159	ret = -ENFILE;
				2160
				2161	file = get_empty_filp();
				2162	if (!file) goto out;
				2163
				2164	/*
				2165	* allocate a new inode
				2166	*/
				2167	inode = new_inode(pfmfs_mnt->mnt_sb);
				2168	if (!inode) goto out;
				2169
				2170	DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
				2171
				2172	inode->i_mode = S_IFCHR\|S_IRUGO;
				2173	inode->i_uid = current->fsuid;
				2174	inode->i_gid = current->fsgid;
				2175
				2176	sprintf(name, "[%lu]", inode->i_ino);
				2177	this.name = name;
				2178	this.len = strlen(name);
				2179	this.hash = inode->i_ino;
				2180
				2181	ret = -ENOMEM;
				2182
				2183	/*
				2184	* allocate a new dcache entry
				2185	*/
				2186	file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
				2187	if (!file->f_dentry) goto out;
				2188
				2189	file->f_dentry->d_op = &pfmfs_dentry_operations;
				2190
				2191	d_add(file->f_dentry, inode);
				2192	file->f_vfsmnt = mntget(pfmfs_mnt);
				2193	file->f_mapping = inode->i_mapping;
				2194
				2195	file->f_op = &pfm_file_ops;
				2196	file->f_mode = FMODE_READ;
				2197	file->f_flags = O_RDONLY;
				2198	file->f_pos = 0;
				2199
				2200	/*
				2201	* may have to delay until context is attached?
				2202	*/
				2203	fd_install(fd, file);
				2204
				2205	/*
				2206	* the file structure we will use
				2207	*/
				2208	*cfile = file;
				2209
				2210	return fd;
				2211	out:
				2212	if (file) put_filp(file);
				2213	put_unused_fd(fd);
				2214	return ret;
				2215	}
				2216
				2217	static void
				2218	pfm_free_fd(int fd, struct file *file)
				2219	{
				2220	struct files_struct *files = current->files;
Dipankar Sarma	4fb3a53	2005-09-16 19:28:13 -0700	[diff] [blame]	2221	struct fdtable *fdt;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2222
				2223	/*
				2224	* there ie no fd_uninstall(), so we do it here
				2225	*/
				2226	spin_lock(&files->file_lock);
Dipankar Sarma	4fb3a53	2005-09-16 19:28:13 -0700	[diff] [blame]	2227	fdt = files_fdtable(files);
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame]	2228	rcu_assign_pointer(fdt->fd[fd], NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2229	spin_unlock(&files->file_lock);
				2230
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame]	2231	if (file)
				2232	put_filp(file);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2233	put_unused_fd(fd);
				2234	}
				2235
				2236	static int
				2237	pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
				2238	{
				2239	DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
				2240
				2241	while (size > 0) {
				2242	unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;
				2243
				2244
				2245	if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY))
				2246	return -ENOMEM;
				2247
				2248	addr += PAGE_SIZE;
				2249	buf += PAGE_SIZE;
				2250	size -= PAGE_SIZE;
				2251	}
				2252	return 0;
				2253	}
				2254
				2255	/*
				2256	* allocate a sampling buffer and remaps it into the user address space of the task
				2257	*/
				2258	static int
				2259	pfm_smpl_buffer_alloc(struct task_struct task, pfm_context_t ctx, unsigned long rsize, void **user_vaddr)
				2260	{
				2261	struct mm_struct *mm = task->mm;
				2262	struct vm_area_struct *vma = NULL;
				2263	unsigned long size;
				2264	void *smpl_buf;
				2265
				2266
				2267	/*
				2268	* the fixed header + requested size and align to page boundary
				2269	*/
				2270	size = PAGE_ALIGN(rsize);
				2271
				2272	DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size));
				2273
				2274	/*
				2275	* check requested size to avoid Denial-of-service attacks
				2276	* XXX: may have to refine this test
				2277	* Check against address space limit.
				2278	*
				2279	* if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
				2280	* return -ENOMEM;
				2281	*/
				2282	if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
				2283	return -ENOMEM;
				2284
				2285	/*
				2286	* We do the easy to undo allocations first.
				2287	*
				2288	* pfm_rvmalloc(), clears the buffer, so there is no leak
				2289	*/
				2290	smpl_buf = pfm_rvmalloc(size);
				2291	if (smpl_buf == NULL) {
				2292	DPRINT(("Can't allocate sampling buffer\n"));
				2293	return -ENOMEM;
				2294	}
				2295
				2296	DPRINT(("smpl_buf @%p\n", smpl_buf));
				2297
				2298	/* allocate vma */
				2299	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
				2300	if (!vma) {
				2301	DPRINT(("Cannot allocate vma\n"));
				2302	goto error_kmem;
				2303	}
				2304	memset(vma, 0, sizeof(*vma));
				2305
				2306	/*
				2307	* partially initialize the vma for the sampling buffer
				2308	*/
				2309	vma->vm_mm = mm;
				2310	vma->vm_flags = VM_READ\| VM_MAYREAD \|VM_RESERVED;
				2311	vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */
				2312
				2313	/*
				2314	* Now we have everything we need and we can initialize
				2315	* and connect all the data structures
				2316	*/
				2317
				2318	ctx->ctx_smpl_hdr = smpl_buf;
				2319	ctx->ctx_smpl_size = size; /* aligned size */
				2320
				2321	/*
				2322	* Let's do the difficult operations next.
				2323	*
				2324	* now we atomically find some area in the address space and
				2325	* remap the buffer in it.
				2326	*/
				2327	down_write(&task->mm->mmap_sem);
				2328
				2329	/* find some free area in address space, must have mmap sem held */
				2330	vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE\|MAP_ANONYMOUS, 0);
				2331	if (vma->vm_start == 0UL) {
				2332	DPRINT(("Cannot find unmapped area for size %ld\n", size));
				2333	up_write(&task->mm->mmap_sem);
				2334	goto error;
				2335	}
				2336	vma->vm_end = vma->vm_start + size;
				2337	vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
				2338
				2339	DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));
				2340
				2341	/* can only be applied to current task, need to have the mm semaphore held when called */
				2342	if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
				2343	DPRINT(("Can't remap buffer\n"));
				2344	up_write(&task->mm->mmap_sem);
				2345	goto error;
				2346	}
				2347
				2348	/*
				2349	* now insert the vma in the vm list for the process, must be
				2350	* done with mmap lock held
				2351	*/
				2352	insert_vm_struct(mm, vma);
				2353
				2354	mm->total_vm += size >> PAGE_SHIFT;
Hugh Dickins	ab50b8e	2005-10-29 18:15:56 -0700	[diff] [blame]	2355	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
				2356	vma_pages(vma));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2357	up_write(&task->mm->mmap_sem);
				2358
				2359	/*
				2360	* keep track of user level virtual address
				2361	*/
				2362	ctx->ctx_smpl_vaddr = (void *)vma->vm_start;
				2363	(unsigned long )user_vaddr = vma->vm_start;
				2364
				2365	return 0;
				2366
				2367	error:
				2368	kmem_cache_free(vm_area_cachep, vma);
				2369	error_kmem:
				2370	pfm_rvfree(smpl_buf, size);
				2371
				2372	return -ENOMEM;
				2373	}
				2374
				2375	/*
				2376	* XXX: do something better here
				2377	*/
				2378	static int
				2379	pfm_bad_permissions(struct task_struct *task)
				2380	{
				2381	/* inspired by ptrace_attach() */
				2382	DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
				2383	current->uid,
				2384	current->gid,
				2385	task->euid,
				2386	task->suid,
				2387	task->uid,
				2388	task->egid,
				2389	task->sgid));
				2390
				2391	return ((current->uid != task->euid)
				2392	\|\| (current->uid != task->suid)
				2393	\|\| (current->uid != task->uid)
				2394	\|\| (current->gid != task->egid)
				2395	\|\| (current->gid != task->sgid)
				2396	\|\| (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
				2397	}
				2398
				2399	static int
				2400	pfarg_is_sane(struct task_struct task, pfarg_context_t pfx)
				2401	{
				2402	int ctx_flags;
				2403
				2404	/* valid signal */
				2405
				2406	ctx_flags = pfx->ctx_flags;
				2407
				2408	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
				2409
				2410	/*
				2411	* cannot block in this mode
				2412	*/
				2413	if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
				2414	DPRINT(("cannot use blocking mode when in system wide monitoring\n"));
				2415	return -EINVAL;
				2416	}
				2417	} else {
				2418	}
				2419	/* probably more to add here */
				2420
				2421	return 0;
				2422	}
				2423
				2424	static int
				2425	pfm_setup_buffer_fmt(struct task_struct task, pfm_context_t ctx, unsigned int ctx_flags,
				2426	unsigned int cpu, pfarg_context_t *arg)
				2427	{
				2428	pfm_buffer_fmt_t *fmt = NULL;
				2429	unsigned long size = 0UL;
				2430	void *uaddr = NULL;
				2431	void *fmt_arg = NULL;
				2432	int ret = 0;
				2433	#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1)
				2434
				2435	/* invoke and lock buffer format, if found */
				2436	fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id);
				2437	if (fmt == NULL) {
				2438	DPRINT(("[%d] cannot find buffer format\n", task->pid));
				2439	return -EINVAL;
				2440	}
				2441
				2442	/*
				2443	* buffer argument MUST be contiguous to pfarg_context_t
				2444	*/
				2445	if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg);
				2446
				2447	ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg);
				2448
				2449	DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret));
				2450
				2451	if (ret) goto error;
				2452
				2453	/* link buffer format and context */
				2454	ctx->ctx_buf_fmt = fmt;
				2455
				2456	/*
				2457	* check if buffer format wants to use perfmon buffer allocation/mapping service
				2458	*/
				2459	ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size);
				2460	if (ret) goto error;
				2461
				2462	if (size) {
				2463	/*
				2464	* buffer is always remapped into the caller's address space
				2465	*/
				2466	ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr);
				2467	if (ret) goto error;
				2468
				2469	/* keep track of user address of buffer */
				2470	arg->ctx_smpl_vaddr = uaddr;
				2471	}
				2472	ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg);
				2473
				2474	error:
				2475	return ret;
				2476	}
				2477
				2478	static void
				2479	pfm_reset_pmu_state(pfm_context_t *ctx)
				2480	{
				2481	int i;
				2482
				2483	/*
				2484	* install reset values for PMC.
				2485	*/
				2486	for (i=1; PMC_IS_LAST(i) == 0; i++) {
				2487	if (PMC_IS_IMPL(i) == 0) continue;
				2488	ctx->ctx_pmcs[i] = PMC_DFL_VAL(i);
				2489	DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i]));
				2490	}
				2491	/*
				2492	* PMD registers are set to 0UL when the context in memset()
				2493	*/
				2494
				2495	/*
				2496	* On context switched restore, we must restore ALL pmc and ALL pmd even
				2497	* when they are not actively used by the task. In UP, the incoming process
				2498	* may otherwise pick up left over PMC, PMD state from the previous process.
				2499	* As opposed to PMD, stale PMC can cause harm to the incoming
				2500	* process because they may change what is being measured.
				2501	* Therefore, we must systematically reinstall the entire
				2502	* PMC state. In SMP, the same thing is possible on the
				2503	* same CPU but also on between 2 CPUs.
				2504	*
				2505	* The problem with PMD is information leaking especially
				2506	* to user level when psr.sp=0
				2507	*
				2508	* There is unfortunately no easy way to avoid this problem
				2509	* on either UP or SMP. This definitively slows down the
				2510	* pfm_load_regs() function.
				2511	*/
				2512
				2513	/*
				2514	* bitmask of all PMCs accessible to this context
				2515	*
				2516	* PMC0 is treated differently.
				2517	*/
				2518	ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;
				2519
				2520	/*
				2521	* bitmask of all PMDs that are accesible to this context
				2522	*/
				2523	ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];
				2524
				2525	DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0]));
				2526
				2527	/*
				2528	* useful in case of re-enable after disable
				2529	*/
				2530	ctx->ctx_used_ibrs[0] = 0UL;
				2531	ctx->ctx_used_dbrs[0] = 0UL;
				2532	}
				2533
				2534	static int
				2535	pfm_ctx_getsize(void arg, size_t sz)
				2536	{
				2537	pfarg_context_t req = (pfarg_context_t )arg;
				2538	pfm_buffer_fmt_t *fmt;
				2539
				2540	*sz = 0;
				2541
				2542	if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0;
				2543
				2544	fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id);
				2545	if (fmt == NULL) {
				2546	DPRINT(("cannot find buffer format\n"));
				2547	return -EINVAL;
				2548	}
				2549	/* get just enough to copy in user parameters */
				2550	*sz = fmt->fmt_arg_size;
				2551	DPRINT(("arg_size=%lu\n", *sz));
				2552
				2553	return 0;
				2554	}
				2555
				2556
				2557
				2558	/*
				2559	* cannot attach if :
				2560	* - kernel task
				2561	* - task not owned by caller
				2562	* - task incompatible with context mode
				2563	*/
				2564	static int
				2565	pfm_task_incompatible(pfm_context_t ctx, struct task_struct task)
				2566	{
				2567	/*
				2568	* no kernel task or task not owner by caller
				2569	*/
				2570	if (task->mm == NULL) {
				2571	DPRINT(("task [%d] has not memory context (kernel thread)\n", task->pid));
				2572	return -EPERM;
				2573	}
				2574	if (pfm_bad_permissions(task)) {
				2575	DPRINT(("no permission to attach to [%d]\n", task->pid));
				2576	return -EPERM;
				2577	}
				2578	/*
				2579	* cannot block in self-monitoring mode
				2580	*/
				2581	if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) {
				2582	DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid));
				2583	return -EINVAL;
				2584	}
				2585
				2586	if (task->exit_state == EXIT_ZOMBIE) {
				2587	DPRINT(("cannot attach to zombie task [%d]\n", task->pid));
				2588	return -EBUSY;
				2589	}
				2590
				2591	/*
				2592	* always ok for self
				2593	*/
				2594	if (task == current) return 0;
				2595
				2596	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
				2597	DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
				2598	return -EBUSY;
				2599	}
				2600	/*
				2601	* make sure the task is off any CPU
				2602	*/
				2603	wait_task_inactive(task);
				2604
				2605	/* more to come... */
				2606
				2607	return 0;
				2608	}
				2609
				2610	static int
				2611	pfm_get_task(pfm_context_t ctx, pid_t pid, struct task_struct *task)
				2612	{
				2613	struct task_struct *p = current;
				2614	int ret;
				2615
				2616	/* XXX: need to add more checks here */
				2617	if (pid < 2) return -EPERM;
				2618
				2619	if (pid != current->pid) {
				2620
				2621	read_lock(&tasklist_lock);
				2622
				2623	p = find_task_by_pid(pid);
				2624
				2625	/* make sure task cannot go away while we operate on it */
				2626	if (p) get_task_struct(p);
				2627
				2628	read_unlock(&tasklist_lock);
				2629
				2630	if (p == NULL) return -ESRCH;
				2631	}
				2632
				2633	ret = pfm_task_incompatible(ctx, p);
				2634	if (ret == 0) {
				2635	*task = p;
				2636	} else if (p != current) {
				2637	pfm_put_task(p);
				2638	}
				2639	return ret;
				2640	}
				2641
				2642
				2643
				2644	static int
				2645	pfm_context_create(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				2646	{
				2647	pfarg_context_t req = (pfarg_context_t )arg;
				2648	struct file *filp;
				2649	int ctx_flags;
				2650	int ret;
				2651
				2652	/* let's check the arguments first */
				2653	ret = pfarg_is_sane(current, req);
				2654	if (ret < 0) return ret;
				2655
				2656	ctx_flags = req->ctx_flags;
				2657
				2658	ret = -ENOMEM;
				2659
				2660	ctx = pfm_context_alloc();
				2661	if (!ctx) goto error;
				2662
				2663	ret = pfm_alloc_fd(&filp);
				2664	if (ret < 0) goto error_file;
				2665
				2666	req->ctx_fd = ctx->ctx_fd = ret;
				2667
				2668	/*
				2669	* attach context to file
				2670	*/
				2671	filp->private_data = ctx;
				2672
				2673	/*
				2674	* does the user want to sample?
				2675	*/
				2676	if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) {
				2677	ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req);
				2678	if (ret) goto buffer_error;
				2679	}
				2680
				2681	/*
				2682	* init context protection lock
				2683	*/
				2684	spin_lock_init(&ctx->ctx_lock);
				2685
				2686	/*
				2687	* context is unloaded
				2688	*/
				2689	ctx->ctx_state = PFM_CTX_UNLOADED;
				2690
				2691	/*
				2692	* initialization of context's flags
				2693	*/
				2694	ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
				2695	ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
				2696	ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */
				2697	ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
				2698	/*
				2699	* will move to set properties
				2700	* ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
				2701	*/
				2702
				2703	/*
				2704	* init restart semaphore to locked
				2705	*/
				2706	sema_init(&ctx->ctx_restart_sem, 0);
				2707
				2708	/*
				2709	* activation is used in SMP only
				2710	*/
				2711	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
				2712	SET_LAST_CPU(ctx, -1);
				2713
				2714	/*
				2715	* initialize notification message queue
				2716	*/
				2717	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
				2718	init_waitqueue_head(&ctx->ctx_msgq_wait);
				2719	init_waitqueue_head(&ctx->ctx_zombieq);
				2720
				2721	DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n",
				2722	ctx,
				2723	ctx_flags,
				2724	ctx->ctx_fl_system,
				2725	ctx->ctx_fl_block,
				2726	ctx->ctx_fl_excl_idle,
				2727	ctx->ctx_fl_no_msg,
				2728	ctx->ctx_fd));
				2729
				2730	/*
				2731	* initialize soft PMU state
				2732	*/
				2733	pfm_reset_pmu_state(ctx);
				2734
				2735	return 0;
				2736
				2737	buffer_error:
				2738	pfm_free_fd(ctx->ctx_fd, filp);
				2739
				2740	if (ctx->ctx_buf_fmt) {
				2741	pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs);
				2742	}
				2743	error_file:
				2744	pfm_context_free(ctx);
				2745
				2746	error:
				2747	return ret;
				2748	}
				2749
				2750	static inline unsigned long
				2751	pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
				2752	{
				2753	unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
				2754	unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
				2755	extern unsigned long carta_random32 (unsigned long seed);
				2756
				2757	if (reg->flags & PFM_REGFL_RANDOM) {
				2758	new_seed = carta_random32(old_seed);
				2759	val -= (old_seed & mask); /* counter values are negative numbers! */
				2760	if ((mask >> 32) != 0)
				2761	/* construct a full 64-bit random value: */
				2762	new_seed \|= carta_random32(old_seed >> 32) << 32;
				2763	reg->seed = new_seed;
				2764	}
				2765	reg->lval = val;
				2766	return val;
				2767	}
				2768
				2769	static void
				2770	pfm_reset_regs_masked(pfm_context_t ctx, unsigned long ovfl_regs, int is_long_reset)
				2771	{
				2772	unsigned long mask = ovfl_regs[0];
				2773	unsigned long reset_others = 0UL;
				2774	unsigned long val;
				2775	int i;
				2776
				2777	/*
				2778	* now restore reset value on sampling overflowed counters
				2779	*/
				2780	mask >>= PMU_FIRST_COUNTER;
				2781	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
				2782
				2783	if ((mask & 0x1UL) == 0UL) continue;
				2784
				2785	ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
				2786	reset_others \|= ctx->ctx_pmds[i].reset_pmds[0];
				2787
				2788	DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
				2789	}
				2790
				2791	/*
				2792	* Now take care of resetting the other registers
				2793	*/
				2794	for(i = 0; reset_others; i++, reset_others >>= 1) {
				2795
				2796	if ((reset_others & 0x1) == 0) continue;
				2797
				2798	ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
				2799
				2800	DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
				2801	is_long_reset ? "long" : "short", i, val));
				2802	}
				2803	}
				2804
				2805	static void
				2806	pfm_reset_regs(pfm_context_t ctx, unsigned long ovfl_regs, int is_long_reset)
				2807	{
				2808	unsigned long mask = ovfl_regs[0];
				2809	unsigned long reset_others = 0UL;
				2810	unsigned long val;
				2811	int i;
				2812
				2813	DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset));
				2814
				2815	if (ctx->ctx_state == PFM_CTX_MASKED) {
				2816	pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset);
				2817	return;
				2818	}
				2819
				2820	/*
				2821	* now restore reset value on sampling overflowed counters
				2822	*/
				2823	mask >>= PMU_FIRST_COUNTER;
				2824	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
				2825
				2826	if ((mask & 0x1UL) == 0UL) continue;
				2827
				2828	val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
				2829	reset_others \|= ctx->ctx_pmds[i].reset_pmds[0];
				2830
				2831	DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
				2832
				2833	pfm_write_soft_counter(ctx, i, val);
				2834	}
				2835
				2836	/*
				2837	* Now take care of resetting the other registers
				2838	*/
				2839	for(i = 0; reset_others; i++, reset_others >>= 1) {
				2840
				2841	if ((reset_others & 0x1) == 0) continue;
				2842
				2843	val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
				2844
				2845	if (PMD_IS_COUNTING(i)) {
				2846	pfm_write_soft_counter(ctx, i, val);
				2847	} else {
				2848	ia64_set_pmd(i, val);
				2849	}
				2850	DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
				2851	is_long_reset ? "long" : "short", i, val));
				2852	}
				2853	ia64_srlz_d();
				2854	}
				2855
				2856	static int
				2857	pfm_write_pmcs(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				2858	{
				2859	struct thread_struct *thread = NULL;
				2860	struct task_struct *task;
				2861	pfarg_reg_t req = (pfarg_reg_t )arg;
				2862	unsigned long value, pmc_pm;
				2863	unsigned long smpl_pmds, reset_pmds, impl_pmds;
				2864	unsigned int cnum, reg_flags, flags, pmc_type;
				2865	int i, can_access_pmu = 0, is_loaded, is_system, expert_mode;
				2866	int is_monitor, is_counting, state;
				2867	int ret = -EINVAL;
				2868	pfm_reg_check_t wr_func;
				2869	#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
				2870
				2871	state = ctx->ctx_state;
				2872	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				2873	is_system = ctx->ctx_fl_system;
				2874	task = ctx->ctx_task;
				2875	impl_pmds = pmu_conf->impl_pmds[0];
				2876
				2877	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
				2878
				2879	if (is_loaded) {
				2880	thread = &task->thread;
				2881	/*
				2882	* In system wide and when the context is loaded, access can only happen
				2883	* when the caller is running on the CPU being monitored by the session.
				2884	* It does not have to be the owner (ctx_task) of the context per se.
				2885	*/
				2886	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				2887	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				2888	return -EBUSY;
				2889	}
				2890	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				2891	}
				2892	expert_mode = pfm_sysctl.expert_mode;
				2893
				2894	for (i = 0; i < count; i++, req++) {
				2895
				2896	cnum = req->reg_num;
				2897	reg_flags = req->reg_flags;
				2898	value = req->reg_value;
				2899	smpl_pmds = req->reg_smpl_pmds[0];
				2900	reset_pmds = req->reg_reset_pmds[0];
				2901	flags = 0;
				2902
				2903
				2904	if (cnum >= PMU_MAX_PMCS) {
				2905	DPRINT(("pmc%u is invalid\n", cnum));
				2906	goto error;
				2907	}
				2908
				2909	pmc_type = pmu_conf->pmc_desc[cnum].type;
				2910	pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1;
				2911	is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0;
				2912	is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0;
				2913
				2914	/*
				2915	* we reject all non implemented PMC as well
				2916	* as attempts to modify PMC[0-3] which are used
				2917	* as status registers by the PMU
				2918	*/
				2919	if ((pmc_type & PFM_REG_IMPL) == 0 \|\| (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) {
				2920	DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type));
				2921	goto error;
				2922	}
				2923	wr_func = pmu_conf->pmc_desc[cnum].write_check;
				2924	/*
				2925	* If the PMC is a monitor, then if the value is not the default:
				2926	* - system-wide session: PMCx.pm=1 (privileged monitor)
				2927	* - per-task : PMCx.pm=0 (user monitor)
				2928	*/
				2929	if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) {
				2930	DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n",
				2931	cnum,
				2932	pmc_pm,
				2933	is_system));
				2934	goto error;
				2935	}
				2936
				2937	if (is_counting) {
				2938	/*
				2939	* enforce generation of overflow interrupt. Necessary on all
				2940	* CPUs.
				2941	*/
				2942	value \|= 1 << PMU_PMC_OI;
				2943
				2944	if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
				2945	flags \|= PFM_REGFL_OVFL_NOTIFY;
				2946	}
				2947
				2948	if (reg_flags & PFM_REGFL_RANDOM) flags \|= PFM_REGFL_RANDOM;
				2949
				2950	/* verify validity of smpl_pmds */
				2951	if ((smpl_pmds & impl_pmds) != smpl_pmds) {
				2952	DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum));
				2953	goto error;
				2954	}
				2955
				2956	/* verify validity of reset_pmds */
				2957	if ((reset_pmds & impl_pmds) != reset_pmds) {
				2958	DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
				2959	goto error;
				2960	}
				2961	} else {
				2962	if (reg_flags & (PFM_REGFL_OVFL_NOTIFY\|PFM_REGFL_RANDOM)) {
				2963	DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum));
				2964	goto error;
				2965	}
				2966	/* eventid on non-counting monitors are ignored */
				2967	}
				2968
				2969	/*
				2970	* execute write checker, if any
				2971	*/
				2972	if (likely(expert_mode == 0 && wr_func)) {
				2973	ret = (*wr_func)(task, ctx, cnum, &value, regs);
				2974	if (ret) goto error;
				2975	ret = -EINVAL;
				2976	}
				2977
				2978	/*
				2979	* no error on this register
				2980	*/
				2981	PFM_REG_RETFLAG_SET(req->reg_flags, 0);
				2982
				2983	/*
				2984	* Now we commit the changes to the software state
				2985	*/
				2986
				2987	/*
				2988	* update overflow information
				2989	*/
				2990	if (is_counting) {
				2991	/*
				2992	* full flag update each time a register is programmed
				2993	*/
				2994	ctx->ctx_pmds[cnum].flags = flags;
				2995
				2996	ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds;
				2997	ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds;
				2998	ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid;
				2999
				3000	/*
				3001	* Mark all PMDS to be accessed as used.
				3002	*
				3003	* We do not keep track of PMC because we have to
				3004	* systematically restore ALL of them.
				3005	*
				3006	* We do not update the used_monitors mask, because
				3007	* if we have not programmed them, then will be in
				3008	* a quiescent state, therefore we will not need to
				3009	* mask/restore then when context is MASKED.
				3010	*/
				3011	CTX_USED_PMD(ctx, reset_pmds);
				3012	CTX_USED_PMD(ctx, smpl_pmds);
				3013	/*
				3014	* make sure we do not try to reset on
				3015	* restart because we have established new values
				3016	*/
				3017	if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
				3018	}
				3019	/*
				3020	* Needed in case the user does not initialize the equivalent
				3021	* PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no
				3022	* possible leak here.
				3023	*/
				3024	CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]);
				3025
				3026	/*
				3027	* keep track of the monitor PMC that we are using.
				3028	* we save the value of the pmc in ctx_pmcs[] and if
				3029	* the monitoring is not stopped for the context we also
				3030	* place it in the saved state area so that it will be
				3031	* picked up later by the context switch code.
				3032	*
				3033	* The value in ctx_pmcs[] can only be changed in pfm_write_pmcs().
				3034	*
				3035	* The value in thread->pmcs[] may be modified on overflow, i.e., when
				3036	* monitoring needs to be stopped.
				3037	*/
				3038	if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum);
				3039
				3040	/*
				3041	* update context state
				3042	*/
				3043	ctx->ctx_pmcs[cnum] = value;
				3044
				3045	if (is_loaded) {
				3046	/*
				3047	* write thread state
				3048	*/
				3049	if (is_system == 0) thread->pmcs[cnum] = value;
				3050
				3051	/*
				3052	* write hardware register if we can
				3053	*/
				3054	if (can_access_pmu) {
				3055	ia64_set_pmc(cnum, value);
				3056	}
				3057	#ifdef CONFIG_SMP
				3058	else {
				3059	/*
				3060	* per-task SMP only here
				3061	*
				3062	* we are guaranteed that the task is not running on the other CPU,
				3063	* we indicate that this PMD will need to be reloaded if the task
				3064	* is rescheduled on the CPU it ran last on.
				3065	*/
				3066	ctx->ctx_reload_pmcs[0] \|= 1UL << cnum;
				3067	}
				3068	#endif
				3069	}
				3070
				3071	DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
				3072	cnum,
				3073	value,
				3074	is_loaded,
				3075	can_access_pmu,
				3076	flags,
				3077	ctx->ctx_all_pmcs[0],
				3078	ctx->ctx_used_pmds[0],
				3079	ctx->ctx_pmds[cnum].eventid,
				3080	smpl_pmds,
				3081	reset_pmds,
				3082	ctx->ctx_reload_pmcs[0],
				3083	ctx->ctx_used_monitors[0],
				3084	ctx->ctx_ovfl_regs[0]));
				3085	}
				3086
				3087	/*
				3088	* make sure the changes are visible
				3089	*/
				3090	if (can_access_pmu) ia64_srlz_d();
				3091
				3092	return 0;
				3093	error:
				3094	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				3095	return ret;
				3096	}
				3097
				3098	static int
				3099	pfm_write_pmds(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3100	{
				3101	struct thread_struct *thread = NULL;
				3102	struct task_struct *task;
				3103	pfarg_reg_t req = (pfarg_reg_t )arg;
				3104	unsigned long value, hw_value, ovfl_mask;
				3105	unsigned int cnum;
				3106	int i, can_access_pmu = 0, state;
				3107	int is_counting, is_loaded, is_system, expert_mode;
				3108	int ret = -EINVAL;
				3109	pfm_reg_check_t wr_func;
				3110
				3111
				3112	state = ctx->ctx_state;
				3113	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				3114	is_system = ctx->ctx_fl_system;
				3115	ovfl_mask = pmu_conf->ovfl_val;
				3116	task = ctx->ctx_task;
				3117
				3118	if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL;
				3119
				3120	/*
				3121	* on both UP and SMP, we can only write to the PMC when the task is
				3122	* the owner of the local PMU.
				3123	*/
				3124	if (likely(is_loaded)) {
				3125	thread = &task->thread;
				3126	/*
				3127	* In system wide and when the context is loaded, access can only happen
				3128	* when the caller is running on the CPU being monitored by the session.
				3129	* It does not have to be the owner (ctx_task) of the context per se.
				3130	*/
				3131	if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
				3132	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3133	return -EBUSY;
				3134	}
				3135	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				3136	}
				3137	expert_mode = pfm_sysctl.expert_mode;
				3138
				3139	for (i = 0; i < count; i++, req++) {
				3140
				3141	cnum = req->reg_num;
				3142	value = req->reg_value;
				3143
				3144	if (!PMD_IS_IMPL(cnum)) {
				3145	DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum));
				3146	goto abort_mission;
				3147	}
				3148	is_counting = PMD_IS_COUNTING(cnum);
				3149	wr_func = pmu_conf->pmd_desc[cnum].write_check;
				3150
				3151	/*
				3152	* execute write checker, if any
				3153	*/
				3154	if (unlikely(expert_mode == 0 && wr_func)) {
				3155	unsigned long v = value;
				3156
				3157	ret = (*wr_func)(task, ctx, cnum, &v, regs);
				3158	if (ret) goto abort_mission;
				3159
				3160	value = v;
				3161	ret = -EINVAL;
				3162	}
				3163
				3164	/*
				3165	* no error on this register
				3166	*/
				3167	PFM_REG_RETFLAG_SET(req->reg_flags, 0);
				3168
				3169	/*
				3170	* now commit changes to software state
				3171	*/
				3172	hw_value = value;
				3173
				3174	/*
				3175	* update virtualized (64bits) counter
				3176	*/
				3177	if (is_counting) {
				3178	/*
				3179	* write context state
				3180	*/
				3181	ctx->ctx_pmds[cnum].lval = value;
				3182
				3183	/*
				3184	* when context is load we use the split value
				3185	*/
				3186	if (is_loaded) {
				3187	hw_value = value & ovfl_mask;
				3188	value = value & ~ovfl_mask;
				3189	}
				3190	}
				3191	/*
				3192	* update reset values (not just for counters)
				3193	*/
				3194	ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset;
				3195	ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset;
				3196
				3197	/*
				3198	* update randomization parameters (not just for counters)
				3199	*/
				3200	ctx->ctx_pmds[cnum].seed = req->reg_random_seed;
				3201	ctx->ctx_pmds[cnum].mask = req->reg_random_mask;
				3202
				3203	/*
				3204	* update context value
				3205	*/
				3206	ctx->ctx_pmds[cnum].val = value;
				3207
				3208	/*
				3209	* Keep track of what we use
				3210	*
				3211	* We do not keep track of PMC because we have to
				3212	* systematically restore ALL of them.
				3213	*/
				3214	CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum));
				3215
				3216	/*
				3217	* mark this PMD register used as well
				3218	*/
				3219	CTX_USED_PMD(ctx, RDEP(cnum));
				3220
				3221	/*
				3222	* make sure we do not try to reset on
				3223	* restart because we have established new values
				3224	*/
				3225	if (is_counting && state == PFM_CTX_MASKED) {
				3226	ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
				3227	}
				3228
				3229	if (is_loaded) {
				3230	/*
				3231	* write thread state
				3232	*/
				3233	if (is_system == 0) thread->pmds[cnum] = hw_value;
				3234
				3235	/*
				3236	* write hardware register if we can
				3237	*/
				3238	if (can_access_pmu) {
				3239	ia64_set_pmd(cnum, hw_value);
				3240	} else {
				3241	#ifdef CONFIG_SMP
				3242	/*
				3243	* we are guaranteed that the task is not running on the other CPU,
				3244	* we indicate that this PMD will need to be reloaded if the task
				3245	* is rescheduled on the CPU it ran last on.
				3246	*/
				3247	ctx->ctx_reload_pmds[0] \|= 1UL << cnum;
				3248	#endif
				3249	}
				3250	}
				3251
				3252	DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx "
				3253	"long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
				3254	cnum,
				3255	value,
				3256	is_loaded,
				3257	can_access_pmu,
				3258	hw_value,
				3259	ctx->ctx_pmds[cnum].val,
				3260	ctx->ctx_pmds[cnum].short_reset,
				3261	ctx->ctx_pmds[cnum].long_reset,
				3262	PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
				3263	ctx->ctx_pmds[cnum].seed,
				3264	ctx->ctx_pmds[cnum].mask,
				3265	ctx->ctx_used_pmds[0],
				3266	ctx->ctx_pmds[cnum].reset_pmds[0],
				3267	ctx->ctx_reload_pmds[0],
				3268	ctx->ctx_all_pmds[0],
				3269	ctx->ctx_ovfl_regs[0]));
				3270	}
				3271
				3272	/*
				3273	* make changes visible
				3274	*/
				3275	if (can_access_pmu) ia64_srlz_d();
				3276
				3277	return 0;
				3278
				3279	abort_mission:
				3280	/*
				3281	* for now, we have only one possibility for error
				3282	*/
				3283	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				3284	return ret;
				3285	}
				3286
				3287	/*
				3288	* By the way of PROTECT_CONTEXT(), interrupts are masked while we are in this function.
				3289	* Therefore we know, we do not have to worry about the PMU overflow interrupt. If an
				3290	* interrupt is delivered during the call, it will be kept pending until we leave, making
				3291	* it appears as if it had been generated at the UNPROTECT_CONTEXT(). At least we are
				3292	* guaranteed to return consistent data to the user, it may simply be old. It is not
				3293	* trivial to treat the overflow while inside the call because you may end up in
				3294	* some module sampling buffer code causing deadlocks.
				3295	*/
				3296	static int
				3297	pfm_read_pmds(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3298	{
				3299	struct thread_struct *thread = NULL;
				3300	struct task_struct *task;
				3301	unsigned long val = 0UL, lval, ovfl_mask, sval;
				3302	pfarg_reg_t req = (pfarg_reg_t )arg;
				3303	unsigned int cnum, reg_flags = 0;
				3304	int i, can_access_pmu = 0, state;
				3305	int is_loaded, is_system, is_counting, expert_mode;
				3306	int ret = -EINVAL;
				3307	pfm_reg_check_t rd_func;
				3308
				3309	/*
				3310	* access is possible when loaded only for
				3311	* self-monitoring tasks or in UP mode
				3312	*/
				3313
				3314	state = ctx->ctx_state;
				3315	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				3316	is_system = ctx->ctx_fl_system;
				3317	ovfl_mask = pmu_conf->ovfl_val;
				3318	task = ctx->ctx_task;
				3319
				3320	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
				3321
				3322	if (likely(is_loaded)) {
				3323	thread = &task->thread;
				3324	/*
				3325	* In system wide and when the context is loaded, access can only happen
				3326	* when the caller is running on the CPU being monitored by the session.
				3327	* It does not have to be the owner (ctx_task) of the context per se.
				3328	*/
				3329	if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
				3330	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3331	return -EBUSY;
				3332	}
				3333	/*
				3334	* this can be true when not self-monitoring only in UP
				3335	*/
				3336	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				3337
				3338	if (can_access_pmu) ia64_srlz_d();
				3339	}
				3340	expert_mode = pfm_sysctl.expert_mode;
				3341
				3342	DPRINT(("ld=%d apmu=%d ctx_state=%d\n",
				3343	is_loaded,
				3344	can_access_pmu,
				3345	state));
				3346
				3347	/*
				3348	* on both UP and SMP, we can only read the PMD from the hardware register when
				3349	* the task is the owner of the local PMU.
				3350	*/
				3351
				3352	for (i = 0; i < count; i++, req++) {
				3353
				3354	cnum = req->reg_num;
				3355	reg_flags = req->reg_flags;
				3356
				3357	if (unlikely(!PMD_IS_IMPL(cnum))) goto error;
				3358	/*
				3359	* we can only read the register that we use. That includes
				3360	* the one we explicitely initialize AND the one we want included
				3361	* in the sampling buffer (smpl_regs).
				3362	*
				3363	* Having this restriction allows optimization in the ctxsw routine
				3364	* without compromising security (leaks)
				3365	*/
				3366	if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error;
				3367
				3368	sval = ctx->ctx_pmds[cnum].val;
				3369	lval = ctx->ctx_pmds[cnum].lval;
				3370	is_counting = PMD_IS_COUNTING(cnum);
				3371
				3372	/*
				3373	* If the task is not the current one, then we check if the
				3374	* PMU state is still in the local live register due to lazy ctxsw.
				3375	* If true, then we read directly from the registers.
				3376	*/
				3377	if (can_access_pmu){
				3378	val = ia64_get_pmd(cnum);
				3379	} else {
				3380	/*
				3381	* context has been saved
				3382	* if context is zombie, then task does not exist anymore.
				3383	* In this case, we use the full value saved in the context (pfm_flush_regs()).
				3384	*/
				3385	val = is_loaded ? thread->pmds[cnum] : 0UL;
				3386	}
				3387	rd_func = pmu_conf->pmd_desc[cnum].read_check;
				3388
				3389	if (is_counting) {
				3390	/*
				3391	* XXX: need to check for overflow when loaded
				3392	*/
				3393	val &= ovfl_mask;
				3394	val += sval;
				3395	}
				3396
				3397	/*
				3398	* execute read checker, if any
				3399	*/
				3400	if (unlikely(expert_mode == 0 && rd_func)) {
				3401	unsigned long v = val;
				3402	ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs);
				3403	if (ret) goto error;
				3404	val = v;
				3405	ret = -EINVAL;
				3406	}
				3407
				3408	PFM_REG_RETFLAG_SET(reg_flags, 0);
				3409
				3410	DPRINT(("pmd[%u]=0x%lx\n", cnum, val));
				3411
				3412	/*
				3413	* update register return value, abort all if problem during copy.
				3414	* we only modify the reg_flags field. no check mode is fine because
				3415	* access has been verified upfront in sys_perfmonctl().
				3416	*/
				3417	req->reg_value = val;
				3418	req->reg_flags = reg_flags;
				3419	req->reg_last_reset_val = lval;
				3420	}
				3421
				3422	return 0;
				3423
				3424	error:
				3425	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				3426	return ret;
				3427	}
				3428
				3429	int
				3430	pfm_mod_write_pmcs(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3431	{
				3432	pfm_context_t *ctx;
				3433
				3434	if (req == NULL) return -EINVAL;
				3435
				3436	ctx = GET_PMU_CTX();
				3437
				3438	if (ctx == NULL) return -EINVAL;
				3439
				3440	/*
				3441	* for now limit to current task, which is enough when calling
				3442	* from overflow handler
				3443	*/
				3444	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3445
				3446	return pfm_write_pmcs(ctx, req, nreq, regs);
				3447	}
				3448	EXPORT_SYMBOL(pfm_mod_write_pmcs);
				3449
				3450	int
				3451	pfm_mod_read_pmds(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3452	{
				3453	pfm_context_t *ctx;
				3454
				3455	if (req == NULL) return -EINVAL;
				3456
				3457	ctx = GET_PMU_CTX();
				3458
				3459	if (ctx == NULL) return -EINVAL;
				3460
				3461	/*
				3462	* for now limit to current task, which is enough when calling
				3463	* from overflow handler
				3464	*/
				3465	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3466
				3467	return pfm_read_pmds(ctx, req, nreq, regs);
				3468	}
				3469	EXPORT_SYMBOL(pfm_mod_read_pmds);
				3470
				3471	/*
				3472	* Only call this function when a process it trying to
				3473	* write the debug registers (reading is always allowed)
				3474	*/
				3475	int
				3476	pfm_use_debug_registers(struct task_struct *task)
				3477	{
				3478	pfm_context_t *ctx = task->thread.pfm_context;
				3479	unsigned long flags;
				3480	int ret = 0;
				3481
				3482	if (pmu_conf->use_rr_dbregs == 0) return 0;
				3483
				3484	DPRINT(("called for [%d]\n", task->pid));
				3485
				3486	/*
				3487	* do it only once
				3488	*/
				3489	if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;
				3490
				3491	/*
				3492	* Even on SMP, we do not need to use an atomic here because
				3493	* the only way in is via ptrace() and this is possible only when the
				3494	* process is stopped. Even in the case where the ctxsw out is not totally
				3495	* completed by the time we come here, there is no way the 'stopped' process
				3496	* could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
				3497	* So this is always safe.
				3498	*/
				3499	if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
				3500
				3501	LOCK_PFS(flags);
				3502
				3503	/*
				3504	* We cannot allow setting breakpoints when system wide monitoring
				3505	* sessions are using the debug registers.
				3506	*/
				3507	if (pfm_sessions.pfs_sys_use_dbregs> 0)
				3508	ret = -1;
				3509	else
				3510	pfm_sessions.pfs_ptrace_use_dbregs++;
				3511
				3512	DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n",
				3513	pfm_sessions.pfs_ptrace_use_dbregs,
				3514	pfm_sessions.pfs_sys_use_dbregs,
				3515	task->pid, ret));
				3516
				3517	UNLOCK_PFS(flags);
				3518
				3519	return ret;
				3520	}
				3521
				3522	/*
				3523	* This function is called for every task that exits with the
				3524	* IA64_THREAD_DBG_VALID set. This indicates a task which was
				3525	* able to use the debug registers for debugging purposes via
				3526	* ptrace(). Therefore we know it was not using them for
				3527	* perfmormance monitoring, so we only decrement the number
				3528	* of "ptraced" debug register users to keep the count up to date
				3529	*/
				3530	int
				3531	pfm_release_debug_registers(struct task_struct *task)
				3532	{
				3533	unsigned long flags;
				3534	int ret;
				3535
				3536	if (pmu_conf->use_rr_dbregs == 0) return 0;
				3537
				3538	LOCK_PFS(flags);
				3539	if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
				3540	printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid);
				3541	ret = -1;
				3542	} else {
				3543	pfm_sessions.pfs_ptrace_use_dbregs--;
				3544	ret = 0;
				3545	}
				3546	UNLOCK_PFS(flags);
				3547
				3548	return ret;
				3549	}
				3550
				3551	static int
				3552	pfm_restart(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3553	{
				3554	struct task_struct *task;
				3555	pfm_buffer_fmt_t *fmt;
				3556	pfm_ovfl_ctrl_t rst_ctrl;
				3557	int state, is_system;
				3558	int ret = 0;
				3559
				3560	state = ctx->ctx_state;
				3561	fmt = ctx->ctx_buf_fmt;
				3562	is_system = ctx->ctx_fl_system;
				3563	task = PFM_CTX_TASK(ctx);
				3564
				3565	switch(state) {
				3566	case PFM_CTX_MASKED:
				3567	break;
				3568	case PFM_CTX_LOADED:
				3569	if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break;
				3570	/* fall through */
				3571	case PFM_CTX_UNLOADED:
				3572	case PFM_CTX_ZOMBIE:
				3573	DPRINT(("invalid state=%d\n", state));
				3574	return -EBUSY;
				3575	default:
				3576	DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state));
				3577	return -EINVAL;
				3578	}
				3579
				3580	/*
				3581	* In system wide and when the context is loaded, access can only happen
				3582	* when the caller is running on the CPU being monitored by the session.
				3583	* It does not have to be the owner (ctx_task) of the context per se.
				3584	*/
				3585	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				3586	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3587	return -EBUSY;
				3588	}
				3589
				3590	/* sanity check */
				3591	if (unlikely(task == NULL)) {
				3592	printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid);
				3593	return -EINVAL;
				3594	}
				3595
				3596	if (task == current \|\| is_system) {
				3597
				3598	fmt = ctx->ctx_buf_fmt;
				3599
				3600	DPRINT(("restarting self %d ovfl=0x%lx\n",
				3601	task->pid,
				3602	ctx->ctx_ovfl_regs[0]));
				3603
				3604	if (CTX_HAS_SMPL(ctx)) {
				3605
				3606	prefetch(ctx->ctx_smpl_hdr);
				3607
				3608	rst_ctrl.bits.mask_monitoring = 0;
				3609	rst_ctrl.bits.reset_ovfl_pmds = 0;
				3610
				3611	if (state == PFM_CTX_LOADED)
				3612	ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				3613	else
				3614	ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				3615	} else {
				3616	rst_ctrl.bits.mask_monitoring = 0;
				3617	rst_ctrl.bits.reset_ovfl_pmds = 1;
				3618	}
				3619
				3620	if (ret == 0) {
				3621	if (rst_ctrl.bits.reset_ovfl_pmds)
				3622	pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
				3623
				3624	if (rst_ctrl.bits.mask_monitoring == 0) {
				3625	DPRINT(("resuming monitoring for [%d]\n", task->pid));
				3626
				3627	if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task);
				3628	} else {
				3629	DPRINT(("keeping monitoring stopped for [%d]\n", task->pid));
				3630
				3631	// cannot use pfm_stop_monitoring(task, regs);
				3632	}
				3633	}
				3634	/*
				3635	* clear overflowed PMD mask to remove any stale information
				3636	*/
				3637	ctx->ctx_ovfl_regs[0] = 0UL;
				3638
				3639	/*
				3640	* back to LOADED state
				3641	*/
				3642	ctx->ctx_state = PFM_CTX_LOADED;
				3643
				3644	/*
				3645	* XXX: not really useful for self monitoring
				3646	*/
				3647	ctx->ctx_fl_can_restart = 0;
				3648
				3649	return 0;
				3650	}
				3651
				3652	/*
				3653	* restart another task
				3654	*/
				3655
				3656	/*
				3657	* When PFM_CTX_MASKED, we cannot issue a restart before the previous
				3658	* one is seen by the task.
				3659	*/
				3660	if (state == PFM_CTX_MASKED) {
				3661	if (ctx->ctx_fl_can_restart == 0) return -EINVAL;
				3662	/*
				3663	* will prevent subsequent restart before this one is
				3664	* seen by other task
				3665	*/
				3666	ctx->ctx_fl_can_restart = 0;
				3667	}
				3668
				3669	/*
				3670	* if blocking, then post the semaphore is PFM_CTX_MASKED, i.e.
				3671	* the task is blocked or on its way to block. That's the normal
				3672	* restart path. If the monitoring is not masked, then the task
				3673	* can be actively monitoring and we cannot directly intervene.
				3674	* Therefore we use the trap mechanism to catch the task and
				3675	* force it to reset the buffer/reset PMDs.
				3676	*
				3677	* if non-blocking, then we ensure that the task will go into
				3678	* pfm_handle_work() before returning to user mode.
				3679	*
				3680	* We cannot explicitely reset another task, it MUST always
				3681	* be done by the task itself. This works for system wide because
				3682	* the tool that is controlling the session is logically doing
				3683	* "self-monitoring".
				3684	*/
				3685	if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) {
				3686	DPRINT(("unblocking [%d] \n", task->pid));
				3687	up(&ctx->ctx_restart_sem);
				3688	} else {
				3689	DPRINT(("[%d] armed exit trap\n", task->pid));
				3690
				3691	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
				3692
				3693	PFM_SET_WORK_PENDING(task, 1);
				3694
				3695	pfm_set_task_notify(task);
				3696
				3697	/*
				3698	* XXX: send reschedule if task runs on another CPU
				3699	*/
				3700	}
				3701	return 0;
				3702	}
				3703
				3704	static int
				3705	pfm_debug(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3706	{
				3707	unsigned int m = (unsigned int )arg;
				3708
				3709	pfm_sysctl.debug = m == 0 ? 0 : 1;
				3710
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3711	printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off");
				3712
				3713	if (m == 0) {
				3714	memset(pfm_stats, 0, sizeof(pfm_stats));
				3715	for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL;
				3716	}
				3717	return 0;
				3718	}
				3719
				3720	/*
				3721	* arg can be NULL and count can be zero for this function
				3722	*/
				3723	static int
				3724	pfm_write_ibr_dbr(int mode, pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3725	{
				3726	struct thread_struct *thread = NULL;
				3727	struct task_struct *task;
				3728	pfarg_dbreg_t req = (pfarg_dbreg_t )arg;
				3729	unsigned long flags;
				3730	dbreg_t dbreg;
				3731	unsigned int rnum;
				3732	int first_time;
				3733	int ret = 0, state;
				3734	int i, can_access_pmu = 0;
				3735	int is_system, is_loaded;
				3736
				3737	if (pmu_conf->use_rr_dbregs == 0) return -EINVAL;
				3738
				3739	state = ctx->ctx_state;
				3740	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				3741	is_system = ctx->ctx_fl_system;
				3742	task = ctx->ctx_task;
				3743
				3744	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
				3745
				3746	/*
				3747	* on both UP and SMP, we can only write to the PMC when the task is
				3748	* the owner of the local PMU.
				3749	*/
				3750	if (is_loaded) {
				3751	thread = &task->thread;
				3752	/*
				3753	* In system wide and when the context is loaded, access can only happen
				3754	* when the caller is running on the CPU being monitored by the session.
				3755	* It does not have to be the owner (ctx_task) of the context per se.
				3756	*/
				3757	if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
				3758	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3759	return -EBUSY;
				3760	}
				3761	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				3762	}
				3763
				3764	/*
				3765	* we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w
				3766	* ensuring that no real breakpoint can be installed via this call.
				3767	*
				3768	* IMPORTANT: regs can be NULL in this function
				3769	*/
				3770
				3771	first_time = ctx->ctx_fl_using_dbreg == 0;
				3772
				3773	/*
				3774	* don't bother if we are loaded and task is being debugged
				3775	*/
				3776	if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) {
				3777	DPRINT(("debug registers already in use for [%d]\n", task->pid));
				3778	return -EBUSY;
				3779	}
				3780
				3781	/*
				3782	* check for debug registers in system wide mode
				3783	*
				3784	* If though a check is done in pfm_context_load(),
				3785	* we must repeat it here, in case the registers are
				3786	* written after the context is loaded
				3787	*/
				3788	if (is_loaded) {
				3789	LOCK_PFS(flags);
				3790
				3791	if (first_time && is_system) {
				3792	if (pfm_sessions.pfs_ptrace_use_dbregs)
				3793	ret = -EBUSY;
				3794	else
				3795	pfm_sessions.pfs_sys_use_dbregs++;
				3796	}
				3797	UNLOCK_PFS(flags);
				3798	}
				3799
				3800	if (ret != 0) return ret;
				3801
				3802	/*
				3803	* mark ourself as user of the debug registers for
				3804	* perfmon purposes.
				3805	*/
				3806	ctx->ctx_fl_using_dbreg = 1;
				3807
				3808	/*
				3809	* clear hardware registers to make sure we don't
				3810	* pick up stale state.
				3811	*
				3812	* for a system wide session, we do not use
				3813	* thread.dbr, thread.ibr because this process
				3814	* never leaves the current CPU and the state
				3815	* is shared by all processes running on it
				3816	*/
				3817	if (first_time && can_access_pmu) {
				3818	DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid));
				3819	for (i=0; i < pmu_conf->num_ibrs; i++) {
				3820	ia64_set_ibr(i, 0UL);
				3821	ia64_dv_serialize_instruction();
				3822	}
				3823	ia64_srlz_i();
				3824	for (i=0; i < pmu_conf->num_dbrs; i++) {
				3825	ia64_set_dbr(i, 0UL);
				3826	ia64_dv_serialize_data();
				3827	}
				3828	ia64_srlz_d();
				3829	}
				3830
				3831	/*
				3832	* Now install the values into the registers
				3833	*/
				3834	for (i = 0; i < count; i++, req++) {
				3835
				3836	rnum = req->dbreg_num;
				3837	dbreg.val = req->dbreg_value;
				3838
				3839	ret = -EINVAL;
				3840
				3841	if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) \|\| ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) {
				3842	DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
				3843	rnum, dbreg.val, mode, i, count));
				3844
				3845	goto abort_mission;
				3846	}
				3847
				3848	/*
				3849	* make sure we do not install enabled breakpoint
				3850	*/
				3851	if (rnum & 0x1) {
				3852	if (mode == PFM_CODE_RR)
				3853	dbreg.ibr.ibr_x = 0;
				3854	else
				3855	dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
				3856	}
				3857
				3858	PFM_REG_RETFLAG_SET(req->dbreg_flags, 0);
				3859
				3860	/*
				3861	* Debug registers, just like PMC, can only be modified
				3862	* by a kernel call. Moreover, perfmon() access to those
				3863	* registers are centralized in this routine. The hardware
				3864	* does not modify the value of these registers, therefore,
				3865	* if we save them as they are written, we can avoid having
				3866	* to save them on context switch out. This is made possible
				3867	* by the fact that when perfmon uses debug registers, ptrace()
				3868	* won't be able to modify them concurrently.
				3869	*/
				3870	if (mode == PFM_CODE_RR) {
				3871	CTX_USED_IBR(ctx, rnum);
				3872
				3873	if (can_access_pmu) {
				3874	ia64_set_ibr(rnum, dbreg.val);
				3875	ia64_dv_serialize_instruction();
				3876	}
				3877
				3878	ctx->ctx_ibrs[rnum] = dbreg.val;
				3879
				3880	DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n",
				3881	rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu));
				3882	} else {
				3883	CTX_USED_DBR(ctx, rnum);
				3884
				3885	if (can_access_pmu) {
				3886	ia64_set_dbr(rnum, dbreg.val);
				3887	ia64_dv_serialize_data();
				3888	}
				3889	ctx->ctx_dbrs[rnum] = dbreg.val;
				3890
				3891	DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n",
				3892	rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu));
				3893	}
				3894	}
				3895
				3896	return 0;
				3897
				3898	abort_mission:
				3899	/*
				3900	* in case it was our first attempt, we undo the global modifications
				3901	*/
				3902	if (first_time) {
				3903	LOCK_PFS(flags);
				3904	if (ctx->ctx_fl_system) {
				3905	pfm_sessions.pfs_sys_use_dbregs--;
				3906	}
				3907	UNLOCK_PFS(flags);
				3908	ctx->ctx_fl_using_dbreg = 0;
				3909	}
				3910	/*
				3911	* install error return flag
				3912	*/
				3913	PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL);
				3914
				3915	return ret;
				3916	}
				3917
				3918	static int
				3919	pfm_write_ibrs(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3920	{
				3921	return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs);
				3922	}
				3923
				3924	static int
				3925	pfm_write_dbrs(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3926	{
				3927	return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs);
				3928	}
				3929
				3930	int
				3931	pfm_mod_write_ibrs(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3932	{
				3933	pfm_context_t *ctx;
				3934
				3935	if (req == NULL) return -EINVAL;
				3936
				3937	ctx = GET_PMU_CTX();
				3938
				3939	if (ctx == NULL) return -EINVAL;
				3940
				3941	/*
				3942	* for now limit to current task, which is enough when calling
				3943	* from overflow handler
				3944	*/
				3945	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3946
				3947	return pfm_write_ibrs(ctx, req, nreq, regs);
				3948	}
				3949	EXPORT_SYMBOL(pfm_mod_write_ibrs);
				3950
				3951	int
				3952	pfm_mod_write_dbrs(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3953	{
				3954	pfm_context_t *ctx;
				3955
				3956	if (req == NULL) return -EINVAL;
				3957
				3958	ctx = GET_PMU_CTX();
				3959
				3960	if (ctx == NULL) return -EINVAL;
				3961
				3962	/*
				3963	* for now limit to current task, which is enough when calling
				3964	* from overflow handler
				3965	*/
				3966	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3967
				3968	return pfm_write_dbrs(ctx, req, nreq, regs);
				3969	}
				3970	EXPORT_SYMBOL(pfm_mod_write_dbrs);
				3971
				3972
				3973	static int
				3974	pfm_get_features(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3975	{
				3976	pfarg_features_t req = (pfarg_features_t )arg;
				3977
				3978	req->ft_version = PFM_VERSION;
				3979	return 0;
				3980	}
				3981
				3982	static int
				3983	pfm_stop(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3984	{
				3985	struct pt_regs *tregs;
				3986	struct task_struct *task = PFM_CTX_TASK(ctx);
				3987	int state, is_system;
				3988
				3989	state = ctx->ctx_state;
				3990	is_system = ctx->ctx_fl_system;
				3991
				3992	/*
				3993	* context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE)
				3994	*/
				3995	if (state == PFM_CTX_UNLOADED) return -EINVAL;
				3996
				3997	/*
				3998	* In system wide and when the context is loaded, access can only happen
				3999	* when the caller is running on the CPU being monitored by the session.
				4000	* It does not have to be the owner (ctx_task) of the context per se.
				4001	*/
				4002	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				4003	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				4004	return -EBUSY;
				4005	}
				4006	DPRINT(("task [%d] ctx_state=%d is_system=%d\n",
				4007	PFM_CTX_TASK(ctx)->pid,
				4008	state,
				4009	is_system));
				4010	/*
				4011	* in system mode, we need to update the PMU directly
				4012	* and the user level state of the caller, which may not
				4013	* necessarily be the creator of the context.
				4014	*/
				4015	if (is_system) {
				4016	/*
				4017	* Update local PMU first
				4018	*
				4019	* disable dcr pp
				4020	*/
				4021	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
				4022	ia64_srlz_i();
				4023
				4024	/*
				4025	* update local cpuinfo
				4026	*/
				4027	PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
				4028
				4029	/*
				4030	* stop monitoring, does srlz.i
				4031	*/
				4032	pfm_clear_psr_pp();
				4033
				4034	/*
				4035	* stop monitoring in the caller
				4036	*/
				4037	ia64_psr(regs)->pp = 0;
				4038
				4039	return 0;
				4040	}
				4041	/*
				4042	* per-task mode
				4043	*/
				4044
				4045	if (task == current) {
				4046	/* stop monitoring at kernel level */
				4047	pfm_clear_psr_up();
				4048
				4049	/*
				4050	* stop monitoring at the user level
				4051	*/
				4052	ia64_psr(regs)->up = 0;
				4053	} else {
				4054	tregs = ia64_task_regs(task);
				4055
				4056	/*
				4057	* stop monitoring at the user level
				4058	*/
				4059	ia64_psr(tregs)->up = 0;
				4060
				4061	/*
				4062	* monitoring disabled in kernel at next reschedule
				4063	*/
				4064	ctx->ctx_saved_psr_up = 0;
				4065	DPRINT(("task=[%d]\n", task->pid));
				4066	}
				4067	return 0;
				4068	}
				4069
				4070
				4071	static int
				4072	pfm_start(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4073	{
				4074	struct pt_regs *tregs;
				4075	int state, is_system;
				4076
				4077	state = ctx->ctx_state;
				4078	is_system = ctx->ctx_fl_system;
				4079
				4080	if (state != PFM_CTX_LOADED) return -EINVAL;
				4081
				4082	/*
				4083	* In system wide and when the context is loaded, access can only happen
				4084	* when the caller is running on the CPU being monitored by the session.
				4085	* It does not have to be the owner (ctx_task) of the context per se.
				4086	*/
				4087	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				4088	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				4089	return -EBUSY;
				4090	}
				4091
				4092	/*
				4093	* in system mode, we need to update the PMU directly
				4094	* and the user level state of the caller, which may not
				4095	* necessarily be the creator of the context.
				4096	*/
				4097	if (is_system) {
				4098
				4099	/*
				4100	* set user level psr.pp for the caller
				4101	*/
				4102	ia64_psr(regs)->pp = 1;
				4103
				4104	/*
				4105	* now update the local PMU and cpuinfo
				4106	*/
				4107	PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
				4108
				4109	/*
				4110	* start monitoring at kernel level
				4111	*/
				4112	pfm_set_psr_pp();
				4113
				4114	/* enable dcr pp */
				4115	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) \| IA64_DCR_PP);
				4116	ia64_srlz_i();
				4117
				4118	return 0;
				4119	}
				4120
				4121	/*
				4122	* per-process mode
				4123	*/
				4124
				4125	if (ctx->ctx_task == current) {
				4126
				4127	/* start monitoring at kernel level */
				4128	pfm_set_psr_up();
				4129
				4130	/*
				4131	* activate monitoring at user level
				4132	*/
				4133	ia64_psr(regs)->up = 1;
				4134
				4135	} else {
				4136	tregs = ia64_task_regs(ctx->ctx_task);
				4137
				4138	/*
				4139	* start monitoring at the kernel level the next
				4140	* time the task is scheduled
				4141	*/
				4142	ctx->ctx_saved_psr_up = IA64_PSR_UP;
				4143
				4144	/*
				4145	* activate monitoring at user level
				4146	*/
				4147	ia64_psr(tregs)->up = 1;
				4148	}
				4149	return 0;
				4150	}
				4151
				4152	static int
				4153	pfm_get_pmc_reset(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4154	{
				4155	pfarg_reg_t req = (pfarg_reg_t )arg;
				4156	unsigned int cnum;
				4157	int i;
				4158	int ret = -EINVAL;
				4159
				4160	for (i = 0; i < count; i++, req++) {
				4161
				4162	cnum = req->reg_num;
				4163
				4164	if (!PMC_IS_IMPL(cnum)) goto abort_mission;
				4165
				4166	req->reg_value = PMC_DFL_VAL(cnum);
				4167
				4168	PFM_REG_RETFLAG_SET(req->reg_flags, 0);
				4169
				4170	DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value));
				4171	}
				4172	return 0;
				4173
				4174	abort_mission:
				4175	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				4176	return ret;
				4177	}
				4178
				4179	static int
				4180	pfm_check_task_exist(pfm_context_t *ctx)
				4181	{
				4182	struct task_struct g, t;
				4183	int ret = -ESRCH;
				4184
				4185	read_lock(&tasklist_lock);
				4186
				4187	do_each_thread (g, t) {
				4188	if (t->thread.pfm_context == ctx) {
				4189	ret = 0;
				4190	break;
				4191	}
				4192	} while_each_thread (g, t);
				4193
				4194	read_unlock(&tasklist_lock);
				4195
				4196	DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx));
				4197
				4198	return ret;
				4199	}
				4200
				4201	static int
				4202	pfm_context_load(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4203	{
				4204	struct task_struct *task;
				4205	struct thread_struct *thread;
				4206	struct pfm_context_t *old;
				4207	unsigned long flags;
				4208	#ifndef CONFIG_SMP
				4209	struct task_struct *owner_task = NULL;
				4210	#endif
				4211	pfarg_load_t req = (pfarg_load_t )arg;
				4212	unsigned long pmcs_source, pmds_source;
				4213	int the_cpu;
				4214	int ret = 0;
				4215	int state, is_system, set_dbregs = 0;
				4216
				4217	state = ctx->ctx_state;
				4218	is_system = ctx->ctx_fl_system;
				4219	/*
				4220	* can only load from unloaded or terminated state
				4221	*/
				4222	if (state != PFM_CTX_UNLOADED) {
				4223	DPRINT(("cannot load to [%d], invalid ctx_state=%d\n",
				4224	req->load_pid,
				4225	ctx->ctx_state));
stephane eranian	a5a70b7	2005-04-18 11:42:00 -0700	[diff] [blame]	4226	return -EBUSY;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4227	}
				4228
				4229	DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg));
				4230
				4231	if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) {
				4232	DPRINT(("cannot use blocking mode on self\n"));
				4233	return -EINVAL;
				4234	}
				4235
				4236	ret = pfm_get_task(ctx, req->load_pid, &task);
				4237	if (ret) {
				4238	DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret));
				4239	return ret;
				4240	}
				4241
				4242	ret = -EINVAL;
				4243
				4244	/*
				4245	* system wide is self monitoring only
				4246	*/
				4247	if (is_system && task != current) {
				4248	DPRINT(("system wide is self monitoring only load_pid=%d\n",
				4249	req->load_pid));
				4250	goto error;
				4251	}
				4252
				4253	thread = &task->thread;
				4254
				4255	ret = 0;
				4256	/*
				4257	* cannot load a context which is using range restrictions,
				4258	* into a task that is being debugged.
				4259	*/
				4260	if (ctx->ctx_fl_using_dbreg) {
				4261	if (thread->flags & IA64_THREAD_DBG_VALID) {
				4262	ret = -EBUSY;
				4263	DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid));
				4264	goto error;
				4265	}
				4266	LOCK_PFS(flags);
				4267
				4268	if (is_system) {
				4269	if (pfm_sessions.pfs_ptrace_use_dbregs) {
				4270	DPRINT(("cannot load [%d] dbregs in use\n", task->pid));
				4271	ret = -EBUSY;
				4272	} else {
				4273	pfm_sessions.pfs_sys_use_dbregs++;
				4274	DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs));
				4275	set_dbregs = 1;
				4276	}
				4277	}
				4278
				4279	UNLOCK_PFS(flags);
				4280
				4281	if (ret) goto error;
				4282	}
				4283
				4284	/*
				4285	* SMP system-wide monitoring implies self-monitoring.
				4286	*
				4287	* The programming model expects the task to
				4288	* be pinned on a CPU throughout the session.
				4289	* Here we take note of the current CPU at the
				4290	* time the context is loaded. No call from
				4291	* another CPU will be allowed.
				4292	*
				4293	* The pinning via shed_setaffinity()
				4294	* must be done by the calling task prior
				4295	* to this call.
				4296	*
				4297	* systemwide: keep track of CPU this session is supposed to run on
				4298	*/
				4299	the_cpu = ctx->ctx_cpu = smp_processor_id();
				4300
				4301	ret = -EBUSY;
				4302	/*
				4303	* now reserve the session
				4304	*/
				4305	ret = pfm_reserve_session(current, is_system, the_cpu);
				4306	if (ret) goto error;
				4307
				4308	/*
				4309	* task is necessarily stopped at this point.
				4310	*
				4311	* If the previous context was zombie, then it got removed in
				4312	* pfm_save_regs(). Therefore we should not see it here.
				4313	* If we see a context, then this is an active context
				4314	*
				4315	* XXX: needs to be atomic
				4316	*/
				4317	DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n",
				4318	thread->pfm_context, ctx));
				4319
stephane.eranian@hp.com	6bf11e8	2005-07-28 05:18:00 -0700	[diff] [blame]	4320	ret = -EBUSY;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4321	old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *));
				4322	if (old != NULL) {
				4323	DPRINT(("load_pid [%d] already has a context\n", req->load_pid));
				4324	goto error_unres;
				4325	}
				4326
				4327	pfm_reset_msgq(ctx);
				4328
				4329	ctx->ctx_state = PFM_CTX_LOADED;
				4330
				4331	/*
				4332	* link context to task
				4333	*/
				4334	ctx->ctx_task = task;
				4335
				4336	if (is_system) {
				4337	/*
				4338	* we load as stopped
				4339	*/
				4340	PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
				4341	PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
				4342
				4343	if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
				4344	} else {
				4345	thread->flags \|= IA64_THREAD_PM_VALID;
				4346	}
				4347
				4348	/*
				4349	* propagate into thread-state
				4350	*/
				4351	pfm_copy_pmds(task, ctx);
				4352	pfm_copy_pmcs(task, ctx);
				4353
				4354	pmcs_source = thread->pmcs;
				4355	pmds_source = thread->pmds;
				4356
				4357	/*
				4358	* always the case for system-wide
				4359	*/
				4360	if (task == current) {
				4361
				4362	if (is_system == 0) {
				4363
				4364	/* allow user level control */
				4365	ia64_psr(regs)->sp = 0;
				4366	DPRINT(("clearing psr.sp for [%d]\n", task->pid));
				4367
				4368	SET_LAST_CPU(ctx, smp_processor_id());
				4369	INC_ACTIVATION();
				4370	SET_ACTIVATION(ctx);
				4371	#ifndef CONFIG_SMP
				4372	/*
				4373	* push the other task out, if any
				4374	*/
				4375	owner_task = GET_PMU_OWNER();
				4376	if (owner_task) pfm_lazy_save_regs(owner_task);
				4377	#endif
				4378	}
				4379	/*
				4380	* load all PMD from ctx to PMU (as opposed to thread state)
				4381	* restore all PMC from ctx to PMU
				4382	*/
				4383	pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]);
				4384	pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]);
				4385
				4386	ctx->ctx_reload_pmcs[0] = 0UL;
				4387	ctx->ctx_reload_pmds[0] = 0UL;
				4388
				4389	/*
				4390	* guaranteed safe by earlier check against DBG_VALID
				4391	*/
				4392	if (ctx->ctx_fl_using_dbreg) {
				4393	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				4394	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				4395	}
				4396	/*
				4397	* set new ownership
				4398	*/
				4399	SET_PMU_OWNER(task, ctx);
				4400
				4401	DPRINT(("context loaded on PMU for [%d]\n", task->pid));
				4402	} else {
				4403	/*
				4404	* when not current, task MUST be stopped, so this is safe
				4405	*/
				4406	regs = ia64_task_regs(task);
				4407
				4408	/* force a full reload */
				4409	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
				4410	SET_LAST_CPU(ctx, -1);
				4411
				4412	/* initial saved psr (stopped) */
				4413	ctx->ctx_saved_psr_up = 0UL;
				4414	ia64_psr(regs)->up = ia64_psr(regs)->pp = 0;
				4415	}
				4416
				4417	ret = 0;
				4418
				4419	error_unres:
				4420	if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu);
				4421	error:
				4422	/*
				4423	* we must undo the dbregs setting (for system-wide)
				4424	*/
				4425	if (ret && set_dbregs) {
				4426	LOCK_PFS(flags);
				4427	pfm_sessions.pfs_sys_use_dbregs--;
				4428	UNLOCK_PFS(flags);
				4429	}
				4430	/*
				4431	* release task, there is now a link with the context
				4432	*/
				4433	if (is_system == 0 && task != current) {
				4434	pfm_put_task(task);
				4435
				4436	if (ret == 0) {
				4437	ret = pfm_check_task_exist(ctx);
				4438	if (ret) {
				4439	ctx->ctx_state = PFM_CTX_UNLOADED;
				4440	ctx->ctx_task = NULL;
				4441	}
				4442	}
				4443	}
				4444	return ret;
				4445	}
				4446
				4447	/*
				4448	* in this function, we do not need to increase the use count
				4449	* for the task via get_task_struct(), because we hold the
				4450	* context lock. If the task were to disappear while having
				4451	* a context attached, it would go through pfm_exit_thread()
				4452	* which also grabs the context lock and would therefore be blocked
				4453	* until we are here.
				4454	*/
				4455	static void pfm_flush_pmds(struct task_struct , pfm_context_t ctx);
				4456
				4457	static int
				4458	pfm_context_unload(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4459	{
				4460	struct task_struct *task = PFM_CTX_TASK(ctx);
				4461	struct pt_regs *tregs;
				4462	int prev_state, is_system;
				4463	int ret;
				4464
				4465	DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
				4466
				4467	prev_state = ctx->ctx_state;
				4468	is_system = ctx->ctx_fl_system;
				4469
				4470	/*
				4471	* unload only when necessary
				4472	*/
				4473	if (prev_state == PFM_CTX_UNLOADED) {
				4474	DPRINT(("ctx_state=%d, nothing to do\n", prev_state));
				4475	return 0;
				4476	}
				4477
				4478	/*
				4479	* clear psr and dcr bits
				4480	*/
				4481	ret = pfm_stop(ctx, NULL, 0, regs);
				4482	if (ret) return ret;
				4483
				4484	ctx->ctx_state = PFM_CTX_UNLOADED;
				4485
				4486	/*
				4487	* in system mode, we need to update the PMU directly
				4488	* and the user level state of the caller, which may not
				4489	* necessarily be the creator of the context.
				4490	*/
				4491	if (is_system) {
				4492
				4493	/*
				4494	* Update cpuinfo
				4495	*
				4496	* local PMU is taken care of in pfm_stop()
				4497	*/
				4498	PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
				4499	PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
				4500
				4501	/*
				4502	* save PMDs in context
				4503	* release ownership
				4504	*/
				4505	pfm_flush_pmds(current, ctx);
				4506
				4507	/*
				4508	* at this point we are done with the PMU
				4509	* so we can unreserve the resource.
				4510	*/
				4511	if (prev_state != PFM_CTX_ZOMBIE)
				4512	pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu);
				4513
				4514	/*
				4515	* disconnect context from task
				4516	*/
				4517	task->thread.pfm_context = NULL;
				4518	/*
				4519	* disconnect task from context
				4520	*/
				4521	ctx->ctx_task = NULL;
				4522
				4523	/*
				4524	* There is nothing more to cleanup here.
				4525	*/
				4526	return 0;
				4527	}
				4528
				4529	/*
				4530	* per-task mode
				4531	*/
				4532	tregs = task == current ? regs : ia64_task_regs(task);
				4533
				4534	if (task == current) {
				4535	/*
				4536	* cancel user level control
				4537	*/
				4538	ia64_psr(regs)->sp = 1;
				4539
				4540	DPRINT(("setting psr.sp for [%d]\n", task->pid));
				4541	}
				4542	/*
				4543	* save PMDs to context
				4544	* release ownership
				4545	*/
				4546	pfm_flush_pmds(task, ctx);
				4547
				4548	/*
				4549	* at this point we are done with the PMU
				4550	* so we can unreserve the resource.
				4551	*
				4552	* when state was ZOMBIE, we have already unreserved.
				4553	*/
				4554	if (prev_state != PFM_CTX_ZOMBIE)
				4555	pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu);
				4556
				4557	/*
				4558	* reset activation counter and psr
				4559	*/
				4560	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
				4561	SET_LAST_CPU(ctx, -1);
				4562
				4563	/*
				4564	* PMU state will not be restored
				4565	*/
				4566	task->thread.flags &= ~IA64_THREAD_PM_VALID;
				4567
				4568	/*
				4569	* break links between context and task
				4570	*/
				4571	task->thread.pfm_context = NULL;
				4572	ctx->ctx_task = NULL;
				4573
				4574	PFM_SET_WORK_PENDING(task, 0);
				4575
				4576	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
				4577	ctx->ctx_fl_can_restart = 0;
				4578	ctx->ctx_fl_going_zombie = 0;
				4579
				4580	DPRINT(("disconnected [%d] from context\n", task->pid));
				4581
				4582	return 0;
				4583	}
				4584
				4585
				4586	/*
				4587	* called only from exit_thread(): task == current
				4588	* we come here only if current has a context attached (loaded or masked)
				4589	*/
				4590	void
				4591	pfm_exit_thread(struct task_struct *task)
				4592	{
				4593	pfm_context_t *ctx;
				4594	unsigned long flags;
				4595	struct pt_regs *regs = ia64_task_regs(task);
				4596	int ret, state;
				4597	int free_ok = 0;
				4598
				4599	ctx = PFM_GET_CTX(task);
				4600
				4601	PROTECT_CTX(ctx, flags);
				4602
				4603	DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid));
				4604
				4605	state = ctx->ctx_state;
				4606	switch(state) {
				4607	case PFM_CTX_UNLOADED:
				4608	/*
				4609	* only comes to thios function if pfm_context is not NULL, i.e., cannot
				4610	* be in unloaded state
				4611	*/
				4612	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid);
				4613	break;
				4614	case PFM_CTX_LOADED:
				4615	case PFM_CTX_MASKED:
				4616	ret = pfm_context_unload(ctx, NULL, 0, regs);
				4617	if (ret) {
				4618	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
				4619	}
				4620	DPRINT(("ctx unloaded for current state was %d\n", state));
				4621
				4622	pfm_end_notify_user(ctx);
				4623	break;
				4624	case PFM_CTX_ZOMBIE:
				4625	ret = pfm_context_unload(ctx, NULL, 0, regs);
				4626	if (ret) {
				4627	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
				4628	}
				4629	free_ok = 1;
				4630	break;
				4631	default:
				4632	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state);
				4633	break;
				4634	}
				4635	UNPROTECT_CTX(ctx, flags);
				4636
				4637	{ u64 psr = pfm_get_psr();
				4638	BUG_ON(psr & (IA64_PSR_UP\|IA64_PSR_PP));
				4639	BUG_ON(GET_PMU_OWNER());
				4640	BUG_ON(ia64_psr(regs)->up);
				4641	BUG_ON(ia64_psr(regs)->pp);
				4642	}
				4643
				4644	/*
				4645	* All memory free operations (especially for vmalloc'ed memory)
				4646	* MUST be done with interrupts ENABLED.
				4647	*/
				4648	if (free_ok) pfm_context_free(ctx);
				4649	}
				4650
				4651	/*
				4652	* functions MUST be listed in the increasing order of their index (see perfmon.h)
				4653	*/
				4654	#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz }
				4655	#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL }
				4656	#define PFM_CMD_PCLRWS (PFM_CMD_FD\|PFM_CMD_ARG_RW\|PFM_CMD_STOP)
				4657	#define PFM_CMD_PCLRW (PFM_CMD_FD\|PFM_CMD_ARG_RW)
				4658	#define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL}
				4659
				4660	static pfm_cmd_desc_t pfm_cmd_tab[]={
				4661	/* 0 */PFM_CMD_NONE,
				4662	/* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4663	/* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4664	/* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4665	/* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS),
				4666	/* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS),
				4667	/* 6 */PFM_CMD_NONE,
				4668	/* 7 */PFM_CMD_NONE,
				4669	/* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize),
				4670	/* 9 */PFM_CMD_NONE,
				4671	/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW),
				4672	/* 11 */PFM_CMD_NONE,
				4673	/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL),
				4674	/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL),
				4675	/* 14 */PFM_CMD_NONE,
				4676	/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4677	/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL),
				4678	/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS),
				4679	/* 18 */PFM_CMD_NONE,
				4680	/* 19 */PFM_CMD_NONE,
				4681	/* 20 */PFM_CMD_NONE,
				4682	/* 21 */PFM_CMD_NONE,
				4683	/* 22 */PFM_CMD_NONE,
				4684	/* 23 */PFM_CMD_NONE,
				4685	/* 24 */PFM_CMD_NONE,
				4686	/* 25 */PFM_CMD_NONE,
				4687	/* 26 */PFM_CMD_NONE,
				4688	/* 27 */PFM_CMD_NONE,
				4689	/* 28 */PFM_CMD_NONE,
				4690	/* 29 */PFM_CMD_NONE,
				4691	/* 30 */PFM_CMD_NONE,
				4692	/* 31 */PFM_CMD_NONE,
				4693	/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL),
				4694	/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL)
				4695	};
				4696	#define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
				4697
				4698	static int
				4699	pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
				4700	{
				4701	struct task_struct *task;
				4702	int state, old_state;
				4703
				4704	recheck:
				4705	state = ctx->ctx_state;
				4706	task = ctx->ctx_task;
				4707
				4708	if (task == NULL) {
				4709	DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state));
				4710	return 0;
				4711	}
				4712
				4713	DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
				4714	ctx->ctx_fd,
				4715	state,
				4716	task->pid,
				4717	task->state, PFM_CMD_STOPPED(cmd)));
				4718
				4719	/*
				4720	* self-monitoring always ok.
				4721	*
				4722	* for system-wide the caller can either be the creator of the
				4723	* context (to one to which the context is attached to) OR
				4724	* a task running on the same CPU as the session.
				4725	*/
				4726	if (task == current \|\| ctx->ctx_fl_system) return 0;
				4727
				4728	/*
stephane eranian	a5a70b7	2005-04-18 11:42:00 -0700	[diff] [blame]	4729	* we are monitoring another thread
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4730	*/
stephane eranian	a5a70b7	2005-04-18 11:42:00 -0700	[diff] [blame]	4731	switch(state) {
				4732	case PFM_CTX_UNLOADED:
				4733	/*
				4734	* if context is UNLOADED we are safe to go
				4735	*/
				4736	return 0;
				4737	case PFM_CTX_ZOMBIE:
				4738	/*
				4739	* no command can operate on a zombie context
				4740	*/
				4741	DPRINT(("cmd %d state zombie cannot operate on context\n", cmd));
				4742	return -EINVAL;
				4743	case PFM_CTX_MASKED:
				4744	/*
				4745	* PMU state has been saved to software even though
				4746	* the thread may still be running.
				4747	*/
				4748	if (cmd != PFM_UNLOAD_CONTEXT) return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4749	}
				4750
				4751	/*
				4752	* context is LOADED or MASKED. Some commands may need to have
				4753	* the task stopped.
				4754	*
				4755	* We could lift this restriction for UP but it would mean that
				4756	* the user has no guarantee the task would not run between
				4757	* two successive calls to perfmonctl(). That's probably OK.
				4758	* If this user wants to ensure the task does not run, then
				4759	* the task must be stopped.
				4760	*/
				4761	if (PFM_CMD_STOPPED(cmd)) {
				4762	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
				4763	DPRINT(("[%d] task not in stopped state\n", task->pid));
				4764	return -EBUSY;
				4765	}
				4766	/*
				4767	* task is now stopped, wait for ctxsw out
				4768	*
				4769	* This is an interesting point in the code.
				4770	* We need to unprotect the context because
				4771	* the pfm_save_regs() routines needs to grab
				4772	* the same lock. There are danger in doing
				4773	* this because it leaves a window open for
				4774	* another task to get access to the context
				4775	* and possibly change its state. The one thing
				4776	* that is not possible is for the context to disappear
				4777	* because we are protected by the VFS layer, i.e.,
				4778	* get_fd()/put_fd().
				4779	*/
				4780	old_state = state;
				4781
				4782	UNPROTECT_CTX(ctx, flags);
				4783
				4784	wait_task_inactive(task);
				4785
				4786	PROTECT_CTX(ctx, flags);
				4787
				4788	/*
				4789	* we must recheck to verify if state has changed
				4790	*/
				4791	if (ctx->ctx_state != old_state) {
				4792	DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state));
				4793	goto recheck;
				4794	}
				4795	}
				4796	return 0;
				4797	}
				4798
				4799	/*
				4800	* system-call entry point (must return long)
				4801	*/
				4802	asmlinkage long
				4803	sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
				4804	{
				4805	struct file *file = NULL;
				4806	pfm_context_t *ctx = NULL;
				4807	unsigned long flags = 0UL;
				4808	void *args_k = NULL;
				4809	long ret; /* will expand int return types */
				4810	size_t base_sz, sz, xtra_sz = 0;
				4811	int narg, completed_args = 0, call_made = 0, cmd_flags;
				4812	int (func)(pfm_context_t ctx, void arg, int count, struct pt_regs regs);
				4813	int (getsize)(void arg, size_t *sz);
				4814	#define PFM_MAX_ARGSIZE 4096
				4815
				4816	/*
				4817	* reject any call if perfmon was disabled at initialization
				4818	*/
				4819	if (unlikely(pmu_conf == NULL)) return -ENOSYS;
				4820
				4821	if (unlikely(cmd < 0 \|\| cmd >= PFM_CMD_COUNT)) {
				4822	DPRINT(("invalid cmd=%d\n", cmd));
				4823	return -EINVAL;
				4824	}
				4825
				4826	func = pfm_cmd_tab[cmd].cmd_func;
				4827	narg = pfm_cmd_tab[cmd].cmd_narg;
				4828	base_sz = pfm_cmd_tab[cmd].cmd_argsize;
				4829	getsize = pfm_cmd_tab[cmd].cmd_getsize;
				4830	cmd_flags = pfm_cmd_tab[cmd].cmd_flags;
				4831
				4832	if (unlikely(func == NULL)) {
				4833	DPRINT(("invalid cmd=%d\n", cmd));
				4834	return -EINVAL;
				4835	}
				4836
				4837	DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n",
				4838	PFM_CMD_NAME(cmd),
				4839	cmd,
				4840	narg,
				4841	base_sz,
				4842	count));
				4843
				4844	/*
				4845	* check if number of arguments matches what the command expects
				4846	*/
				4847	if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) \|\| (narg > 0 && narg != count)))
				4848	return -EINVAL;
				4849
				4850	restart_args:
				4851	sz = xtra_sz + base_sz*count;
				4852	/*
				4853	* limit abuse to min page size
				4854	*/
				4855	if (unlikely(sz > PFM_MAX_ARGSIZE)) {
				4856	printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz);
				4857	return -E2BIG;
				4858	}
				4859
				4860	/*
				4861	* allocate default-sized argument buffer
				4862	*/
				4863	if (likely(count && args_k == NULL)) {
				4864	args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL);
				4865	if (args_k == NULL) return -ENOMEM;
				4866	}
				4867
				4868	ret = -EFAULT;
				4869
				4870	/*
				4871	* copy arguments
				4872	*
				4873	* assume sz = 0 for command without parameters
				4874	*/
				4875	if (sz && copy_from_user(args_k, arg, sz)) {
				4876	DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg));
				4877	goto error_args;
				4878	}
				4879
				4880	/*
				4881	* check if command supports extra parameters
				4882	*/
				4883	if (completed_args == 0 && getsize) {
				4884	/*
				4885	* get extra parameters size (based on main argument)
				4886	*/
				4887	ret = (*getsize)(args_k, &xtra_sz);
				4888	if (ret) goto error_args;
				4889
				4890	completed_args = 1;
				4891
				4892	DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz));
				4893
				4894	/* retry if necessary */
				4895	if (likely(xtra_sz)) goto restart_args;
				4896	}
				4897
				4898	if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd;
				4899
				4900	ret = -EBADF;
				4901
				4902	file = fget(fd);
				4903	if (unlikely(file == NULL)) {
				4904	DPRINT(("invalid fd %d\n", fd));
				4905	goto error_args;
				4906	}
				4907	if (unlikely(PFM_IS_FILE(file) == 0)) {
				4908	DPRINT(("fd %d not related to perfmon\n", fd));
				4909	goto error_args;
				4910	}
				4911
				4912	ctx = (pfm_context_t *)file->private_data;
				4913	if (unlikely(ctx == NULL)) {
				4914	DPRINT(("no context for fd %d\n", fd));
				4915	goto error_args;
				4916	}
				4917	prefetch(&ctx->ctx_state);
				4918
				4919	PROTECT_CTX(ctx, flags);
				4920
				4921	/*
				4922	* check task is stopped
				4923	*/
				4924	ret = pfm_check_task_state(ctx, cmd, flags);
				4925	if (unlikely(ret)) goto abort_locked;
				4926
				4927	skip_fd:
				4928	ret = (*func)(ctx, args_k, count, ia64_task_regs(current));
				4929
				4930	call_made = 1;
				4931
				4932	abort_locked:
				4933	if (likely(ctx)) {
				4934	DPRINT(("context unlocked\n"));
				4935	UNPROTECT_CTX(ctx, flags);
				4936	fput(file);
				4937	}
				4938
				4939	/* copy argument back to user, if needed */
				4940	if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
				4941
				4942	error_args:
Jesper Juhl	b2325fe	2005-11-07 01:01:35 -0800	[diff] [blame]	4943	kfree(args_k);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4944
				4945	DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret));
				4946
				4947	return ret;
				4948	}
				4949
				4950	static void
				4951	pfm_resume_after_ovfl(pfm_context_t ctx, unsigned long ovfl_regs, struct pt_regs regs)
				4952	{
				4953	pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt;
				4954	pfm_ovfl_ctrl_t rst_ctrl;
				4955	int state;
				4956	int ret = 0;
				4957
				4958	state = ctx->ctx_state;
				4959	/*
				4960	* Unlock sampling buffer and reset index atomically
				4961	* XXX: not really needed when blocking
				4962	*/
				4963	if (CTX_HAS_SMPL(ctx)) {
				4964
				4965	rst_ctrl.bits.mask_monitoring = 0;
				4966	rst_ctrl.bits.reset_ovfl_pmds = 0;
				4967
				4968	if (state == PFM_CTX_LOADED)
				4969	ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				4970	else
				4971	ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				4972	} else {
				4973	rst_ctrl.bits.mask_monitoring = 0;
				4974	rst_ctrl.bits.reset_ovfl_pmds = 1;
				4975	}
				4976
				4977	if (ret == 0) {
				4978	if (rst_ctrl.bits.reset_ovfl_pmds) {
				4979	pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET);
				4980	}
				4981	if (rst_ctrl.bits.mask_monitoring == 0) {
				4982	DPRINT(("resuming monitoring\n"));
				4983	if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current);
				4984	} else {
				4985	DPRINT(("stopping monitoring\n"));
				4986	//pfm_stop_monitoring(current, regs);
				4987	}
				4988	ctx->ctx_state = PFM_CTX_LOADED;
				4989	}
				4990	}
				4991
				4992	/*
				4993	* context MUST BE LOCKED when calling
				4994	* can only be called for current
				4995	*/
				4996	static void
				4997	pfm_context_force_terminate(pfm_context_t ctx, struct pt_regs regs)
				4998	{
				4999	int ret;
				5000
				5001	DPRINT(("entering for [%d]\n", current->pid));
				5002
				5003	ret = pfm_context_unload(ctx, NULL, 0, regs);
				5004	if (ret) {
				5005	printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret);
				5006	}
				5007
				5008	/*
				5009	* and wakeup controlling task, indicating we are now disconnected
				5010	*/
				5011	wake_up_interruptible(&ctx->ctx_zombieq);
				5012
				5013	/*
				5014	* given that context is still locked, the controlling
				5015	* task will only get access when we return from
				5016	* pfm_handle_work().
				5017	*/
				5018	}
				5019
				5020	static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5021	/*
				5022	* pfm_handle_work() can be called with interrupts enabled
				5023	* (TIF_NEED_RESCHED) or disabled. The down_interruptible
				5024	* call may sleep, therefore we must re-enable interrupts
				5025	* to avoid deadlocks. It is safe to do so because this function
				5026	* is called ONLY when returning to user level (PUStk=1), in which case
				5027	* there is no risk of kernel stack overflow due to deep
				5028	* interrupt nesting.
				5029	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5030	void
				5031	pfm_handle_work(void)
				5032	{
				5033	pfm_context_t *ctx;
				5034	struct pt_regs *regs;
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5035	unsigned long flags, dummy_flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5036	unsigned long ovfl_regs;
				5037	unsigned int reason;
				5038	int ret;
				5039
				5040	ctx = PFM_GET_CTX(current);
				5041	if (ctx == NULL) {
				5042	printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
				5043	return;
				5044	}
				5045
				5046	PROTECT_CTX(ctx, flags);
				5047
				5048	PFM_SET_WORK_PENDING(current, 0);
				5049
				5050	pfm_clear_task_notify();
				5051
				5052	regs = ia64_task_regs(current);
				5053
				5054	/*
				5055	* extract reason for being here and clear
				5056	*/
				5057	reason = ctx->ctx_fl_trap_reason;
				5058	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
				5059	ovfl_regs = ctx->ctx_ovfl_regs[0];
				5060
				5061	DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state));
				5062
				5063	/*
				5064	* must be done before we check for simple-reset mode
				5065	*/
				5066	if (ctx->ctx_fl_going_zombie \|\| ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie;
				5067
				5068
				5069	//if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
				5070	if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
				5071
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5072	/*
				5073	* restore interrupt mask to what it was on entry.
				5074	* Could be enabled/diasbled.
				5075	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5076	UNPROTECT_CTX(ctx, flags);
				5077
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5078	/*
				5079	* force interrupt enable because of down_interruptible()
				5080	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5081	local_irq_enable();
				5082
				5083	DPRINT(("before block sleeping\n"));
				5084
				5085	/*
				5086	* may go through without blocking on SMP systems
				5087	* if restart has been received already by the time we call down()
				5088	*/
				5089	ret = down_interruptible(&ctx->ctx_restart_sem);
				5090
				5091	DPRINT(("after block sleeping ret=%d\n", ret));
				5092
				5093	/*
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5094	* lock context and mask interrupts again
				5095	* We save flags into a dummy because we may have
				5096	* altered interrupts mask compared to entry in this
				5097	* function.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5098	*/
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5099	PROTECT_CTX(ctx, dummy_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5100
				5101	/*
				5102	* we need to read the ovfl_regs only after wake-up
				5103	* because we may have had pfm_write_pmds() in between
				5104	* and that can changed PMD values and therefore
				5105	* ovfl_regs is reset for these new PMD values.
				5106	*/
				5107	ovfl_regs = ctx->ctx_ovfl_regs[0];
				5108
				5109	if (ctx->ctx_fl_going_zombie) {
				5110	do_zombie:
				5111	DPRINT(("context is zombie, bailing out\n"));
				5112	pfm_context_force_terminate(ctx, regs);
				5113	goto nothing_to_do;
				5114	}
				5115	/*
				5116	* in case of interruption of down() we don't restart anything
				5117	*/
				5118	if (ret < 0) goto nothing_to_do;
				5119
				5120	skip_blocking:
				5121	pfm_resume_after_ovfl(ctx, ovfl_regs, regs);
				5122	ctx->ctx_ovfl_regs[0] = 0UL;
				5123
				5124	nothing_to_do:
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5125	/*
				5126	* restore flags as they were upon entry
				5127	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5128	UNPROTECT_CTX(ctx, flags);
				5129	}
				5130
				5131	static int
				5132	pfm_notify_user(pfm_context_t ctx, pfm_msg_t msg)
				5133	{
				5134	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
				5135	DPRINT(("ignoring overflow notification, owner is zombie\n"));
				5136	return 0;
				5137	}
				5138
				5139	DPRINT(("waking up somebody\n"));
				5140
				5141	if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait);
				5142
				5143	/*
				5144	* safe, we are not in intr handler, nor in ctxsw when
				5145	* we come here
				5146	*/
				5147	kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN);
				5148
				5149	return 0;
				5150	}
				5151
				5152	static int
				5153	pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds)
				5154	{
				5155	pfm_msg_t *msg = NULL;
				5156
				5157	if (ctx->ctx_fl_no_msg == 0) {
				5158	msg = pfm_get_new_msg(ctx);
				5159	if (msg == NULL) {
				5160	printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n");
				5161	return -1;
				5162	}
				5163
				5164	msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL;
				5165	msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd;
				5166	msg->pfm_ovfl_msg.msg_active_set = 0;
				5167	msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds;
				5168	msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL;
				5169	msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL;
				5170	msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL;
				5171	msg->pfm_ovfl_msg.msg_tstamp = 0UL;
				5172	}
				5173
				5174	DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n",
				5175	msg,
				5176	ctx->ctx_fl_no_msg,
				5177	ctx->ctx_fd,
				5178	ovfl_pmds));
				5179
				5180	return pfm_notify_user(ctx, msg);
				5181	}
				5182
				5183	static int
				5184	pfm_end_notify_user(pfm_context_t *ctx)
				5185	{
				5186	pfm_msg_t *msg;
				5187
				5188	msg = pfm_get_new_msg(ctx);
				5189	if (msg == NULL) {
				5190	printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n");
				5191	return -1;
				5192	}
				5193	/* no leak */
				5194	memset(msg, 0, sizeof(*msg));
				5195
				5196	msg->pfm_end_msg.msg_type = PFM_MSG_END;
				5197	msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd;
				5198	msg->pfm_ovfl_msg.msg_tstamp = 0UL;
				5199
				5200	DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n",
				5201	msg,
				5202	ctx->ctx_fl_no_msg,
				5203	ctx->ctx_fd));
				5204
				5205	return pfm_notify_user(ctx, msg);
				5206	}
				5207
				5208	/*
				5209	* main overflow processing routine.
				5210	* it can be called from the interrupt path or explicitely during the context switch code
				5211	*/
				5212	static void
				5213	pfm_overflow_handler(struct task_struct task, pfm_context_t ctx, u64 pmc0, struct pt_regs *regs)
				5214	{
				5215	pfm_ovfl_arg_t *ovfl_arg;
				5216	unsigned long mask;
				5217	unsigned long old_val, ovfl_val, new_val;
				5218	unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds;
				5219	unsigned long tstamp;
				5220	pfm_ovfl_ctrl_t ovfl_ctrl;
				5221	unsigned int i, has_smpl;
				5222	int must_notify = 0;
				5223
				5224	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring;
				5225
				5226	/*
				5227	* sanity test. Should never happen
				5228	*/
				5229	if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check;
				5230
				5231	tstamp = ia64_get_itc();
				5232	mask = pmc0 >> PMU_FIRST_COUNTER;
				5233	ovfl_val = pmu_conf->ovfl_val;
				5234	has_smpl = CTX_HAS_SMPL(ctx);
				5235
				5236	DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s "
				5237	"used_pmds=0x%lx\n",
				5238	pmc0,
				5239	task ? task->pid: -1,
				5240	(regs ? regs->cr_iip : 0),
				5241	CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
				5242	ctx->ctx_used_pmds[0]));
				5243
				5244
				5245	/*
				5246	* first we update the virtual counters
				5247	* assume there was a prior ia64_srlz_d() issued
				5248	*/
				5249	for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
				5250
				5251	/* skip pmd which did not overflow */
				5252	if ((mask & 0x1) == 0) continue;
				5253
				5254	/*
				5255	* Note that the pmd is not necessarily 0 at this point as qualified events
				5256	* may have happened before the PMU was frozen. The residual count is not
				5257	* taken into consideration here but will be with any read of the pmd via
				5258	* pfm_read_pmds().
				5259	*/
				5260	old_val = new_val = ctx->ctx_pmds[i].val;
				5261	new_val += 1 + ovfl_val;
				5262	ctx->ctx_pmds[i].val = new_val;
				5263
				5264	/*
				5265	* check for overflow condition
				5266	*/
				5267	if (likely(old_val > new_val)) {
				5268	ovfl_pmds \|= 1UL << i;
				5269	if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify \|= 1UL << i;
				5270	}
				5271
				5272	DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
				5273	i,
				5274	new_val,
				5275	old_val,
				5276	ia64_get_pmd(i) & ovfl_val,
				5277	ovfl_pmds,
				5278	ovfl_notify));
				5279	}
				5280
				5281	/*
				5282	* there was no 64-bit overflow, nothing else to do
				5283	*/
				5284	if (ovfl_pmds == 0UL) return;
				5285
				5286	/*
				5287	* reset all control bits
				5288	*/
				5289	ovfl_ctrl.val = 0;
				5290	reset_pmds = 0UL;
				5291
				5292	/*
				5293	* if a sampling format module exists, then we "cache" the overflow by
				5294	* calling the module's handler() routine.
				5295	*/
				5296	if (has_smpl) {
				5297	unsigned long start_cycles, end_cycles;
				5298	unsigned long pmd_mask;
				5299	int j, k, ret = 0;
				5300	int this_cpu = smp_processor_id();
				5301
				5302	pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
				5303	ovfl_arg = &ctx->ctx_ovfl_arg;
				5304
				5305	prefetch(ctx->ctx_smpl_hdr);
				5306
				5307	for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) {
				5308
				5309	mask = 1UL << i;
				5310
				5311	if ((pmd_mask & 0x1) == 0) continue;
				5312
				5313	ovfl_arg->ovfl_pmd = (unsigned char )i;
				5314	ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0;
				5315	ovfl_arg->active_set = 0;
				5316	ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */
				5317	ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
				5318
				5319	ovfl_arg->pmd_value = ctx->ctx_pmds[i].val;
				5320	ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval;
				5321	ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid;
				5322
				5323	/*
				5324	* copy values of pmds of interest. Sampling format may copy them
				5325	* into sampling buffer.
				5326	*/
				5327	if (smpl_pmds) {
				5328	for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) {
				5329	if ((smpl_pmds & 0x1) == 0) continue;
				5330	ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
				5331	DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1]));
				5332	}
				5333	}
				5334
				5335	pfm_stats[this_cpu].pfm_smpl_handler_calls++;
				5336
				5337	start_cycles = ia64_get_itc();
				5338
				5339	/*
				5340	* call custom buffer format record (handler) routine
				5341	*/
				5342	ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp);
				5343
				5344	end_cycles = ia64_get_itc();
				5345
				5346	/*
				5347	* For those controls, we take the union because they have
				5348	* an all or nothing behavior.
				5349	*/
				5350	ovfl_ctrl.bits.notify_user \|= ovfl_arg->ovfl_ctrl.bits.notify_user;
				5351	ovfl_ctrl.bits.block_task \|= ovfl_arg->ovfl_ctrl.bits.block_task;
				5352	ovfl_ctrl.bits.mask_monitoring \|= ovfl_arg->ovfl_ctrl.bits.mask_monitoring;
				5353	/*
				5354	* build the bitmask of pmds to reset now
				5355	*/
				5356	if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds \|= mask;
				5357
				5358	pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles;
				5359	}
				5360	/*
				5361	* when the module cannot handle the rest of the overflows, we abort right here
				5362	*/
				5363	if (ret && pmd_mask) {
				5364	DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n",
				5365	pmd_mask<<PMU_FIRST_COUNTER));
				5366	}
				5367	/*
				5368	* remove the pmds we reset now from the set of pmds to reset in pfm_restart()
				5369	*/
				5370	ovfl_pmds &= ~reset_pmds;
				5371	} else {
				5372	/*
				5373	* when no sampling module is used, then the default
				5374	* is to notify on overflow if requested by user
				5375	*/
				5376	ovfl_ctrl.bits.notify_user = ovfl_notify ? 1 : 0;
				5377	ovfl_ctrl.bits.block_task = ovfl_notify ? 1 : 0;
				5378	ovfl_ctrl.bits.mask_monitoring = ovfl_notify ? 1 : 0; /* XXX: change for saturation */
				5379	ovfl_ctrl.bits.reset_ovfl_pmds = ovfl_notify ? 0 : 1;
				5380	/*
				5381	* if needed, we reset all overflowed pmds
				5382	*/
				5383	if (ovfl_notify == 0) reset_pmds = ovfl_pmds;
				5384	}
				5385
				5386	DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds));
				5387
				5388	/*
				5389	* reset the requested PMD registers using the short reset values
				5390	*/
				5391	if (reset_pmds) {
				5392	unsigned long bm = reset_pmds;
				5393	pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET);
				5394	}
				5395
				5396	if (ovfl_notify && ovfl_ctrl.bits.notify_user) {
				5397	/*
				5398	* keep track of what to reset when unblocking
				5399	*/
				5400	ctx->ctx_ovfl_regs[0] = ovfl_pmds;
				5401
				5402	/*
				5403	* check for blocking context
				5404	*/
				5405	if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) {
				5406
				5407	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK;
				5408
				5409	/*
				5410	* set the perfmon specific checking pending work for the task
				5411	*/
				5412	PFM_SET_WORK_PENDING(task, 1);
				5413
				5414	/*
				5415	* when coming from ctxsw, current still points to the
				5416	* previous task, therefore we must work with task and not current.
				5417	*/
				5418	pfm_set_task_notify(task);
				5419	}
				5420	/*
				5421	* defer until state is changed (shorten spin window). the context is locked
				5422	* anyway, so the signal receiver would come spin for nothing.
				5423	*/
				5424	must_notify = 1;
				5425	}
				5426
				5427	DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n",
				5428	GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1,
				5429	PFM_GET_WORK_PENDING(task),
				5430	ctx->ctx_fl_trap_reason,
				5431	ovfl_pmds,
				5432	ovfl_notify,
				5433	ovfl_ctrl.bits.mask_monitoring ? 1 : 0));
				5434	/*
				5435	* in case monitoring must be stopped, we toggle the psr bits
				5436	*/
				5437	if (ovfl_ctrl.bits.mask_monitoring) {
				5438	pfm_mask_monitoring(task);
				5439	ctx->ctx_state = PFM_CTX_MASKED;
				5440	ctx->ctx_fl_can_restart = 1;
				5441	}
				5442
				5443	/*
				5444	* send notification now
				5445	*/
				5446	if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify);
				5447
				5448	return;
				5449
				5450	sanity_check:
				5451	printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n",
				5452	smp_processor_id(),
				5453	task ? task->pid : -1,
				5454	pmc0);
				5455	return;
				5456
				5457	stop_monitoring:
				5458	/*
				5459	* in SMP, zombie context is never restored but reclaimed in pfm_load_regs().
				5460	* Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can
				5461	* come here as zombie only if the task is the current task. In which case, we
				5462	* can access the PMU hardware directly.
				5463	*
				5464	* Note that zombies do have PM_VALID set. So here we do the minimal.
				5465	*
				5466	* In case the context was zombified it could not be reclaimed at the time
				5467	* the monitoring program exited. At this point, the PMU reservation has been
				5468	* returned, the sampiing buffer has been freed. We must convert this call
				5469	* into a spurious interrupt. However, we must also avoid infinite overflows
				5470	* by stopping monitoring for this task. We can only come here for a per-task
				5471	* context. All we need to do is to stop monitoring using the psr bits which
				5472	* are always task private. By re-enabling secure montioring, we ensure that
				5473	* the monitored task will not be able to re-activate monitoring.
				5474	* The task will eventually be context switched out, at which point the context
				5475	* will be reclaimed (that includes releasing ownership of the PMU).
				5476	*
				5477	* So there might be a window of time where the number of per-task session is zero
				5478	* yet one PMU might have a owner and get at most one overflow interrupt for a zombie
				5479	* context. This is safe because if a per-task session comes in, it will push this one
				5480	* out and by the virtue on pfm_save_regs(), this one will disappear. If a system wide
				5481	* session is force on that CPU, given that we use task pinning, pfm_save_regs() will
				5482	* also push our zombie context out.
				5483	*
				5484	* Overall pretty hairy stuff....
				5485	*/
				5486	DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1));
				5487	pfm_clear_psr_up();
				5488	ia64_psr(regs)->up = 0;
				5489	ia64_psr(regs)->sp = 1;
				5490	return;
				5491	}
				5492
				5493	static int
				5494	pfm_do_interrupt_handler(int irq, void arg, struct pt_regs regs)
				5495	{
				5496	struct task_struct *task;
				5497	pfm_context_t *ctx;
				5498	unsigned long flags;
				5499	u64 pmc0;
				5500	int this_cpu = smp_processor_id();
				5501	int retval = 0;
				5502
				5503	pfm_stats[this_cpu].pfm_ovfl_intr_count++;
				5504
				5505	/*
				5506	* srlz.d done before arriving here
				5507	*/
				5508	pmc0 = ia64_get_pmc(0);
				5509
				5510	task = GET_PMU_OWNER();
				5511	ctx = GET_PMU_CTX();
				5512
				5513	/*
				5514	* if we have some pending bits set
				5515	* assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1
				5516	*/
				5517	if (PMC0_HAS_OVFL(pmc0) && task) {
				5518	/*
				5519	* we assume that pmc0.fr is always set here
				5520	*/
				5521
				5522	/* sanity check */
				5523	if (!ctx) goto report_spurious1;
				5524
				5525	if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0)
				5526	goto report_spurious2;
				5527
				5528	PROTECT_CTX_NOPRINT(ctx, flags);
				5529
				5530	pfm_overflow_handler(task, ctx, pmc0, regs);
				5531
				5532	UNPROTECT_CTX_NOPRINT(ctx, flags);
				5533
				5534	} else {
				5535	pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++;
				5536	retval = -1;
				5537	}
				5538	/*
				5539	* keep it unfrozen at all times
				5540	*/
				5541	pfm_unfreeze_pmu();
				5542
				5543	return retval;
				5544
				5545	report_spurious1:
				5546	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n",
				5547	this_cpu, task->pid);
				5548	pfm_unfreeze_pmu();
				5549	return -1;
				5550	report_spurious2:
				5551	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n",
				5552	this_cpu,
				5553	task->pid);
				5554	pfm_unfreeze_pmu();
				5555	return -1;
				5556	}
				5557
				5558	static irqreturn_t
				5559	pfm_interrupt_handler(int irq, void arg, struct pt_regs regs)
				5560	{
				5561	unsigned long start_cycles, total_cycles;
				5562	unsigned long min, max;
				5563	int this_cpu;
				5564	int ret;
				5565
				5566	this_cpu = get_cpu();
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5567	if (likely(!pfm_alt_intr_handler)) {
				5568	min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
				5569	max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5570
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5571	start_cycles = ia64_get_itc();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5572
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5573	ret = pfm_do_interrupt_handler(irq, arg, regs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5574
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5575	total_cycles = ia64_get_itc();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5576
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5577	/*
				5578	* don't measure spurious interrupts
				5579	*/
				5580	if (likely(ret == 0)) {
				5581	total_cycles -= start_cycles;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5582
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5583	if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
				5584	if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5585
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5586	pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles;
				5587	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5588	}
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5589	else {
				5590	(*pfm_alt_intr_handler->handler)(irq, arg, regs);
				5591	}
				5592
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5593	put_cpu_no_resched();
				5594	return IRQ_HANDLED;
				5595	}
				5596
				5597	/*
				5598	* /proc/perfmon interface, for debug only
				5599	*/
				5600
				5601	#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1)
				5602
				5603	static void *
				5604	pfm_proc_start(struct seq_file m, loff_t pos)
				5605	{
				5606	if (*pos == 0) {
				5607	return PFM_PROC_SHOW_HEADER;
				5608	}
				5609
				5610	while (*pos <= NR_CPUS) {
				5611	if (cpu_online(*pos - 1)) {
				5612	return (void )pos;
				5613	}
				5614	++*pos;
				5615	}
				5616	return NULL;
				5617	}
				5618
				5619	static void *
				5620	pfm_proc_next(struct seq_file m, void v, loff_t *pos)
				5621	{
				5622	++*pos;
				5623	return pfm_proc_start(m, pos);
				5624	}
				5625
				5626	static void
				5627	pfm_proc_stop(struct seq_file m, void v)
				5628	{
				5629	}
				5630
				5631	static void
				5632	pfm_proc_show_header(struct seq_file *m)
				5633	{
				5634	struct list_head * pos;
				5635	pfm_buffer_fmt_t * entry;
				5636	unsigned long flags;
				5637
				5638	seq_printf(m,
				5639	"perfmon version : %u.%u\n"
				5640	"model : %s\n"
				5641	"fastctxsw : %s\n"
				5642	"expert mode : %s\n"
				5643	"ovfl_mask : 0x%lx\n"
				5644	"PMU flags : 0x%x\n",
				5645	PFM_VERSION_MAJ, PFM_VERSION_MIN,
				5646	pmu_conf->pmu_name,
				5647	pfm_sysctl.fastctxsw > 0 ? "Yes": "No",
				5648	pfm_sysctl.expert_mode > 0 ? "Yes": "No",
				5649	pmu_conf->ovfl_val,
				5650	pmu_conf->flags);
				5651
				5652	LOCK_PFS(flags);
				5653
				5654	seq_printf(m,
				5655	"proc_sessions : %u\n"
				5656	"sys_sessions : %u\n"
				5657	"sys_use_dbregs : %u\n"
				5658	"ptrace_use_dbregs : %u\n",
				5659	pfm_sessions.pfs_task_sessions,
				5660	pfm_sessions.pfs_sys_sessions,
				5661	pfm_sessions.pfs_sys_use_dbregs,
				5662	pfm_sessions.pfs_ptrace_use_dbregs);
				5663
				5664	UNLOCK_PFS(flags);
				5665
				5666	spin_lock(&pfm_buffer_fmt_lock);
				5667
				5668	list_for_each(pos, &pfm_buffer_fmt_list) {
				5669	entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
				5670	seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
				5671	entry->fmt_uuid[0],
				5672	entry->fmt_uuid[1],
				5673	entry->fmt_uuid[2],
				5674	entry->fmt_uuid[3],
				5675	entry->fmt_uuid[4],
				5676	entry->fmt_uuid[5],
				5677	entry->fmt_uuid[6],
				5678	entry->fmt_uuid[7],
				5679	entry->fmt_uuid[8],
				5680	entry->fmt_uuid[9],
				5681	entry->fmt_uuid[10],
				5682	entry->fmt_uuid[11],
				5683	entry->fmt_uuid[12],
				5684	entry->fmt_uuid[13],
				5685	entry->fmt_uuid[14],
				5686	entry->fmt_uuid[15],
				5687	entry->fmt_name);
				5688	}
				5689	spin_unlock(&pfm_buffer_fmt_lock);
				5690
				5691	}
				5692
				5693	static int
				5694	pfm_proc_show(struct seq_file m, void v)
				5695	{
				5696	unsigned long psr;
				5697	unsigned int i;
				5698	int cpu;
				5699
				5700	if (v == PFM_PROC_SHOW_HEADER) {
				5701	pfm_proc_show_header(m);
				5702	return 0;
				5703	}
				5704
				5705	/* show info for CPU (v - 1) */
				5706
				5707	cpu = (long)v - 1;
				5708	seq_printf(m,
				5709	"CPU%-2d overflow intrs : %lu\n"
				5710	"CPU%-2d overflow cycles : %lu\n"
				5711	"CPU%-2d overflow min : %lu\n"
				5712	"CPU%-2d overflow max : %lu\n"
				5713	"CPU%-2d smpl handler calls : %lu\n"
				5714	"CPU%-2d smpl handler cycles : %lu\n"
				5715	"CPU%-2d spurious intrs : %lu\n"
				5716	"CPU%-2d replay intrs : %lu\n"
				5717	"CPU%-2d syst_wide : %d\n"
				5718	"CPU%-2d dcr_pp : %d\n"
				5719	"CPU%-2d exclude idle : %d\n"
				5720	"CPU%-2d owner : %d\n"
				5721	"CPU%-2d context : %p\n"
				5722	"CPU%-2d activations : %lu\n",
				5723	cpu, pfm_stats[cpu].pfm_ovfl_intr_count,
				5724	cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles,
				5725	cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min,
				5726	cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max,
				5727	cpu, pfm_stats[cpu].pfm_smpl_handler_calls,
				5728	cpu, pfm_stats[cpu].pfm_smpl_handler_cycles,
				5729	cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count,
				5730	cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count,
				5731	cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0,
				5732	cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0,
				5733	cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0,
				5734	cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1,
				5735	cpu, pfm_get_cpu_data(pmu_ctx, cpu),
				5736	cpu, pfm_get_cpu_data(pmu_activation_number, cpu));
				5737
				5738	if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) {
				5739
				5740	psr = pfm_get_psr();
				5741
				5742	ia64_srlz_d();
				5743
				5744	seq_printf(m,
				5745	"CPU%-2d psr : 0x%lx\n"
				5746	"CPU%-2d pmc0 : 0x%lx\n",
				5747	cpu, psr,
				5748	cpu, ia64_get_pmc(0));
				5749
				5750	for (i=0; PMC_IS_LAST(i) == 0; i++) {
				5751	if (PMC_IS_COUNTING(i) == 0) continue;
				5752	seq_printf(m,
				5753	"CPU%-2d pmc%u : 0x%lx\n"
				5754	"CPU%-2d pmd%u : 0x%lx\n",
				5755	cpu, i, ia64_get_pmc(i),
				5756	cpu, i, ia64_get_pmd(i));
				5757	}
				5758	}
				5759	return 0;
				5760	}
				5761
				5762	struct seq_operations pfm_seq_ops = {
				5763	.start = pfm_proc_start,
				5764	.next = pfm_proc_next,
				5765	.stop = pfm_proc_stop,
				5766	.show = pfm_proc_show
				5767	};
				5768
				5769	static int
				5770	pfm_proc_open(struct inode inode, struct file file)
				5771	{
				5772	return seq_open(file, &pfm_seq_ops);
				5773	}
				5774
				5775
				5776	/*
				5777	* we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens
				5778	* during pfm_enable() hence before pfm_start(). We cannot assume monitoring
				5779	* is active or inactive based on mode. We must rely on the value in
				5780	* local_cpu_data->pfm_syst_info
				5781	*/
				5782	void
				5783	pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
				5784	{
				5785	struct pt_regs *regs;
				5786	unsigned long dcr;
				5787	unsigned long dcr_pp;
				5788
				5789	dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
				5790
				5791	/*
				5792	* pid 0 is guaranteed to be the idle task. There is one such task with pid 0
				5793	* on every CPU, so we can rely on the pid to identify the idle task.
				5794	*/
				5795	if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 \|\| task->pid) {
				5796	regs = ia64_task_regs(task);
				5797	ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
				5798	return;
				5799	}
				5800	/*
				5801	* if monitoring has started
				5802	*/
				5803	if (dcr_pp) {
				5804	dcr = ia64_getreg(_IA64_REG_CR_DCR);
				5805	/*
				5806	* context switching in?
				5807	*/
				5808	if (is_ctxswin) {
				5809	/* mask monitoring for the idle task */
				5810	ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
				5811	pfm_clear_psr_pp();
				5812	ia64_srlz_i();
				5813	return;
				5814	}
				5815	/*
				5816	* context switching out
				5817	* restore monitoring for next task
				5818	*
				5819	* Due to inlining this odd if-then-else construction generates
				5820	* better code.
				5821	*/
				5822	ia64_setreg(_IA64_REG_CR_DCR, dcr \|IA64_DCR_PP);
				5823	pfm_set_psr_pp();
				5824	ia64_srlz_i();
				5825	}
				5826	}
				5827
				5828	#ifdef CONFIG_SMP
				5829
				5830	static void
				5831	pfm_force_cleanup(pfm_context_t ctx, struct pt_regs regs)
				5832	{
				5833	struct task_struct *task = ctx->ctx_task;
				5834
				5835	ia64_psr(regs)->up = 0;
				5836	ia64_psr(regs)->sp = 1;
				5837
				5838	if (GET_PMU_OWNER() == task) {
				5839	DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
				5840	SET_PMU_OWNER(NULL, NULL);
				5841	}
				5842
				5843	/*
				5844	* disconnect the task from the context and vice-versa
				5845	*/
				5846	PFM_SET_WORK_PENDING(task, 0);
				5847
				5848	task->thread.pfm_context = NULL;
				5849	task->thread.flags &= ~IA64_THREAD_PM_VALID;
				5850
				5851	DPRINT(("force cleanup for [%d]\n", task->pid));
				5852	}
				5853
				5854
				5855	/*
				5856	* in 2.6, interrupts are masked when we come here and the runqueue lock is held
				5857	*/
				5858	void
				5859	pfm_save_regs(struct task_struct *task)
				5860	{
				5861	pfm_context_t *ctx;
				5862	struct thread_struct *t;
				5863	unsigned long flags;
				5864	u64 psr;
				5865
				5866
				5867	ctx = PFM_GET_CTX(task);
				5868	if (ctx == NULL) return;
				5869	t = &task->thread;
				5870
				5871	/*
				5872	* we always come here with interrupts ALREADY disabled by
				5873	* the scheduler. So we simply need to protect against concurrent
				5874	* access, not CPU concurrency.
				5875	*/
				5876	flags = pfm_protect_ctx_ctxsw(ctx);
				5877
				5878	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
				5879	struct pt_regs *regs = ia64_task_regs(task);
				5880
				5881	pfm_clear_psr_up();
				5882
				5883	pfm_force_cleanup(ctx, regs);
				5884
				5885	BUG_ON(ctx->ctx_smpl_hdr);
				5886
				5887	pfm_unprotect_ctx_ctxsw(ctx, flags);
				5888
				5889	pfm_context_free(ctx);
				5890	return;
				5891	}
				5892
				5893	/*
				5894	* save current PSR: needed because we modify it
				5895	*/
				5896	ia64_srlz_d();
				5897	psr = pfm_get_psr();
				5898
				5899	BUG_ON(psr & (IA64_PSR_I));
				5900
				5901	/*
				5902	* stop monitoring:
				5903	* This is the last instruction which may generate an overflow
				5904	*
				5905	* We do not need to set psr.sp because, it is irrelevant in kernel.
				5906	* It will be restored from ipsr when going back to user level
				5907	*/
				5908	pfm_clear_psr_up();
				5909
				5910	/*
				5911	* keep a copy of psr.up (for reload)
				5912	*/
				5913	ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
				5914
				5915	/*
				5916	* release ownership of this PMU.
				5917	* PM interrupts are masked, so nothing
				5918	* can happen.
				5919	*/
				5920	SET_PMU_OWNER(NULL, NULL);
				5921
				5922	/*
				5923	* we systematically save the PMD as we have no
				5924	* guarantee we will be schedule at that same
				5925	* CPU again.
				5926	*/
				5927	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
				5928
				5929	/*
				5930	* save pmc0 ia64_srlz_d() done in pfm_save_pmds()
				5931	* we will need it on the restore path to check
				5932	* for pending overflow.
				5933	*/
				5934	t->pmcs[0] = ia64_get_pmc(0);
				5935
				5936	/*
				5937	* unfreeze PMU if had pending overflows
				5938	*/
				5939	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
				5940
				5941	/*
				5942	* finally, allow context access.
				5943	* interrupts will still be masked after this call.
				5944	*/
				5945	pfm_unprotect_ctx_ctxsw(ctx, flags);
				5946	}
				5947
				5948	#else /* !CONFIG_SMP */
				5949	void
				5950	pfm_save_regs(struct task_struct *task)
				5951	{
				5952	pfm_context_t *ctx;
				5953	u64 psr;
				5954
				5955	ctx = PFM_GET_CTX(task);
				5956	if (ctx == NULL) return;
				5957
				5958	/*
				5959	* save current PSR: needed because we modify it
				5960	*/
				5961	psr = pfm_get_psr();
				5962
				5963	BUG_ON(psr & (IA64_PSR_I));
				5964
				5965	/*
				5966	* stop monitoring:
				5967	* This is the last instruction which may generate an overflow
				5968	*
				5969	* We do not need to set psr.sp because, it is irrelevant in kernel.
				5970	* It will be restored from ipsr when going back to user level
				5971	*/
				5972	pfm_clear_psr_up();
				5973
				5974	/*
				5975	* keep a copy of psr.up (for reload)
				5976	*/
				5977	ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
				5978	}
				5979
				5980	static void
				5981	pfm_lazy_save_regs (struct task_struct *task)
				5982	{
				5983	pfm_context_t *ctx;
				5984	struct thread_struct *t;
				5985	unsigned long flags;
				5986
				5987	{ u64 psr = pfm_get_psr();
				5988	BUG_ON(psr & IA64_PSR_UP);
				5989	}
				5990
				5991	ctx = PFM_GET_CTX(task);
				5992	t = &task->thread;
				5993
				5994	/*
				5995	* we need to mask PMU overflow here to
				5996	* make sure that we maintain pmc0 until
				5997	* we save it. overflow interrupts are
				5998	* treated as spurious if there is no
				5999	* owner.
				6000	*
				6001	* XXX: I don't think this is necessary
				6002	*/
				6003	PROTECT_CTX(ctx,flags);
				6004
				6005	/*
				6006	* release ownership of this PMU.
				6007	* must be done before we save the registers.
				6008	*
				6009	* after this call any PMU interrupt is treated
				6010	* as spurious.
				6011	*/
				6012	SET_PMU_OWNER(NULL, NULL);
				6013
				6014	/*
				6015	* save all the pmds we use
				6016	*/
				6017	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
				6018
				6019	/*
				6020	* save pmc0 ia64_srlz_d() done in pfm_save_pmds()
				6021	* it is needed to check for pended overflow
				6022	* on the restore path
				6023	*/
				6024	t->pmcs[0] = ia64_get_pmc(0);
				6025
				6026	/*
				6027	* unfreeze PMU if had pending overflows
				6028	*/
				6029	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
				6030
				6031	/*
				6032	* now get can unmask PMU interrupts, they will
				6033	* be treated as purely spurious and we will not
				6034	* lose any information
				6035	*/
				6036	UNPROTECT_CTX(ctx,flags);
				6037	}
				6038	#endif /* CONFIG_SMP */
				6039
				6040	#ifdef CONFIG_SMP
				6041	/*
				6042	* in 2.6, interrupts are masked when we come here and the runqueue lock is held
				6043	*/
				6044	void
				6045	pfm_load_regs (struct task_struct *task)
				6046	{
				6047	pfm_context_t *ctx;
				6048	struct thread_struct *t;
				6049	unsigned long pmc_mask = 0UL, pmd_mask = 0UL;
				6050	unsigned long flags;
				6051	u64 psr, psr_up;
				6052	int need_irq_resend;
				6053
				6054	ctx = PFM_GET_CTX(task);
				6055	if (unlikely(ctx == NULL)) return;
				6056
				6057	BUG_ON(GET_PMU_OWNER());
				6058
				6059	t = &task->thread;
				6060	/*
				6061	* possible on unload
				6062	*/
				6063	if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return;
				6064
				6065	/*
				6066	* we always come here with interrupts ALREADY disabled by
				6067	* the scheduler. So we simply need to protect against concurrent
				6068	* access, not CPU concurrency.
				6069	*/
				6070	flags = pfm_protect_ctx_ctxsw(ctx);
				6071	psr = pfm_get_psr();
				6072
				6073	need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
				6074
				6075	BUG_ON(psr & (IA64_PSR_UP\|IA64_PSR_PP));
				6076	BUG_ON(psr & IA64_PSR_I);
				6077
				6078	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) {
				6079	struct pt_regs *regs = ia64_task_regs(task);
				6080
				6081	BUG_ON(ctx->ctx_smpl_hdr);
				6082
				6083	pfm_force_cleanup(ctx, regs);
				6084
				6085	pfm_unprotect_ctx_ctxsw(ctx, flags);
				6086
				6087	/*
				6088	* this one (kmalloc'ed) is fine with interrupts disabled
				6089	*/
				6090	pfm_context_free(ctx);
				6091
				6092	return;
				6093	}
				6094
				6095	/*
				6096	* we restore ALL the debug registers to avoid picking up
				6097	* stale state.
				6098	*/
				6099	if (ctx->ctx_fl_using_dbreg) {
				6100	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				6101	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				6102	}
				6103	/*
				6104	* retrieve saved psr.up
				6105	*/
				6106	psr_up = ctx->ctx_saved_psr_up;
				6107
				6108	/*
				6109	* if we were the last user of the PMU on that CPU,
				6110	* then nothing to do except restore psr
				6111	*/
				6112	if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) {
				6113
				6114	/*
				6115	* retrieve partial reload masks (due to user modifications)
				6116	*/
				6117	pmc_mask = ctx->ctx_reload_pmcs[0];
				6118	pmd_mask = ctx->ctx_reload_pmds[0];
				6119
				6120	} else {
				6121	/*
				6122	* To avoid leaking information to the user level when psr.sp=0,
				6123	* we must reload ALL implemented pmds (even the ones we don't use).
				6124	* In the kernel we only allow PFM_READ_PMDS on registers which
				6125	* we initialized or requested (sampling) so there is no risk there.
				6126	*/
				6127	pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
				6128
				6129	/*
				6130	* ALL accessible PMCs are systematically reloaded, unused registers
				6131	* get their default (from pfm_reset_pmu_state()) values to avoid picking
				6132	* up stale configuration.
				6133	*
				6134	* PMC0 is never in the mask. It is always restored separately.
				6135	*/
				6136	pmc_mask = ctx->ctx_all_pmcs[0];
				6137	}
				6138	/*
				6139	* when context is MASKED, we will restore PMC with plm=0
				6140	* and PMD with stale information, but that's ok, nothing
				6141	* will be captured.
				6142	*
				6143	* XXX: optimize here
				6144	*/
				6145	if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask);
				6146	if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask);
				6147
				6148	/*
				6149	* check for pending overflow at the time the state
				6150	* was saved.
				6151	*/
				6152	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
				6153	/*
				6154	* reload pmc0 with the overflow information
				6155	* On McKinley PMU, this will trigger a PMU interrupt
				6156	*/
				6157	ia64_set_pmc(0, t->pmcs[0]);
				6158	ia64_srlz_d();
				6159	t->pmcs[0] = 0UL;
				6160
				6161	/*
				6162	* will replay the PMU interrupt
				6163	*/
				6164	if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
				6165
				6166	pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
				6167	}
				6168
				6169	/*
				6170	* we just did a reload, so we reset the partial reload fields
				6171	*/
				6172	ctx->ctx_reload_pmcs[0] = 0UL;
				6173	ctx->ctx_reload_pmds[0] = 0UL;
				6174
				6175	SET_LAST_CPU(ctx, smp_processor_id());
				6176
				6177	/*
				6178	* dump activation value for this PMU
				6179	*/
				6180	INC_ACTIVATION();
				6181	/*
				6182	* record current activation for this context
				6183	*/
				6184	SET_ACTIVATION(ctx);
				6185
				6186	/*
				6187	* establish new ownership.
				6188	*/
				6189	SET_PMU_OWNER(task, ctx);
				6190
				6191	/*
				6192	* restore the psr.up bit. measurement
				6193	* is active again.
				6194	* no PMU interrupt can happen at this point
				6195	* because we still have interrupts disabled.
				6196	*/
				6197	if (likely(psr_up)) pfm_set_psr_up();
				6198
				6199	/*
				6200	* allow concurrent access to context
				6201	*/
				6202	pfm_unprotect_ctx_ctxsw(ctx, flags);
				6203	}
				6204	#else /* !CONFIG_SMP */
				6205	/*
				6206	* reload PMU state for UP kernels
				6207	* in 2.5 we come here with interrupts disabled
				6208	*/
				6209	void
				6210	pfm_load_regs (struct task_struct *task)
				6211	{
				6212	struct thread_struct *t;
				6213	pfm_context_t *ctx;
				6214	struct task_struct *owner;
				6215	unsigned long pmd_mask, pmc_mask;
				6216	u64 psr, psr_up;
				6217	int need_irq_resend;
				6218
				6219	owner = GET_PMU_OWNER();
				6220	ctx = PFM_GET_CTX(task);
				6221	t = &task->thread;
				6222	psr = pfm_get_psr();
				6223
				6224	BUG_ON(psr & (IA64_PSR_UP\|IA64_PSR_PP));
				6225	BUG_ON(psr & IA64_PSR_I);
				6226
				6227	/*
				6228	* we restore ALL the debug registers to avoid picking up
				6229	* stale state.
				6230	*
				6231	* This must be done even when the task is still the owner
				6232	* as the registers may have been modified via ptrace()
				6233	* (not perfmon) by the previous task.
				6234	*/
				6235	if (ctx->ctx_fl_using_dbreg) {
				6236	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				6237	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				6238	}
				6239
				6240	/*
				6241	* retrieved saved psr.up
				6242	*/
				6243	psr_up = ctx->ctx_saved_psr_up;
				6244	need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
				6245
				6246	/*
				6247	* short path, our state is still there, just
				6248	* need to restore psr and we go
				6249	*
				6250	* we do not touch either PMC nor PMD. the psr is not touched
				6251	* by the overflow_handler. So we are safe w.r.t. to interrupt
				6252	* concurrency even without interrupt masking.
				6253	*/
				6254	if (likely(owner == task)) {
				6255	if (likely(psr_up)) pfm_set_psr_up();
				6256	return;
				6257	}
				6258
				6259	/*
				6260	* someone else is still using the PMU, first push it out and
				6261	* then we'll be able to install our stuff !
				6262	*
				6263	* Upon return, there will be no owner for the current PMU
				6264	*/
				6265	if (owner) pfm_lazy_save_regs(owner);
				6266
				6267	/*
				6268	* To avoid leaking information to the user level when psr.sp=0,
				6269	* we must reload ALL implemented pmds (even the ones we don't use).
				6270	* In the kernel we only allow PFM_READ_PMDS on registers which
				6271	* we initialized or requested (sampling) so there is no risk there.
				6272	*/
				6273	pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
				6274
				6275	/*
				6276	* ALL accessible PMCs are systematically reloaded, unused registers
				6277	* get their default (from pfm_reset_pmu_state()) values to avoid picking
				6278	* up stale configuration.
				6279	*
				6280	* PMC0 is never in the mask. It is always restored separately
				6281	*/
				6282	pmc_mask = ctx->ctx_all_pmcs[0];
				6283
				6284	pfm_restore_pmds(t->pmds, pmd_mask);
				6285	pfm_restore_pmcs(t->pmcs, pmc_mask);
				6286
				6287	/*
				6288	* check for pending overflow at the time the state
				6289	* was saved.
				6290	*/
				6291	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
				6292	/*
				6293	* reload pmc0 with the overflow information
				6294	* On McKinley PMU, this will trigger a PMU interrupt
				6295	*/
				6296	ia64_set_pmc(0, t->pmcs[0]);
				6297	ia64_srlz_d();
				6298
				6299	t->pmcs[0] = 0UL;
				6300
				6301	/*
				6302	* will replay the PMU interrupt
				6303	*/
				6304	if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
				6305
				6306	pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
				6307	}
				6308
				6309	/*
				6310	* establish new ownership.
				6311	*/
				6312	SET_PMU_OWNER(task, ctx);
				6313
				6314	/*
				6315	* restore the psr.up bit. measurement
				6316	* is active again.
				6317	* no PMU interrupt can happen at this point
				6318	* because we still have interrupts disabled.
				6319	*/
				6320	if (likely(psr_up)) pfm_set_psr_up();
				6321	}
				6322	#endif /* CONFIG_SMP */
				6323
				6324	/*
				6325	* this function assumes monitoring is stopped
				6326	*/
				6327	static void
				6328	pfm_flush_pmds(struct task_struct task, pfm_context_t ctx)
				6329	{
				6330	u64 pmc0;
				6331	unsigned long mask2, val, pmd_val, ovfl_val;
				6332	int i, can_access_pmu = 0;
				6333	int is_self;
				6334
				6335	/*
				6336	* is the caller the task being monitored (or which initiated the
				6337	* session for system wide measurements)
				6338	*/
				6339	is_self = ctx->ctx_task == task ? 1 : 0;
				6340
				6341	/*
				6342	* can access PMU is task is the owner of the PMU state on the current CPU
				6343	* or if we are running on the CPU bound to the context in system-wide mode
				6344	* (that is not necessarily the task the context is attached to in this mode).
				6345	* In system-wide we always have can_access_pmu true because a task running on an
				6346	* invalid processor is flagged earlier in the call stack (see pfm_stop).
				6347	*/
				6348	can_access_pmu = (GET_PMU_OWNER() == task) \|\| (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id());
				6349	if (can_access_pmu) {
				6350	/*
				6351	* Mark the PMU as not owned
				6352	* This will cause the interrupt handler to do nothing in case an overflow
				6353	* interrupt was in-flight
				6354	* This also guarantees that pmc0 will contain the final state
				6355	* It virtually gives us full control on overflow processing from that point
				6356	* on.
				6357	*/
				6358	SET_PMU_OWNER(NULL, NULL);
				6359	DPRINT(("releasing ownership\n"));
				6360
				6361	/*
				6362	* read current overflow status:
				6363	*
				6364	* we are guaranteed to read the final stable state
				6365	*/
				6366	ia64_srlz_d();
				6367	pmc0 = ia64_get_pmc(0); /* slow */
				6368
				6369	/*
				6370	* reset freeze bit, overflow status information destroyed
				6371	*/
				6372	pfm_unfreeze_pmu();
				6373	} else {
				6374	pmc0 = task->thread.pmcs[0];
				6375	/*
				6376	* clear whatever overflow status bits there were
				6377	*/
				6378	task->thread.pmcs[0] = 0;
				6379	}
				6380	ovfl_val = pmu_conf->ovfl_val;
				6381	/*
				6382	* we save all the used pmds
				6383	* we take care of overflows for counting PMDs
				6384	*
				6385	* XXX: sampling situation is not taken into account here
				6386	*/
				6387	mask2 = ctx->ctx_used_pmds[0];
				6388
				6389	DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2));
				6390
				6391	for (i = 0; mask2; i++, mask2>>=1) {
				6392
				6393	/* skip non used pmds */
				6394	if ((mask2 & 0x1) == 0) continue;
				6395
				6396	/*
				6397	* can access PMU always true in system wide mode
				6398	*/
				6399	val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i];
				6400
				6401	if (PMD_IS_COUNTING(i)) {
				6402	DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n",
				6403	task->pid,
				6404	i,
				6405	ctx->ctx_pmds[i].val,
				6406	val & ovfl_val));
				6407
				6408	/*
				6409	* we rebuild the full 64 bit value of the counter
				6410	*/
				6411	val = ctx->ctx_pmds[i].val + (val & ovfl_val);
				6412
				6413	/*
				6414	* now everything is in ctx_pmds[] and we need
				6415	* to clear the saved context from save_regs() such that
				6416	* pfm_read_pmds() gets the correct value
				6417	*/
				6418	pmd_val = 0UL;
				6419
				6420	/*
				6421	* take care of overflow inline
				6422	*/
				6423	if (pmc0 & (1UL << i)) {
				6424	val += 1 + ovfl_val;
				6425	DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i));
				6426	}
				6427	}
				6428
				6429	DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
				6430
				6431	if (is_self) task->thread.pmds[i] = pmd_val;
				6432
				6433	ctx->ctx_pmds[i].val = val;
				6434	}
				6435	}
				6436
				6437	static struct irqaction perfmon_irqaction = {
				6438	.handler = pfm_interrupt_handler,
				6439	.flags = SA_INTERRUPT,
				6440	.name = "perfmon"
				6441	};
				6442
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	6443	static void
				6444	pfm_alt_save_pmu_state(void *data)
				6445	{
				6446	struct pt_regs *regs;
				6447
				6448	regs = ia64_task_regs(current);
				6449
				6450	DPRINT(("called\n"));
				6451
				6452	/*
				6453	* should not be necessary but
				6454	* let's take not risk
				6455	*/
				6456	pfm_clear_psr_up();
				6457	pfm_clear_psr_pp();
				6458	ia64_psr(regs)->pp = 0;
				6459
				6460	/*
				6461	* This call is required
				6462	* May cause a spurious interrupt on some processors
				6463	*/
				6464	pfm_freeze_pmu();
				6465
				6466	ia64_srlz_d();
				6467	}
				6468
				6469	void
				6470	pfm_alt_restore_pmu_state(void *data)
				6471	{
				6472	struct pt_regs *regs;
				6473
				6474	regs = ia64_task_regs(current);
				6475
				6476	DPRINT(("called\n"));
				6477
				6478	/*
				6479	* put PMU back in state expected
				6480	* by perfmon
				6481	*/
				6482	pfm_clear_psr_up();
				6483	pfm_clear_psr_pp();
				6484	ia64_psr(regs)->pp = 0;
				6485
				6486	/*
				6487	* perfmon runs with PMU unfrozen at all times
				6488	*/
				6489	pfm_unfreeze_pmu();
				6490
				6491	ia64_srlz_d();
				6492	}
				6493
				6494	int
				6495	pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
				6496	{
				6497	int ret, i;
				6498	int reserve_cpu;
				6499
				6500	/* some sanity checks */
				6501	if (hdl == NULL \|\| hdl->handler == NULL) return -EINVAL;
				6502
				6503	/* do the easy test first */
				6504	if (pfm_alt_intr_handler) return -EBUSY;
				6505
				6506	/* one at a time in the install or remove, just fail the others */
				6507	if (!spin_trylock(&pfm_alt_install_check)) {
				6508	return -EBUSY;
				6509	}
				6510
				6511	/* reserve our session */
				6512	for_each_online_cpu(reserve_cpu) {
				6513	ret = pfm_reserve_session(NULL, 1, reserve_cpu);
				6514	if (ret) goto cleanup_reserve;
				6515	}
				6516
				6517	/* save the current system wide pmu states */
				6518	ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1);
				6519	if (ret) {
				6520	DPRINT(("on_each_cpu() failed: %d\n", ret));
				6521	goto cleanup_reserve;
				6522	}
				6523
				6524	/* officially change to the alternate interrupt handler */
				6525	pfm_alt_intr_handler = hdl;
				6526
				6527	spin_unlock(&pfm_alt_install_check);
				6528
				6529	return 0;
				6530
				6531	cleanup_reserve:
				6532	for_each_online_cpu(i) {
				6533	/* don't unreserve more than we reserved */
				6534	if (i >= reserve_cpu) break;
				6535
				6536	pfm_unreserve_session(NULL, 1, i);
				6537	}
				6538
				6539	spin_unlock(&pfm_alt_install_check);
				6540
				6541	return ret;
				6542	}
				6543	EXPORT_SYMBOL_GPL(pfm_install_alt_pmu_interrupt);
				6544
				6545	int
				6546	pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
				6547	{
				6548	int i;
				6549	int ret;
				6550
				6551	if (hdl == NULL) return -EINVAL;
				6552
				6553	/* cannot remove someone else's handler! */
				6554	if (pfm_alt_intr_handler != hdl) return -EINVAL;
				6555
				6556	/* one at a time in the install or remove, just fail the others */
				6557	if (!spin_trylock(&pfm_alt_install_check)) {
				6558	return -EBUSY;
				6559	}
				6560
				6561	pfm_alt_intr_handler = NULL;
				6562
				6563	ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1);
				6564	if (ret) {
				6565	DPRINT(("on_each_cpu() failed: %d\n", ret));
				6566	}
				6567
				6568	for_each_online_cpu(i) {
				6569	pfm_unreserve_session(NULL, 1, i);
				6570	}
				6571
				6572	spin_unlock(&pfm_alt_install_check);
				6573
				6574	return 0;
				6575	}
				6576	EXPORT_SYMBOL_GPL(pfm_remove_alt_pmu_interrupt);
				6577
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	6578	/*
				6579	* perfmon initialization routine, called from the initcall() table
				6580	*/
				6581	static int init_pfm_fs(void);
				6582
				6583	static int __init
				6584	pfm_probe_pmu(void)
				6585	{
				6586	pmu_config_t **p;
				6587	int family;
				6588
				6589	family = local_cpu_data->family;
				6590	p = pmu_confs;
				6591
				6592	while(*p) {
				6593	if ((*p)->probe) {
				6594	if ((*p)->probe() == 0) goto found;
				6595	} else if ((p)->pmu_family == family \|\| (p)->pmu_family == 0xff) {
				6596	goto found;
				6597	}
				6598	p++;
				6599	}
				6600	return -1;
				6601	found:
				6602	pmu_conf = *p;
				6603	return 0;
				6604	}
				6605
				6606	static struct file_operations pfm_proc_fops = {
				6607	.open = pfm_proc_open,
				6608	.read = seq_read,
				6609	.llseek = seq_lseek,
				6610	.release = seq_release,
				6611	};
				6612
				6613	int __init
				6614	pfm_init(void)
				6615	{
				6616	unsigned int n, n_counters, i;
				6617
				6618	printk("perfmon: version %u.%u IRQ %u\n",
				6619	PFM_VERSION_MAJ,
				6620	PFM_VERSION_MIN,
				6621	IA64_PERFMON_VECTOR);
				6622
				6623	if (pfm_probe_pmu()) {
				6624	printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n",
				6625	local_cpu_data->family);
				6626	return -ENODEV;
				6627	}
				6628
				6629	/*
				6630	* compute the number of implemented PMD/PMC from the
				6631	* description tables
				6632	*/
				6633	n = 0;
				6634	for (i=0; PMC_IS_LAST(i) == 0; i++) {
				6635	if (PMC_IS_IMPL(i) == 0) continue;
				6636	pmu_conf->impl_pmcs[i>>6] \|= 1UL << (i&63);
				6637	n++;
				6638	}
				6639	pmu_conf->num_pmcs = n;
				6640
				6641	n = 0; n_counters = 0;
				6642	for (i=0; PMD_IS_LAST(i) == 0; i++) {
				6643	if (PMD_IS_IMPL(i) == 0) continue;
				6644	pmu_conf->impl_pmds[i>>6] \|= 1UL << (i&63);
				6645	n++;
				6646	if (PMD_IS_COUNTING(i)) n_counters++;
				6647	}
				6648	pmu_conf->num_pmds = n;
				6649	pmu_conf->num_counters = n_counters;
				6650
				6651	/*
				6652	* sanity checks on the number of debug registers
				6653	*/
				6654	if (pmu_conf->use_rr_dbregs) {
				6655	if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) {
				6656	printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs);
				6657	pmu_conf = NULL;
				6658	return -1;
				6659	}
				6660	if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) {
				6661	printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_ibrs);
				6662	pmu_conf = NULL;
				6663	return -1;
				6664	}
				6665	}
				6666
				6667	printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n",
				6668	pmu_conf->pmu_name,
				6669	pmu_conf->num_pmcs,
				6670	pmu_conf->num_pmds,
				6671	pmu_conf->num_counters,
				6672	ffz(pmu_conf->ovfl_val));
				6673
				6674	/* sanity check */
				6675	if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS \|\| pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) {
				6676	printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
				6677	pmu_conf = NULL;
				6678	return -1;
				6679	}
				6680
				6681	/*
				6682	* create /proc/perfmon (mostly for debugging purposes)
				6683	*/
				6684	perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL);
				6685	if (perfmon_dir == NULL) {
				6686	printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
				6687	pmu_conf = NULL;
				6688	return -1;
				6689	}
				6690	/*
				6691	* install customized file operations for /proc/perfmon entry
				6692	*/
				6693	perfmon_dir->proc_fops = &pfm_proc_fops;
				6694
				6695	/*
				6696	* create /proc/sys/kernel/perfmon (for debugging purposes)
				6697	*/
				6698	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
				6699
				6700	/*
				6701	* initialize all our spinlocks
				6702	*/
				6703	spin_lock_init(&pfm_sessions.pfs_lock);
				6704	spin_lock_init(&pfm_buffer_fmt_lock);
				6705
				6706	init_pfm_fs();
				6707
				6708	for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL;
				6709
				6710	return 0;
				6711	}
				6712
				6713	__initcall(pfm_init);
				6714
				6715	/*
				6716	* this function is called before pfm_init()
				6717	*/
				6718	void
				6719	pfm_init_percpu (void)
				6720	{
Ashok Raj	ff74190	2005-11-11 14:32:40 -0800	[diff] [blame^]	6721	static int first_time=1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	6722	/*
				6723	* make sure no measurement is active
				6724	* (may inherit programmed PMCs from EFI).
				6725	*/
				6726	pfm_clear_psr_pp();
				6727	pfm_clear_psr_up();
				6728
				6729	/*
				6730	* we run with the PMU not frozen at all times
				6731	*/
				6732	pfm_unfreeze_pmu();
				6733
Ashok Raj	ff74190	2005-11-11 14:32:40 -0800	[diff] [blame^]	6734	if (first_time) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	6735	register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
Ashok Raj	ff74190	2005-11-11 14:32:40 -0800	[diff] [blame^]	6736	first_time=0;
				6737	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	6738
				6739	ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
				6740	ia64_srlz_d();
				6741	}
				6742
				6743	/*
				6744	* used for debug purposes only
				6745	*/
				6746	void
				6747	dump_pmu_state(const char *from)
				6748	{
				6749	struct task_struct *task;
				6750	struct thread_struct *t;
				6751	struct pt_regs *regs;
				6752	pfm_context_t *ctx;
				6753	unsigned long psr, dcr, info, flags;
				6754	int i, this_cpu;
				6755
				6756	local_irq_save(flags);
				6757
				6758	this_cpu = smp_processor_id();
				6759	regs = ia64_task_regs(current);
				6760	info = PFM_CPUINFO_GET();
				6761	dcr = ia64_getreg(_IA64_REG_CR_DCR);
				6762
				6763	if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) {
				6764	local_irq_restore(flags);
				6765	return;
				6766	}
				6767
				6768	printk("CPU%d from %s() current [%d] iip=0x%lx %s\n",
				6769	this_cpu,
				6770	from,
				6771	current->pid,
				6772	regs->cr_iip,
				6773	current->comm);
				6774
				6775	task = GET_PMU_OWNER();
				6776	ctx = GET_PMU_CTX();
				6777
				6778	printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx);
				6779
				6780	psr = pfm_get_psr();
				6781
				6782	printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n",
				6783	this_cpu,
				6784	ia64_get_pmc(0),
				6785	psr & IA64_PSR_PP ? 1 : 0,
				6786	psr & IA64_PSR_UP ? 1 : 0,
				6787	dcr & IA64_DCR_PP ? 1 : 0,
				6788	info,
				6789	ia64_psr(regs)->up,
				6790	ia64_psr(regs)->pp);
				6791
				6792	ia64_psr(regs)->up = 0;
				6793	ia64_psr(regs)->pp = 0;
				6794
				6795	t = &current->thread;
				6796
				6797	for (i=1; PMC_IS_LAST(i) == 0; i++) {
				6798	if (PMC_IS_IMPL(i) == 0) continue;
				6799	printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]);
				6800	}
				6801
				6802	for (i=1; PMD_IS_LAST(i) == 0; i++) {
				6803	if (PMD_IS_IMPL(i) == 0) continue;
				6804	printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]);
				6805	}
				6806
				6807	if (ctx) {
				6808	printk("->CPU%d ctx_state=%d vaddr=%p addr=%p fd=%d ctx_task=[%d] saved_psr_up=0x%lx\n",
				6809	this_cpu,
				6810	ctx->ctx_state,
				6811	ctx->ctx_smpl_vaddr,
				6812	ctx->ctx_smpl_hdr,
				6813	ctx->ctx_msgq_head,
				6814	ctx->ctx_msgq_tail,
				6815	ctx->ctx_saved_psr_up);
				6816	}
				6817	local_irq_restore(flags);
				6818	}
				6819
				6820	/*
				6821	* called from process.c:copy_thread(). task is new child.
				6822	*/
				6823	void
				6824	pfm_inherit(struct task_struct task, struct pt_regs regs)
				6825	{
				6826	struct thread_struct *thread;
				6827
				6828	DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid));
				6829
				6830	thread = &task->thread;
				6831
				6832	/*
				6833	* cut links inherited from parent (current)
				6834	*/
				6835	thread->pfm_context = NULL;
				6836
				6837	PFM_SET_WORK_PENDING(task, 0);
				6838
				6839	/*
				6840	* the psr bits are already set properly in copy_threads()
				6841	*/
				6842	}
				6843	#else /* !CONFIG_PERFMON */
				6844	asmlinkage long
				6845	sys_perfmonctl (int fd, int cmd, void *arg, int count)
				6846	{
				6847	return -ENOSYS;
				6848	}
				6849	#endif /* CONFIG_PERFMON */