Blame - arch/ia64/kernel/perfmon.c - kernel/msm-4.9

blob: 4ad97b3b39dcc904f24a502576d62b3979edbabf [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* This file implements the perfmon-2 subsystem which is used
				3	* to program the IA-64 Performance Monitoring Unit (PMU).
				4	*
				5	* The initial version of perfmon.c was written by
				6	* Ganesh Venkitachalam, IBM Corp.
				7	*
				8	* Then it was modified for perfmon-1.x by Stephane Eranian and
				9	* David Mosberger, Hewlett Packard Co.
				10	*
				11	* Version Perfmon-2.x is a rewrite of perfmon-1.x
				12	* by Stephane Eranian, Hewlett Packard Co.
				13	*
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	14	* Copyright (C) 1999-2005 Hewlett Packard Co
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	15	* Stephane Eranian <eranian@hpl.hp.com>
				16	* David Mosberger-Tang <davidm@hpl.hp.com>
				17	*
				18	* More information about perfmon available at:
				19	* http://www.hpl.hp.com/research/linux/perfmon
				20	*/
				21
				22	#include <linux/config.h>
				23	#include <linux/module.h>
				24	#include <linux/kernel.h>
				25	#include <linux/sched.h>
				26	#include <linux/interrupt.h>
				27	#include <linux/smp_lock.h>
				28	#include <linux/proc_fs.h>
				29	#include <linux/seq_file.h>
				30	#include <linux/init.h>
				31	#include <linux/vmalloc.h>
				32	#include <linux/mm.h>
				33	#include <linux/sysctl.h>
				34	#include <linux/list.h>
				35	#include <linux/file.h>
				36	#include <linux/poll.h>
				37	#include <linux/vfs.h>
				38	#include <linux/pagemap.h>
				39	#include <linux/mount.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	#include <linux/bitops.h>
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame^]	41	#include <linux/rcupdate.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	42
				43	#include <asm/errno.h>
				44	#include <asm/intrinsics.h>
				45	#include <asm/page.h>
				46	#include <asm/perfmon.h>
				47	#include <asm/processor.h>
				48	#include <asm/signal.h>
				49	#include <asm/system.h>
				50	#include <asm/uaccess.h>
				51	#include <asm/delay.h>
				52
				53	#ifdef CONFIG_PERFMON
				54	/*
				55	* perfmon context state
				56	*/
				57	#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */
				58	#define PFM_CTX_LOADED 2 /* context is loaded onto a task */
				59	#define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */
				60	#define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */
				61
				62	#define PFM_INVALID_ACTIVATION (~0UL)
				63
				64	/*
				65	* depth of message queue
				66	*/
				67	#define PFM_MAX_MSGS 32
				68	#define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail)
				69
				70	/*
				71	* type of a PMU register (bitmask).
				72	* bitmask structure:
				73	* bit0 : register implemented
				74	* bit1 : end marker
				75	* bit2-3 : reserved
				76	* bit4 : pmc has pmc.pm
				77	* bit5 : pmc controls a counter (has pmc.oi), pmd is used as counter
				78	* bit6-7 : register type
				79	* bit8-31: reserved
				80	*/
				81	#define PFM_REG_NOTIMPL 0x0 /* not implemented at all */
				82	#define PFM_REG_IMPL 0x1 /* register implemented */
				83	#define PFM_REG_END 0x2 /* end marker */
				84	#define PFM_REG_MONITOR (0x1<<4\|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
				85	#define PFM_REG_COUNTING (0x2<<4\|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */
				86	#define PFM_REG_CONTROL (0x4<<4\|PFM_REG_IMPL) /* PMU control register */
				87	#define PFM_REG_CONFIG (0x8<<4\|PFM_REG_IMPL) /* configuration register */
				88	#define PFM_REG_BUFFER (0xc<<4\|PFM_REG_IMPL) /* PMD used as buffer */
				89
				90	#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END)
				91	#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END)
				92
				93	#define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)
				94
				95	/* i assumed unsigned */
				96	#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL))
				97	#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL))
				98
				99	/* XXX: these assume that register i is implemented */
				100	#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
				101	#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
				102	#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR)
				103	#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL)
				104
				105	#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value
				106	#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask
				107	#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0]
				108	#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0]
				109
				110	#define PFM_NUM_IBRS IA64_NUM_DBG_REGS
				111	#define PFM_NUM_DBRS IA64_NUM_DBG_REGS
				112
				113	#define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0)
				114	#define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling)
				115	#define PFM_CTX_TASK(h) (h)->ctx_task
				116
				117	#define PMU_PMC_OI 5 /* position of pmc.oi bit */
				118
				119	/* XXX: does not support more than 64 PMDs */
				120	#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] \|= (mask)
				121	#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
				122
				123	#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] \|= (mask)
				124
				125	#define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] \|= 1UL<< ((n) % 64)
				126	#define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] \|= 1UL<< ((n) % 64)
				127	#define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
				128	#define PFM_CODE_RR 0 /* requesting code range restriction */
				129	#define PFM_DATA_RR 1 /* requestion data range restriction */
				130
				131	#define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v)
				132	#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) \|= (v)
				133	#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info)
				134
				135	#define RDEP(x) (1UL<<(x))
				136
				137	/*
				138	* context protection macros
				139	* in SMP:
				140	* - we need to protect against CPU concurrency (spin_lock)
				141	* - we need to protect against PMU overflow interrupts (local_irq_disable)
				142	* in UP:
				143	* - we need to protect against PMU overflow interrupts (local_irq_disable)
				144	*
				145	* spin_lock_irqsave()/spin_lock_irqrestore():
				146	* in SMP: local_irq_disable + spin_lock
				147	* in UP : local_irq_disable
				148	*
				149	* spin_lock()/spin_lock():
				150	* in UP : removed automatically
				151	* in SMP: protect against context accesses from other CPU. interrupts
				152	* are not masked. This is useful for the PMU interrupt handler
				153	* because we know we will not get PMU concurrency in that code.
				154	*/
				155	#define PROTECT_CTX(c, f) \
				156	do { \
				157	DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \
				158	spin_lock_irqsave(&(c)->ctx_lock, f); \
				159	DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \
				160	} while(0)
				161
				162	#define UNPROTECT_CTX(c, f) \
				163	do { \
				164	DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \
				165	spin_unlock_irqrestore(&(c)->ctx_lock, f); \
				166	} while(0)
				167
				168	#define PROTECT_CTX_NOPRINT(c, f) \
				169	do { \
				170	spin_lock_irqsave(&(c)->ctx_lock, f); \
				171	} while(0)
				172
				173
				174	#define UNPROTECT_CTX_NOPRINT(c, f) \
				175	do { \
				176	spin_unlock_irqrestore(&(c)->ctx_lock, f); \
				177	} while(0)
				178
				179
				180	#define PROTECT_CTX_NOIRQ(c) \
				181	do { \
				182	spin_lock(&(c)->ctx_lock); \
				183	} while(0)
				184
				185	#define UNPROTECT_CTX_NOIRQ(c) \
				186	do { \
				187	spin_unlock(&(c)->ctx_lock); \
				188	} while(0)
				189
				190
				191	#ifdef CONFIG_SMP
				192
				193	#define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)
				194	#define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++
				195	#define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION()
				196
				197	#else /* !CONFIG_SMP */
				198	#define SET_ACTIVATION(t) do {} while(0)
				199	#define GET_ACTIVATION(t) do {} while(0)
				200	#define INC_ACTIVATION(t) do {} while(0)
				201	#endif /* CONFIG_SMP */
				202
				203	#define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0)
				204	#define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner)
				205	#define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx)
				206
				207	#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g)
				208	#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g)
				209
				210	#define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags \|= (val); } while(0)
				211
				212	/*
				213	* cmp0 must be the value of pmc0
				214	*/
				215	#define PMC0_HAS_OVFL(cmp0) (cmp0 & ~0x1UL)
				216
				217	#define PFMFS_MAGIC 0xa0b4d889
				218
				219	/*
				220	* debugging
				221	*/
				222	#define PFM_DEBUGGING 1
				223	#ifdef PFM_DEBUGGING
				224	#define DPRINT(a) \
				225	do { \
				226	if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
				227	} while (0)
				228
				229	#define DPRINT_ovfl(a) \
				230	do { \
				231	if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
				232	} while (0)
				233	#endif
				234
				235	/*
				236	* 64-bit software counter structure
				237	*
				238	* the next_reset_type is applied to the next call to pfm_reset_regs()
				239	*/
				240	typedef struct {
				241	unsigned long val; /* virtual 64bit counter value */
				242	unsigned long lval; /* last reset value */
				243	unsigned long long_reset; /* reset value on sampling overflow */
				244	unsigned long short_reset; /* reset value on overflow */
				245	unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */
				246	unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */
				247	unsigned long seed; /* seed for random-number generator */
				248	unsigned long mask; /* mask for random-number generator */
				249	unsigned int flags; /* notify/do not notify */
				250	unsigned long eventid; /* overflow event identifier */
				251	} pfm_counter_t;
				252
				253	/*
				254	* context flags
				255	*/
				256	typedef struct {
				257	unsigned int block:1; /* when 1, task will blocked on user notifications */
				258	unsigned int system:1; /* do system wide monitoring */
				259	unsigned int using_dbreg:1; /* using range restrictions (debug registers) */
				260	unsigned int is_sampling:1; /* true if using a custom format */
				261	unsigned int excl_idle:1; /* exclude idle task in system wide session */
				262	unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */
				263	unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */
				264	unsigned int no_msg:1; /* no message sent on overflow */
				265	unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */
				266	unsigned int reserved:22;
				267	} pfm_context_flags_t;
				268
				269	#define PFM_TRAP_REASON_NONE 0x0 /* default value */
				270	#define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */
				271	#define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */
				272
				273
				274	/*
				275	* perfmon context: encapsulates all the state of a monitoring session
				276	*/
				277
				278	typedef struct pfm_context {
				279	spinlock_t ctx_lock; /* context protection */
				280
				281	pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */
				282	unsigned int ctx_state; /* state: active/inactive (no bitfield) */
				283
				284	struct task_struct ctx_task; / task to which context is attached */
				285
				286	unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */
				287
				288	struct semaphore ctx_restart_sem; /* use for blocking notification mode */
				289
				290	unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */
				291	unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */
				292	unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */
				293
				294	unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */
				295	unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */
				296	unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */
				297
				298	unsigned long ctx_pmcs[IA64_NUM_PMC_REGS]; /* saved copies of PMC values */
				299
				300	unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */
				301	unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */
				302	unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */
				303	unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */
				304
				305	pfm_counter_t ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */
				306
				307	u64 ctx_saved_psr_up; /* only contains psr.up value */
				308
				309	unsigned long ctx_last_activation; /* context last activation number for last_cpu */
				310	unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */
				311	unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */
				312
				313	int ctx_fd; /* file descriptor used my this context */
				314	pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */
				315
				316	pfm_buffer_fmt_t ctx_buf_fmt; / buffer format callbacks */
				317	void ctx_smpl_hdr; / points to sampling buffer header kernel vaddr */
				318	unsigned long ctx_smpl_size; /* size of sampling buffer */
				319	void ctx_smpl_vaddr; / user level virtual address of smpl buffer */
				320
				321	wait_queue_head_t ctx_msgq_wait;
				322	pfm_msg_t ctx_msgq[PFM_MAX_MSGS];
				323	int ctx_msgq_head;
				324	int ctx_msgq_tail;
				325	struct fasync_struct *ctx_async_queue;
				326
				327	wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */
				328	} pfm_context_t;
				329
				330	/*
				331	* magic number used to verify that structure is really
				332	* a perfmon context
				333	*/
				334	#define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops)
				335
				336	#define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context)
				337
				338	#ifdef CONFIG_SMP
				339	#define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v)
				340	#define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu
				341	#else
				342	#define SET_LAST_CPU(ctx, v) do {} while(0)
				343	#define GET_LAST_CPU(ctx) do {} while(0)
				344	#endif
				345
				346
				347	#define ctx_fl_block ctx_flags.block
				348	#define ctx_fl_system ctx_flags.system
				349	#define ctx_fl_using_dbreg ctx_flags.using_dbreg
				350	#define ctx_fl_is_sampling ctx_flags.is_sampling
				351	#define ctx_fl_excl_idle ctx_flags.excl_idle
				352	#define ctx_fl_going_zombie ctx_flags.going_zombie
				353	#define ctx_fl_trap_reason ctx_flags.trap_reason
				354	#define ctx_fl_no_msg ctx_flags.no_msg
				355	#define ctx_fl_can_restart ctx_flags.can_restart
				356
				357	#define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0);
				358	#define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking
				359
				360	/*
				361	* global information about all sessions
				362	* mostly used to synchronize between system wide and per-process
				363	*/
				364	typedef struct {
				365	spinlock_t pfs_lock; /* lock the structure */
				366
				367	unsigned int pfs_task_sessions; /* number of per task sessions */
				368	unsigned int pfs_sys_sessions; /* number of per system wide sessions */
				369	unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */
				370	unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */
				371	struct task_struct pfs_sys_session[NR_CPUS]; / point to task owning a system-wide session */
				372	} pfm_session_t;
				373
				374	/*
				375	* information about a PMC or PMD.
				376	* dep_pmd[]: a bitmask of dependent PMD registers
				377	* dep_pmc[]: a bitmask of dependent PMC registers
				378	*/
				379	typedef int (pfm_reg_check_t)(struct task_struct task, pfm_context_t ctx, unsigned int cnum, unsigned long val, struct pt_regs *regs);
				380	typedef struct {
				381	unsigned int type;
				382	int pm_pos;
				383	unsigned long default_value; /* power-on default value */
				384	unsigned long reserved_mask; /* bitmask of reserved bits */
				385	pfm_reg_check_t read_check;
				386	pfm_reg_check_t write_check;
				387	unsigned long dep_pmd[4];
				388	unsigned long dep_pmc[4];
				389	} pfm_reg_desc_t;
				390
				391	/* assume cnum is a valid monitor */
				392	#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1)
				393
				394	/*
				395	* This structure is initialized at boot time and contains
				396	* a description of the PMU main characteristics.
				397	*
				398	* If the probe function is defined, detection is based
				399	* on its return value:
				400	* - 0 means recognized PMU
				401	* - anything else means not supported
				402	* When the probe function is not defined, then the pmu_family field
				403	* is used and it must match the host CPU family such that:
				404	* - cpu->family & config->pmu_family != 0
				405	*/
				406	typedef struct {
				407	unsigned long ovfl_val; /* overflow value for counters */
				408
				409	pfm_reg_desc_t pmc_desc; / detailed PMC register dependencies descriptions */
				410	pfm_reg_desc_t pmd_desc; / detailed PMD register dependencies descriptions */
				411
				412	unsigned int num_pmcs; /* number of PMCS: computed at init time */
				413	unsigned int num_pmds; /* number of PMDS: computed at init time */
				414	unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */
				415	unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */
				416
				417	char pmu_name; / PMU family name */
				418	unsigned int pmu_family; /* cpuid family pattern used to identify pmu */
				419	unsigned int flags; /* pmu specific flags */
				420	unsigned int num_ibrs; /* number of IBRS: computed at init time */
				421	unsigned int num_dbrs; /* number of DBRS: computed at init time */
				422	unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */
				423	int (probe)(void); / customized probe routine */
				424	unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */
				425	} pmu_config_t;
				426	/*
				427	* PMU specific flags
				428	*/
				429	#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */
				430
				431	/*
				432	* debug register related type definitions
				433	*/
				434	typedef struct {
				435	unsigned long ibr_mask:56;
				436	unsigned long ibr_plm:4;
				437	unsigned long ibr_ig:3;
				438	unsigned long ibr_x:1;
				439	} ibr_mask_reg_t;
				440
				441	typedef struct {
				442	unsigned long dbr_mask:56;
				443	unsigned long dbr_plm:4;
				444	unsigned long dbr_ig:2;
				445	unsigned long dbr_w:1;
				446	unsigned long dbr_r:1;
				447	} dbr_mask_reg_t;
				448
				449	typedef union {
				450	unsigned long val;
				451	ibr_mask_reg_t ibr;
				452	dbr_mask_reg_t dbr;
				453	} dbreg_t;
				454
				455
				456	/*
				457	* perfmon command descriptions
				458	*/
				459	typedef struct {
				460	int (cmd_func)(pfm_context_t ctx, void arg, int count, struct pt_regs regs);
				461	char *cmd_name;
				462	int cmd_flags;
				463	unsigned int cmd_narg;
				464	size_t cmd_argsize;
				465	int (cmd_getsize)(void arg, size_t *sz);
				466	} pfm_cmd_desc_t;
				467
				468	#define PFM_CMD_FD 0x01 /* command requires a file descriptor */
				469	#define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */
				470	#define PFM_CMD_ARG_RW 0x04 /* command must read/write argument(s) */
				471	#define PFM_CMD_STOP 0x08 /* command does not work on zombie context */
				472
				473
				474	#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name
				475	#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ)
				476	#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW)
				477	#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD)
				478	#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP)
				479
				480	#define PFM_CMD_ARG_MANY -1 /* cannot be zero */
				481
				482	typedef struct {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	483	unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
				484	unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */
				485	unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
				486	unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */
				487	unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */
				488	unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */
				489	unsigned long pfm_smpl_handler_calls;
				490	unsigned long pfm_smpl_handler_cycles;
				491	char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
				492	} pfm_stats_t;
				493
				494	/*
				495	* perfmon internal variables
				496	*/
				497	static pfm_stats_t pfm_stats[NR_CPUS];
				498	static pfm_session_t pfm_sessions; /* global sessions information */
				499
Tony Luck	fe12e25	2005-05-18 17:09:06 -0700	[diff] [blame]	500	static spinlock_t pfm_alt_install_check = SPIN_LOCK_UNLOCKED;
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	501	static pfm_intr_handler_desc_t *pfm_alt_intr_handler;
				502
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	503	static struct proc_dir_entry *perfmon_dir;
				504	static pfm_uuid_t pfm_null_uuid = {0,};
				505
				506	static spinlock_t pfm_buffer_fmt_lock;
				507	static LIST_HEAD(pfm_buffer_fmt_list);
				508
				509	static pmu_config_t *pmu_conf;
				510
				511	/* sysctl() controls */
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	512	pfm_sysctl_t pfm_sysctl;
				513	EXPORT_SYMBOL(pfm_sysctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	514
				515	static ctl_table pfm_ctl_table[]={
				516	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
				517	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
				518	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
				519	{4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
				520	{ 0, },
				521	};
				522	static ctl_table pfm_sysctl_dir[] = {
				523	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
				524	{0,},
				525	};
				526	static ctl_table pfm_sysctl_root[] = {
				527	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
				528	{0,},
				529	};
				530	static struct ctl_table_header *pfm_sysctl_header;
				531
				532	static int pfm_context_unload(pfm_context_t ctx, void arg, int count, struct pt_regs *regs);
				533	static int pfm_flush(struct file *filp);
				534
				535	#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v)
				536	#define pfm_get_cpu_data(a,b) per_cpu(a, b)
				537
				538	static inline void
				539	pfm_put_task(struct task_struct *task)
				540	{
				541	if (task != current) put_task_struct(task);
				542	}
				543
				544	static inline void
				545	pfm_set_task_notify(struct task_struct *task)
				546	{
				547	struct thread_info *info;
				548
				549	info = (struct thread_info ) ((char ) task + IA64_TASK_SIZE);
				550	set_bit(TIF_NOTIFY_RESUME, &info->flags);
				551	}
				552
				553	static inline void
				554	pfm_clear_task_notify(void)
				555	{
				556	clear_thread_flag(TIF_NOTIFY_RESUME);
				557	}
				558
				559	static inline void
				560	pfm_reserve_page(unsigned long a)
				561	{
				562	SetPageReserved(vmalloc_to_page((void *)a));
				563	}
				564	static inline void
				565	pfm_unreserve_page(unsigned long a)
				566	{
				567	ClearPageReserved(vmalloc_to_page((void*)a));
				568	}
				569
				570	static inline unsigned long
				571	pfm_protect_ctx_ctxsw(pfm_context_t *x)
				572	{
				573	spin_lock(&(x)->ctx_lock);
				574	return 0UL;
				575	}
				576
				577	static inline unsigned long
				578	pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
				579	{
				580	spin_unlock(&(x)->ctx_lock);
				581	}
				582
				583	static inline unsigned int
				584	pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
				585	{
				586	return do_munmap(mm, addr, len);
				587	}
				588
				589	static inline unsigned long
				590	pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
				591	{
				592	return get_unmapped_area(file, addr, len, pgoff, flags);
				593	}
				594
				595
				596	static struct super_block *
				597	pfmfs_get_sb(struct file_system_type fs_type, int flags, const char dev_name, void *data)
				598	{
				599	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
				600	}
				601
				602	static struct file_system_type pfm_fs_type = {
				603	.name = "pfmfs",
				604	.get_sb = pfmfs_get_sb,
				605	.kill_sb = kill_anon_super,
				606	};
				607
				608	DEFINE_PER_CPU(unsigned long, pfm_syst_info);
				609	DEFINE_PER_CPU(struct task_struct *, pmu_owner);
				610	DEFINE_PER_CPU(pfm_context_t *, pmu_ctx);
				611	DEFINE_PER_CPU(unsigned long, pmu_activation_number);
Tony Luck	fffcc15	2005-05-31 10:38:32 -0700	[diff] [blame]	612	EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	613
				614
				615	/* forward declaration */
				616	static struct file_operations pfm_file_ops;
				617
				618	/*
				619	* forward declarations
				620	*/
				621	#ifndef CONFIG_SMP
				622	static void pfm_lazy_save_regs (struct task_struct *ta);
				623	#endif
				624
				625	void dump_pmu_state(const char *);
				626	static int pfm_write_ibr_dbr(int mode, pfm_context_t ctx, void arg, int count, struct pt_regs *regs);
				627
				628	#include "perfmon_itanium.h"
				629	#include "perfmon_mckinley.h"
				630	#include "perfmon_generic.h"
				631
				632	static pmu_config_t *pmu_confs[]={
				633	&pmu_conf_mck,
				634	&pmu_conf_ita,
				635	&pmu_conf_gen, /* must be last */
				636	NULL
				637	};
				638
				639
				640	static int pfm_end_notify_user(pfm_context_t *ctx);
				641
				642	static inline void
				643	pfm_clear_psr_pp(void)
				644	{
				645	ia64_rsm(IA64_PSR_PP);
				646	ia64_srlz_i();
				647	}
				648
				649	static inline void
				650	pfm_set_psr_pp(void)
				651	{
				652	ia64_ssm(IA64_PSR_PP);
				653	ia64_srlz_i();
				654	}
				655
				656	static inline void
				657	pfm_clear_psr_up(void)
				658	{
				659	ia64_rsm(IA64_PSR_UP);
				660	ia64_srlz_i();
				661	}
				662
				663	static inline void
				664	pfm_set_psr_up(void)
				665	{
				666	ia64_ssm(IA64_PSR_UP);
				667	ia64_srlz_i();
				668	}
				669
				670	static inline unsigned long
				671	pfm_get_psr(void)
				672	{
				673	unsigned long tmp;
				674	tmp = ia64_getreg(_IA64_REG_PSR);
				675	ia64_srlz_i();
				676	return tmp;
				677	}
				678
				679	static inline void
				680	pfm_set_psr_l(unsigned long val)
				681	{
				682	ia64_setreg(_IA64_REG_PSR_L, val);
				683	ia64_srlz_i();
				684	}
				685
				686	static inline void
				687	pfm_freeze_pmu(void)
				688	{
				689	ia64_set_pmc(0,1UL);
				690	ia64_srlz_d();
				691	}
				692
				693	static inline void
				694	pfm_unfreeze_pmu(void)
				695	{
				696	ia64_set_pmc(0,0UL);
				697	ia64_srlz_d();
				698	}
				699
				700	static inline void
				701	pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs)
				702	{
				703	int i;
				704
				705	for (i=0; i < nibrs; i++) {
				706	ia64_set_ibr(i, ibrs[i]);
				707	ia64_dv_serialize_instruction();
				708	}
				709	ia64_srlz_i();
				710	}
				711
				712	static inline void
				713	pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs)
				714	{
				715	int i;
				716
				717	for (i=0; i < ndbrs; i++) {
				718	ia64_set_dbr(i, dbrs[i]);
				719	ia64_dv_serialize_data();
				720	}
				721	ia64_srlz_d();
				722	}
				723
				724	/*
				725	* PMD[i] must be a counter. no check is made
				726	*/
				727	static inline unsigned long
				728	pfm_read_soft_counter(pfm_context_t *ctx, int i)
				729	{
				730	return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val);
				731	}
				732
				733	/*
				734	* PMD[i] must be a counter. no check is made
				735	*/
				736	static inline void
				737	pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
				738	{
				739	unsigned long ovfl_val = pmu_conf->ovfl_val;
				740
				741	ctx->ctx_pmds[i].val = val & ~ovfl_val;
				742	/*
				743	* writing to unimplemented part is ignore, so we do not need to
				744	* mask off top part
				745	*/
				746	ia64_set_pmd(i, val & ovfl_val);
				747	}
				748
				749	static pfm_msg_t *
				750	pfm_get_new_msg(pfm_context_t *ctx)
				751	{
				752	int idx, next;
				753
				754	next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS;
				755
				756	DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
				757	if (next == ctx->ctx_msgq_head) return NULL;
				758
				759	idx = ctx->ctx_msgq_tail;
				760	ctx->ctx_msgq_tail = next;
				761
				762	DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx));
				763
				764	return ctx->ctx_msgq+idx;
				765	}
				766
				767	static pfm_msg_t *
				768	pfm_get_next_msg(pfm_context_t *ctx)
				769	{
				770	pfm_msg_t *msg;
				771
				772	DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
				773
				774	if (PFM_CTXQ_EMPTY(ctx)) return NULL;
				775
				776	/*
				777	* get oldest message
				778	*/
				779	msg = ctx->ctx_msgq+ctx->ctx_msgq_head;
				780
				781	/*
				782	* and move forward
				783	*/
				784	ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS;
				785
				786	DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type));
				787
				788	return msg;
				789	}
				790
				791	static void
				792	pfm_reset_msgq(pfm_context_t *ctx)
				793	{
				794	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
				795	DPRINT(("ctx=%p msgq reset\n", ctx));
				796	}
				797
				798	static void *
				799	pfm_rvmalloc(unsigned long size)
				800	{
				801	void *mem;
				802	unsigned long addr;
				803
				804	size = PAGE_ALIGN(size);
				805	mem = vmalloc(size);
				806	if (mem) {
				807	//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
				808	memset(mem, 0, size);
				809	addr = (unsigned long)mem;
				810	while (size > 0) {
				811	pfm_reserve_page(addr);
				812	addr+=PAGE_SIZE;
				813	size-=PAGE_SIZE;
				814	}
				815	}
				816	return mem;
				817	}
				818
				819	static void
				820	pfm_rvfree(void *mem, unsigned long size)
				821	{
				822	unsigned long addr;
				823
				824	if (mem) {
				825	DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
				826	addr = (unsigned long) mem;
				827	while ((long) size > 0) {
				828	pfm_unreserve_page(addr);
				829	addr+=PAGE_SIZE;
				830	size-=PAGE_SIZE;
				831	}
				832	vfree(mem);
				833	}
				834	return;
				835	}
				836
				837	static pfm_context_t *
				838	pfm_context_alloc(void)
				839	{
				840	pfm_context_t *ctx;
				841
				842	/*
				843	* allocate context descriptor
				844	* must be able to free with interrupts disabled
				845	*/
				846	ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
				847	if (ctx) {
				848	memset(ctx, 0, sizeof(pfm_context_t));
				849	DPRINT(("alloc ctx @%p\n", ctx));
				850	}
				851	return ctx;
				852	}
				853
				854	static void
				855	pfm_context_free(pfm_context_t *ctx)
				856	{
				857	if (ctx) {
				858	DPRINT(("free ctx @%p\n", ctx));
				859	kfree(ctx);
				860	}
				861	}
				862
				863	static void
				864	pfm_mask_monitoring(struct task_struct *task)
				865	{
				866	pfm_context_t *ctx = PFM_GET_CTX(task);
				867	struct thread_struct *th = &task->thread;
				868	unsigned long mask, val, ovfl_mask;
				869	int i;
				870
				871	DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid));
				872
				873	ovfl_mask = pmu_conf->ovfl_val;
				874	/*
				875	* monitoring can only be masked as a result of a valid
				876	* counter overflow. In UP, it means that the PMU still
				877	* has an owner. Note that the owner can be different
				878	* from the current task. However the PMU state belongs
				879	* to the owner.
				880	* In SMP, a valid overflow only happens when task is
				881	* current. Therefore if we come here, we know that
				882	* the PMU state belongs to the current task, therefore
				883	* we can access the live registers.
				884	*
				885	* So in both cases, the live register contains the owner's
				886	* state. We can ONLY touch the PMU registers and NOT the PSR.
				887	*
				888	* As a consequence to this call, the thread->pmds[] array
				889	* contains stale information which must be ignored
				890	* when context is reloaded AND monitoring is active (see
				891	* pfm_restart).
				892	*/
				893	mask = ctx->ctx_used_pmds[0];
				894	for (i = 0; mask; i++, mask>>=1) {
				895	/* skip non used pmds */
				896	if ((mask & 0x1) == 0) continue;
				897	val = ia64_get_pmd(i);
				898
				899	if (PMD_IS_COUNTING(i)) {
				900	/*
				901	* we rebuild the full 64 bit value of the counter
				902	*/
				903	ctx->ctx_pmds[i].val += (val & ovfl_mask);
				904	} else {
				905	ctx->ctx_pmds[i].val = val;
				906	}
				907	DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
				908	i,
				909	ctx->ctx_pmds[i].val,
				910	val & ovfl_mask));
				911	}
				912	/*
				913	* mask monitoring by setting the privilege level to 0
				914	* we cannot use psr.pp/psr.up for this, it is controlled by
				915	* the user
				916	*
				917	* if task is current, modify actual registers, otherwise modify
				918	* thread save state, i.e., what will be restored in pfm_load_regs()
				919	*/
				920	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
				921	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
				922	if ((mask & 0x1) == 0UL) continue;
				923	ia64_set_pmc(i, th->pmcs[i] & ~0xfUL);
				924	th->pmcs[i] &= ~0xfUL;
				925	DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i]));
				926	}
				927	/*
				928	* make all of this visible
				929	*/
				930	ia64_srlz_d();
				931	}
				932
				933	/*
				934	* must always be done with task == current
				935	*
				936	* context must be in MASKED state when calling
				937	*/
				938	static void
				939	pfm_restore_monitoring(struct task_struct *task)
				940	{
				941	pfm_context_t *ctx = PFM_GET_CTX(task);
				942	struct thread_struct *th = &task->thread;
				943	unsigned long mask, ovfl_mask;
				944	unsigned long psr, val;
				945	int i, is_system;
				946
				947	is_system = ctx->ctx_fl_system;
				948	ovfl_mask = pmu_conf->ovfl_val;
				949
				950	if (task != current) {
				951	printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid);
				952	return;
				953	}
				954	if (ctx->ctx_state != PFM_CTX_MASKED) {
				955	printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__,
				956	task->pid, current->pid, ctx->ctx_state);
				957	return;
				958	}
				959	psr = pfm_get_psr();
				960	/*
				961	* monitoring is masked via the PMC.
				962	* As we restore their value, we do not want each counter to
				963	* restart right away. We stop monitoring using the PSR,
				964	* restore the PMC (and PMD) and then re-establish the psr
				965	* as it was. Note that there can be no pending overflow at
				966	* this point, because monitoring was MASKED.
				967	*
				968	* system-wide session are pinned and self-monitoring
				969	*/
				970	if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
				971	/* disable dcr pp */
				972	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
				973	pfm_clear_psr_pp();
				974	} else {
				975	pfm_clear_psr_up();
				976	}
				977	/*
				978	* first, we restore the PMD
				979	*/
				980	mask = ctx->ctx_used_pmds[0];
				981	for (i = 0; mask; i++, mask>>=1) {
				982	/* skip non used pmds */
				983	if ((mask & 0x1) == 0) continue;
				984
				985	if (PMD_IS_COUNTING(i)) {
				986	/*
				987	* we split the 64bit value according to
				988	* counter width
				989	*/
				990	val = ctx->ctx_pmds[i].val & ovfl_mask;
				991	ctx->ctx_pmds[i].val &= ~ovfl_mask;
				992	} else {
				993	val = ctx->ctx_pmds[i].val;
				994	}
				995	ia64_set_pmd(i, val);
				996
				997	DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
				998	i,
				999	ctx->ctx_pmds[i].val,
				1000	val));
				1001	}
				1002	/*
				1003	* restore the PMCs
				1004	*/
				1005	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
				1006	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
				1007	if ((mask & 0x1) == 0UL) continue;
				1008	th->pmcs[i] = ctx->ctx_pmcs[i];
				1009	ia64_set_pmc(i, th->pmcs[i]);
				1010	DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i]));
				1011	}
				1012	ia64_srlz_d();
				1013
				1014	/*
				1015	* must restore DBR/IBR because could be modified while masked
				1016	* XXX: need to optimize
				1017	*/
				1018	if (ctx->ctx_fl_using_dbreg) {
				1019	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				1020	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				1021	}
				1022
				1023	/*
				1024	* now restore PSR
				1025	*/
				1026	if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
				1027	/* enable dcr pp */
				1028	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) \| IA64_DCR_PP);
				1029	ia64_srlz_i();
				1030	}
				1031	pfm_set_psr_l(psr);
				1032	}
				1033
				1034	static inline void
				1035	pfm_save_pmds(unsigned long *pmds, unsigned long mask)
				1036	{
				1037	int i;
				1038
				1039	ia64_srlz_d();
				1040
				1041	for (i=0; mask; i++, mask>>=1) {
				1042	if (mask & 0x1) pmds[i] = ia64_get_pmd(i);
				1043	}
				1044	}
				1045
				1046	/*
				1047	* reload from thread state (used for ctxw only)
				1048	*/
				1049	static inline void
				1050	pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
				1051	{
				1052	int i;
				1053	unsigned long val, ovfl_val = pmu_conf->ovfl_val;
				1054
				1055	for (i=0; mask; i++, mask>>=1) {
				1056	if ((mask & 0x1) == 0) continue;
				1057	val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i];
				1058	ia64_set_pmd(i, val);
				1059	}
				1060	ia64_srlz_d();
				1061	}
				1062
				1063	/*
				1064	* propagate PMD from context to thread-state
				1065	*/
				1066	static inline void
				1067	pfm_copy_pmds(struct task_struct task, pfm_context_t ctx)
				1068	{
				1069	struct thread_struct *thread = &task->thread;
				1070	unsigned long ovfl_val = pmu_conf->ovfl_val;
				1071	unsigned long mask = ctx->ctx_all_pmds[0];
				1072	unsigned long val;
				1073	int i;
				1074
				1075	DPRINT(("mask=0x%lx\n", mask));
				1076
				1077	for (i=0; mask; i++, mask>>=1) {
				1078
				1079	val = ctx->ctx_pmds[i].val;
				1080
				1081	/*
				1082	* We break up the 64 bit value into 2 pieces
				1083	* the lower bits go to the machine state in the
				1084	* thread (will be reloaded on ctxsw in).
				1085	* The upper part stays in the soft-counter.
				1086	*/
				1087	if (PMD_IS_COUNTING(i)) {
				1088	ctx->ctx_pmds[i].val = val & ~ovfl_val;
				1089	val &= ovfl_val;
				1090	}
				1091	thread->pmds[i] = val;
				1092
				1093	DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n",
				1094	i,
				1095	thread->pmds[i],
				1096	ctx->ctx_pmds[i].val));
				1097	}
				1098	}
				1099
				1100	/*
				1101	* propagate PMC from context to thread-state
				1102	*/
				1103	static inline void
				1104	pfm_copy_pmcs(struct task_struct task, pfm_context_t ctx)
				1105	{
				1106	struct thread_struct *thread = &task->thread;
				1107	unsigned long mask = ctx->ctx_all_pmcs[0];
				1108	int i;
				1109
				1110	DPRINT(("mask=0x%lx\n", mask));
				1111
				1112	for (i=0; mask; i++, mask>>=1) {
				1113	/* masking 0 with ovfl_val yields 0 */
				1114	thread->pmcs[i] = ctx->ctx_pmcs[i];
				1115	DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i]));
				1116	}
				1117	}
				1118
				1119
				1120
				1121	static inline void
				1122	pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask)
				1123	{
				1124	int i;
				1125
				1126	for (i=0; mask; i++, mask>>=1) {
				1127	if ((mask & 0x1) == 0) continue;
				1128	ia64_set_pmc(i, pmcs[i]);
				1129	}
				1130	ia64_srlz_d();
				1131	}
				1132
				1133	static inline int
				1134	pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b)
				1135	{
				1136	return memcmp(a, b, sizeof(pfm_uuid_t));
				1137	}
				1138
				1139	static inline int
				1140	pfm_buf_fmt_exit(pfm_buffer_fmt_t fmt, struct task_struct task, void buf, struct pt_regs regs)
				1141	{
				1142	int ret = 0;
				1143	if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs);
				1144	return ret;
				1145	}
				1146
				1147	static inline int
				1148	pfm_buf_fmt_getsize(pfm_buffer_fmt_t fmt, struct task_struct task, unsigned int flags, int cpu, void arg, unsigned long size)
				1149	{
				1150	int ret = 0;
				1151	if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size);
				1152	return ret;
				1153	}
				1154
				1155
				1156	static inline int
				1157	pfm_buf_fmt_validate(pfm_buffer_fmt_t fmt, struct task_struct task, unsigned int flags,
				1158	int cpu, void *arg)
				1159	{
				1160	int ret = 0;
				1161	if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg);
				1162	return ret;
				1163	}
				1164
				1165	static inline int
				1166	pfm_buf_fmt_init(pfm_buffer_fmt_t fmt, struct task_struct task, void *buf, unsigned int flags,
				1167	int cpu, void *arg)
				1168	{
				1169	int ret = 0;
				1170	if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg);
				1171	return ret;
				1172	}
				1173
				1174	static inline int
				1175	pfm_buf_fmt_restart(pfm_buffer_fmt_t fmt, struct task_struct task, pfm_ovfl_ctrl_t ctrl, void buf, struct pt_regs *regs)
				1176	{
				1177	int ret = 0;
				1178	if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs);
				1179	return ret;
				1180	}
				1181
				1182	static inline int
				1183	pfm_buf_fmt_restart_active(pfm_buffer_fmt_t fmt, struct task_struct task, pfm_ovfl_ctrl_t ctrl, void buf, struct pt_regs *regs)
				1184	{
				1185	int ret = 0;
				1186	if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs);
				1187	return ret;
				1188	}
				1189
				1190	static pfm_buffer_fmt_t *
				1191	__pfm_find_buffer_fmt(pfm_uuid_t uuid)
				1192	{
				1193	struct list_head * pos;
				1194	pfm_buffer_fmt_t * entry;
				1195
				1196	list_for_each(pos, &pfm_buffer_fmt_list) {
				1197	entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
				1198	if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0)
				1199	return entry;
				1200	}
				1201	return NULL;
				1202	}
				1203
				1204	/*
				1205	* find a buffer format based on its uuid
				1206	*/
				1207	static pfm_buffer_fmt_t *
				1208	pfm_find_buffer_fmt(pfm_uuid_t uuid)
				1209	{
				1210	pfm_buffer_fmt_t * fmt;
				1211	spin_lock(&pfm_buffer_fmt_lock);
				1212	fmt = __pfm_find_buffer_fmt(uuid);
				1213	spin_unlock(&pfm_buffer_fmt_lock);
				1214	return fmt;
				1215	}
				1216
				1217	int
				1218	pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt)
				1219	{
				1220	int ret = 0;
				1221
				1222	/* some sanity checks */
				1223	if (fmt == NULL \|\| fmt->fmt_name == NULL) return -EINVAL;
				1224
				1225	/* we need at least a handler */
				1226	if (fmt->fmt_handler == NULL) return -EINVAL;
				1227
				1228	/*
				1229	* XXX: need check validity of fmt_arg_size
				1230	*/
				1231
				1232	spin_lock(&pfm_buffer_fmt_lock);
				1233
				1234	if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) {
				1235	printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name);
				1236	ret = -EBUSY;
				1237	goto out;
				1238	}
				1239	list_add(&fmt->fmt_list, &pfm_buffer_fmt_list);
				1240	printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name);
				1241
				1242	out:
				1243	spin_unlock(&pfm_buffer_fmt_lock);
				1244	return ret;
				1245	}
				1246	EXPORT_SYMBOL(pfm_register_buffer_fmt);
				1247
				1248	int
				1249	pfm_unregister_buffer_fmt(pfm_uuid_t uuid)
				1250	{
				1251	pfm_buffer_fmt_t *fmt;
				1252	int ret = 0;
				1253
				1254	spin_lock(&pfm_buffer_fmt_lock);
				1255
				1256	fmt = __pfm_find_buffer_fmt(uuid);
				1257	if (!fmt) {
				1258	printk(KERN_ERR "perfmon: cannot unregister format, not found\n");
				1259	ret = -EINVAL;
				1260	goto out;
				1261	}
				1262	list_del_init(&fmt->fmt_list);
				1263	printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name);
				1264
				1265	out:
				1266	spin_unlock(&pfm_buffer_fmt_lock);
				1267	return ret;
				1268
				1269	}
				1270	EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
				1271
Stephane Eranian	8df5a50	2005-04-11 13:45:00 -0700	[diff] [blame]	1272	extern void update_pal_halt_status(int);
				1273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1274	static int
				1275	pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
				1276	{
				1277	unsigned long flags;
				1278	/*
				1279	* validy checks on cpu_mask have been done upstream
				1280	*/
				1281	LOCK_PFS(flags);
				1282
				1283	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1284	pfm_sessions.pfs_sys_sessions,
				1285	pfm_sessions.pfs_task_sessions,
				1286	pfm_sessions.pfs_sys_use_dbregs,
				1287	is_syswide,
				1288	cpu));
				1289
				1290	if (is_syswide) {
				1291	/*
				1292	* cannot mix system wide and per-task sessions
				1293	*/
				1294	if (pfm_sessions.pfs_task_sessions > 0UL) {
				1295	DPRINT(("system wide not possible, %u conflicting task_sessions\n",
				1296	pfm_sessions.pfs_task_sessions));
				1297	goto abort;
				1298	}
				1299
				1300	if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict;
				1301
				1302	DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id()));
				1303
				1304	pfm_sessions.pfs_sys_session[cpu] = task;
				1305
				1306	pfm_sessions.pfs_sys_sessions++ ;
				1307
				1308	} else {
				1309	if (pfm_sessions.pfs_sys_sessions) goto abort;
				1310	pfm_sessions.pfs_task_sessions++;
				1311	}
				1312
				1313	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1314	pfm_sessions.pfs_sys_sessions,
				1315	pfm_sessions.pfs_task_sessions,
				1316	pfm_sessions.pfs_sys_use_dbregs,
				1317	is_syswide,
				1318	cpu));
				1319
Stephane Eranian	8df5a50	2005-04-11 13:45:00 -0700	[diff] [blame]	1320	/*
				1321	* disable default_idle() to go to PAL_HALT
				1322	*/
				1323	update_pal_halt_status(0);
				1324
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1325	UNLOCK_PFS(flags);
				1326
				1327	return 0;
				1328
				1329	error_conflict:
				1330	DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
				1331	pfm_sessions.pfs_sys_session[cpu]->pid,
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	1332	cpu));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1333	abort:
				1334	UNLOCK_PFS(flags);
				1335
				1336	return -EBUSY;
				1337
				1338	}
				1339
				1340	static int
				1341	pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
				1342	{
				1343	unsigned long flags;
				1344	/*
				1345	* validy checks on cpu_mask have been done upstream
				1346	*/
				1347	LOCK_PFS(flags);
				1348
				1349	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1350	pfm_sessions.pfs_sys_sessions,
				1351	pfm_sessions.pfs_task_sessions,
				1352	pfm_sessions.pfs_sys_use_dbregs,
				1353	is_syswide,
				1354	cpu));
				1355
				1356
				1357	if (is_syswide) {
				1358	pfm_sessions.pfs_sys_session[cpu] = NULL;
				1359	/*
				1360	* would not work with perfmon+more than one bit in cpu_mask
				1361	*/
				1362	if (ctx && ctx->ctx_fl_using_dbreg) {
				1363	if (pfm_sessions.pfs_sys_use_dbregs == 0) {
				1364	printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx);
				1365	} else {
				1366	pfm_sessions.pfs_sys_use_dbregs--;
				1367	}
				1368	}
				1369	pfm_sessions.pfs_sys_sessions--;
				1370	} else {
				1371	pfm_sessions.pfs_task_sessions--;
				1372	}
				1373	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
				1374	pfm_sessions.pfs_sys_sessions,
				1375	pfm_sessions.pfs_task_sessions,
				1376	pfm_sessions.pfs_sys_use_dbregs,
				1377	is_syswide,
				1378	cpu));
				1379
Stephane Eranian	8df5a50	2005-04-11 13:45:00 -0700	[diff] [blame]	1380	/*
				1381	* if possible, enable default_idle() to go into PAL_HALT
				1382	*/
				1383	if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0)
				1384	update_pal_halt_status(1);
				1385
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1386	UNLOCK_PFS(flags);
				1387
				1388	return 0;
				1389	}
				1390
				1391	/*
				1392	* removes virtual mapping of the sampling buffer.
				1393	* IMPORTANT: cannot be called with interrupts disable, e.g. inside
				1394	* a PROTECT_CTX() section.
				1395	*/
				1396	static int
				1397	pfm_remove_smpl_mapping(struct task_struct task, void vaddr, unsigned long size)
				1398	{
				1399	int r;
				1400
				1401	/* sanity checks */
				1402	if (task->mm == NULL \|\| size == 0UL \|\| vaddr == NULL) {
				1403	printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm);
				1404	return -EINVAL;
				1405	}
				1406
				1407	DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size));
				1408
				1409	/*
				1410	* does the actual unmapping
				1411	*/
				1412	down_write(&task->mm->mmap_sem);
				1413
				1414	DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size));
				1415
				1416	r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0);
				1417
				1418	up_write(&task->mm->mmap_sem);
				1419	if (r !=0) {
				1420	printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size);
				1421	}
				1422
				1423	DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r));
				1424
				1425	return 0;
				1426	}
				1427
				1428	/*
				1429	* free actual physical storage used by sampling buffer
				1430	*/
				1431	#if 0
				1432	static int
				1433	pfm_free_smpl_buffer(pfm_context_t *ctx)
				1434	{
				1435	pfm_buffer_fmt_t *fmt;
				1436
				1437	if (ctx->ctx_smpl_hdr == NULL) goto invalid_free;
				1438
				1439	/*
				1440	* we won't use the buffer format anymore
				1441	*/
				1442	fmt = ctx->ctx_buf_fmt;
				1443
				1444	DPRINT(("sampling buffer @%p size %lu vaddr=%p\n",
				1445	ctx->ctx_smpl_hdr,
				1446	ctx->ctx_smpl_size,
				1447	ctx->ctx_smpl_vaddr));
				1448
				1449	pfm_buf_fmt_exit(fmt, current, NULL, NULL);
				1450
				1451	/*
				1452	* free the buffer
				1453	*/
				1454	pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
				1455
				1456	ctx->ctx_smpl_hdr = NULL;
				1457	ctx->ctx_smpl_size = 0UL;
				1458
				1459	return 0;
				1460
				1461	invalid_free:
				1462	printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid);
				1463	return -EINVAL;
				1464	}
				1465	#endif
				1466
				1467	static inline void
				1468	pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
				1469	{
				1470	if (fmt == NULL) return;
				1471
				1472	pfm_buf_fmt_exit(fmt, current, NULL, NULL);
				1473
				1474	}
				1475
				1476	/*
				1477	* pfmfs should _never_ be mounted by userland - too much of security hassle,
				1478	* no real gain from having the whole whorehouse mounted. So we don't need
				1479	* any operations on the root directory. However, we need a non-trivial
				1480	* d_name - pfm: will go nicely and kill the special-casing in procfs.
				1481	*/
				1482	static struct vfsmount *pfmfs_mnt;
				1483
				1484	static int __init
				1485	init_pfm_fs(void)
				1486	{
				1487	int err = register_filesystem(&pfm_fs_type);
				1488	if (!err) {
				1489	pfmfs_mnt = kern_mount(&pfm_fs_type);
				1490	err = PTR_ERR(pfmfs_mnt);
				1491	if (IS_ERR(pfmfs_mnt))
				1492	unregister_filesystem(&pfm_fs_type);
				1493	else
				1494	err = 0;
				1495	}
				1496	return err;
				1497	}
				1498
				1499	static void __exit
				1500	exit_pfm_fs(void)
				1501	{
				1502	unregister_filesystem(&pfm_fs_type);
				1503	mntput(pfmfs_mnt);
				1504	}
				1505
				1506	static ssize_t
				1507	pfm_read(struct file filp, char __user buf, size_t size, loff_t *ppos)
				1508	{
				1509	pfm_context_t *ctx;
				1510	pfm_msg_t *msg;
				1511	ssize_t ret;
				1512	unsigned long flags;
				1513	DECLARE_WAITQUEUE(wait, current);
				1514	if (PFM_IS_FILE(filp) == 0) {
				1515	printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
				1516	return -EINVAL;
				1517	}
				1518
				1519	ctx = (pfm_context_t *)filp->private_data;
				1520	if (ctx == NULL) {
				1521	printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid);
				1522	return -EINVAL;
				1523	}
				1524
				1525	/*
				1526	* check even when there is no message
				1527	*/
				1528	if (size < sizeof(pfm_msg_t)) {
				1529	DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
				1530	return -EINVAL;
				1531	}
				1532
				1533	PROTECT_CTX(ctx, flags);
				1534
				1535	/*
				1536	* put ourselves on the wait queue
				1537	*/
				1538	add_wait_queue(&ctx->ctx_msgq_wait, &wait);
				1539
				1540
				1541	for(;;) {
				1542	/*
				1543	* check wait queue
				1544	*/
				1545
				1546	set_current_state(TASK_INTERRUPTIBLE);
				1547
				1548	DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
				1549
				1550	ret = 0;
				1551	if(PFM_CTXQ_EMPTY(ctx) == 0) break;
				1552
				1553	UNPROTECT_CTX(ctx, flags);
				1554
				1555	/*
				1556	* check non-blocking read
				1557	*/
				1558	ret = -EAGAIN;
				1559	if(filp->f_flags & O_NONBLOCK) break;
				1560
				1561	/*
				1562	* check pending signals
				1563	*/
				1564	if(signal_pending(current)) {
				1565	ret = -EINTR;
				1566	break;
				1567	}
				1568	/*
				1569	* no message, so wait
				1570	*/
				1571	schedule();
				1572
				1573	PROTECT_CTX(ctx, flags);
				1574	}
				1575	DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret));
				1576	set_current_state(TASK_RUNNING);
				1577	remove_wait_queue(&ctx->ctx_msgq_wait, &wait);
				1578
				1579	if (ret < 0) goto abort;
				1580
				1581	ret = -EINVAL;
				1582	msg = pfm_get_next_msg(ctx);
				1583	if (msg == NULL) {
				1584	printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid);
				1585	goto abort_locked;
				1586	}
				1587
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	1588	DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1589
				1590	ret = -EFAULT;
				1591	if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t);
				1592
				1593	abort_locked:
				1594	UNPROTECT_CTX(ctx, flags);
				1595	abort:
				1596	return ret;
				1597	}
				1598
				1599	static ssize_t
				1600	pfm_write(struct file file, const char __user ubuf,
				1601	size_t size, loff_t *ppos)
				1602	{
				1603	DPRINT(("pfm_write called\n"));
				1604	return -EINVAL;
				1605	}
				1606
				1607	static unsigned int
				1608	pfm_poll(struct file filp, poll_table wait)
				1609	{
				1610	pfm_context_t *ctx;
				1611	unsigned long flags;
				1612	unsigned int mask = 0;
				1613
				1614	if (PFM_IS_FILE(filp) == 0) {
				1615	printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
				1616	return 0;
				1617	}
				1618
				1619	ctx = (pfm_context_t *)filp->private_data;
				1620	if (ctx == NULL) {
				1621	printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid);
				1622	return 0;
				1623	}
				1624
				1625
				1626	DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd));
				1627
				1628	poll_wait(filp, &ctx->ctx_msgq_wait, wait);
				1629
				1630	PROTECT_CTX(ctx, flags);
				1631
				1632	if (PFM_CTXQ_EMPTY(ctx) == 0)
				1633	mask = POLLIN \| POLLRDNORM;
				1634
				1635	UNPROTECT_CTX(ctx, flags);
				1636
				1637	DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask));
				1638
				1639	return mask;
				1640	}
				1641
				1642	static int
				1643	pfm_ioctl(struct inode inode, struct file file, unsigned int cmd, unsigned long arg)
				1644	{
				1645	DPRINT(("pfm_ioctl called\n"));
				1646	return -EINVAL;
				1647	}
				1648
				1649	/*
				1650	* interrupt cannot be masked when coming here
				1651	*/
				1652	static inline int
				1653	pfm_do_fasync(int fd, struct file filp, pfm_context_t ctx, int on)
				1654	{
				1655	int ret;
				1656
				1657	ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);
				1658
				1659	DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
				1660	current->pid,
				1661	fd,
				1662	on,
				1663	ctx->ctx_async_queue, ret));
				1664
				1665	return ret;
				1666	}
				1667
				1668	static int
				1669	pfm_fasync(int fd, struct file *filp, int on)
				1670	{
				1671	pfm_context_t *ctx;
				1672	int ret;
				1673
				1674	if (PFM_IS_FILE(filp) == 0) {
				1675	printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid);
				1676	return -EBADF;
				1677	}
				1678
				1679	ctx = (pfm_context_t *)filp->private_data;
				1680	if (ctx == NULL) {
				1681	printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
				1682	return -EBADF;
				1683	}
				1684	/*
				1685	* we cannot mask interrupts during this call because this may
				1686	* may go to sleep if memory is not readily avalaible.
				1687	*
				1688	* We are protected from the conetxt disappearing by the get_fd()/put_fd()
				1689	* done in caller. Serialization of this function is ensured by caller.
				1690	*/
				1691	ret = pfm_do_fasync(fd, filp, ctx, on);
				1692
				1693
				1694	DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
				1695	fd,
				1696	on,
				1697	ctx->ctx_async_queue, ret));
				1698
				1699	return ret;
				1700	}
				1701
				1702	#ifdef CONFIG_SMP
				1703	/*
				1704	* this function is exclusively called from pfm_close().
				1705	* The context is not protected at that time, nor are interrupts
				1706	* on the remote CPU. That's necessary to avoid deadlocks.
				1707	*/
				1708	static void
				1709	pfm_syswide_force_stop(void *info)
				1710	{
				1711	pfm_context_t ctx = (pfm_context_t )info;
				1712	struct pt_regs *regs = ia64_task_regs(current);
				1713	struct task_struct *owner;
				1714	unsigned long flags;
				1715	int ret;
				1716
				1717	if (ctx->ctx_cpu != smp_processor_id()) {
				1718	printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n",
				1719	ctx->ctx_cpu,
				1720	smp_processor_id());
				1721	return;
				1722	}
				1723	owner = GET_PMU_OWNER();
				1724	if (owner != ctx->ctx_task) {
				1725	printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
				1726	smp_processor_id(),
				1727	owner->pid, ctx->ctx_task->pid);
				1728	return;
				1729	}
				1730	if (GET_PMU_CTX() != ctx) {
				1731	printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n",
				1732	smp_processor_id(),
				1733	GET_PMU_CTX(), ctx);
				1734	return;
				1735	}
				1736
				1737	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid));
				1738	/*
				1739	* the context is already protected in pfm_close(), we simply
				1740	* need to mask interrupts to avoid a PMU interrupt race on
				1741	* this CPU
				1742	*/
				1743	local_irq_save(flags);
				1744
				1745	ret = pfm_context_unload(ctx, NULL, 0, regs);
				1746	if (ret) {
				1747	DPRINT(("context_unload returned %d\n", ret));
				1748	}
				1749
				1750	/*
				1751	* unmask interrupts, PMU interrupts are now spurious here
				1752	*/
				1753	local_irq_restore(flags);
				1754	}
				1755
				1756	static void
				1757	pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
				1758	{
				1759	int ret;
				1760
				1761	DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));
				1762	ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);
				1763	DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret));
				1764	}
				1765	#endif /* CONFIG_SMP */
				1766
				1767	/*
				1768	* called for each close(). Partially free resources.
				1769	* When caller is self-monitoring, the context is unloaded.
				1770	*/
				1771	static int
				1772	pfm_flush(struct file *filp)
				1773	{
				1774	pfm_context_t *ctx;
				1775	struct task_struct *task;
				1776	struct pt_regs *regs;
				1777	unsigned long flags;
				1778	unsigned long smpl_buf_size = 0UL;
				1779	void *smpl_buf_vaddr = NULL;
				1780	int state, is_system;
				1781
				1782	if (PFM_IS_FILE(filp) == 0) {
				1783	DPRINT(("bad magic for\n"));
				1784	return -EBADF;
				1785	}
				1786
				1787	ctx = (pfm_context_t *)filp->private_data;
				1788	if (ctx == NULL) {
				1789	printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid);
				1790	return -EBADF;
				1791	}
				1792
				1793	/*
				1794	* remove our file from the async queue, if we use this mode.
				1795	* This can be done without the context being protected. We come
				1796	* here when the context has become unreacheable by other tasks.
				1797	*
				1798	* We may still have active monitoring at this point and we may
				1799	* end up in pfm_overflow_handler(). However, fasync_helper()
				1800	* operates with interrupts disabled and it cleans up the
				1801	* queue. If the PMU handler is called prior to entering
				1802	* fasync_helper() then it will send a signal. If it is
				1803	* invoked after, it will find an empty queue and no
				1804	* signal will be sent. In both case, we are safe
				1805	*/
				1806	if (filp->f_flags & FASYNC) {
				1807	DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
				1808	pfm_do_fasync (-1, filp, ctx, 0);
				1809	}
				1810
				1811	PROTECT_CTX(ctx, flags);
				1812
				1813	state = ctx->ctx_state;
				1814	is_system = ctx->ctx_fl_system;
				1815
				1816	task = PFM_CTX_TASK(ctx);
				1817	regs = ia64_task_regs(task);
				1818
				1819	DPRINT(("ctx_state=%d is_current=%d\n",
				1820	state,
				1821	task == current ? 1 : 0));
				1822
				1823	/*
				1824	* if state == UNLOADED, then task is NULL
				1825	*/
				1826
				1827	/*
				1828	* we must stop and unload because we are losing access to the context.
				1829	*/
				1830	if (task == current) {
				1831	#ifdef CONFIG_SMP
				1832	/*
				1833	* the task IS the owner but it migrated to another CPU: that's bad
				1834	* but we must handle this cleanly. Unfortunately, the kernel does
				1835	* not provide a mechanism to block migration (while the context is loaded).
				1836	*
				1837	* We need to release the resource on the ORIGINAL cpu.
				1838	*/
				1839	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				1840
				1841	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				1842	/*
				1843	* keep context protected but unmask interrupt for IPI
				1844	*/
				1845	local_irq_restore(flags);
				1846
				1847	pfm_syswide_cleanup_other_cpu(ctx);
				1848
				1849	/*
				1850	* restore interrupt masking
				1851	*/
				1852	local_irq_save(flags);
				1853
				1854	/*
				1855	* context is unloaded at this point
				1856	*/
				1857	} else
				1858	#endif /* CONFIG_SMP */
				1859	{
				1860
				1861	DPRINT(("forcing unload\n"));
				1862	/*
				1863	* stop and unload, returning with state UNLOADED
				1864	* and session unreserved.
				1865	*/
				1866	pfm_context_unload(ctx, NULL, 0, regs);
				1867
				1868	DPRINT(("ctx_state=%d\n", ctx->ctx_state));
				1869	}
				1870	}
				1871
				1872	/*
				1873	* remove virtual mapping, if any, for the calling task.
				1874	* cannot reset ctx field until last user is calling close().
				1875	*
				1876	* ctx_smpl_vaddr must never be cleared because it is needed
				1877	* by every task with access to the context
				1878	*
				1879	* When called from do_exit(), the mm context is gone already, therefore
				1880	* mm is NULL, i.e., the VMA is already gone and we do not have to
				1881	* do anything here
				1882	*/
				1883	if (ctx->ctx_smpl_vaddr && current->mm) {
				1884	smpl_buf_vaddr = ctx->ctx_smpl_vaddr;
				1885	smpl_buf_size = ctx->ctx_smpl_size;
				1886	}
				1887
				1888	UNPROTECT_CTX(ctx, flags);
				1889
				1890	/*
				1891	* if there was a mapping, then we systematically remove it
				1892	* at this point. Cannot be done inside critical section
				1893	* because some VM function reenables interrupts.
				1894	*
				1895	*/
				1896	if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size);
				1897
				1898	return 0;
				1899	}
				1900	/*
				1901	* called either on explicit close() or from exit_files().
				1902	* Only the LAST user of the file gets to this point, i.e., it is
				1903	* called only ONCE.
				1904	*
				1905	* IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
				1906	* (fput()),i.e, last task to access the file. Nobody else can access the
				1907	* file at this point.
				1908	*
				1909	* When called from exit_files(), the VMA has been freed because exit_mm()
				1910	* is executed before exit_files().
				1911	*
				1912	* When called from exit_files(), the current task is not yet ZOMBIE but we
				1913	* flush the PMU state to the context.
				1914	*/
				1915	static int
				1916	pfm_close(struct inode inode, struct file filp)
				1917	{
				1918	pfm_context_t *ctx;
				1919	struct task_struct *task;
				1920	struct pt_regs *regs;
				1921	DECLARE_WAITQUEUE(wait, current);
				1922	unsigned long flags;
				1923	unsigned long smpl_buf_size = 0UL;
				1924	void *smpl_buf_addr = NULL;
				1925	int free_possible = 1;
				1926	int state, is_system;
				1927
				1928	DPRINT(("pfm_close called private=%p\n", filp->private_data));
				1929
				1930	if (PFM_IS_FILE(filp) == 0) {
				1931	DPRINT(("bad magic\n"));
				1932	return -EBADF;
				1933	}
				1934
				1935	ctx = (pfm_context_t *)filp->private_data;
				1936	if (ctx == NULL) {
				1937	printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid);
				1938	return -EBADF;
				1939	}
				1940
				1941	PROTECT_CTX(ctx, flags);
				1942
				1943	state = ctx->ctx_state;
				1944	is_system = ctx->ctx_fl_system;
				1945
				1946	task = PFM_CTX_TASK(ctx);
				1947	regs = ia64_task_regs(task);
				1948
				1949	DPRINT(("ctx_state=%d is_current=%d\n",
				1950	state,
				1951	task == current ? 1 : 0));
				1952
				1953	/*
				1954	* if task == current, then pfm_flush() unloaded the context
				1955	*/
				1956	if (state == PFM_CTX_UNLOADED) goto doit;
				1957
				1958	/*
				1959	* context is loaded/masked and task != current, we need to
				1960	* either force an unload or go zombie
				1961	*/
				1962
				1963	/*
				1964	* The task is currently blocked or will block after an overflow.
				1965	* we must force it to wakeup to get out of the
				1966	* MASKED state and transition to the unloaded state by itself.
				1967	*
				1968	* This situation is only possible for per-task mode
				1969	*/
				1970	if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) {
				1971
				1972	/*
				1973	* set a "partial" zombie state to be checked
				1974	* upon return from down() in pfm_handle_work().
				1975	*
				1976	* We cannot use the ZOMBIE state, because it is checked
				1977	* by pfm_load_regs() which is called upon wakeup from down().
				1978	* In such case, it would free the context and then we would
				1979	* return to pfm_handle_work() which would access the
				1980	* stale context. Instead, we set a flag invisible to pfm_load_regs()
				1981	* but visible to pfm_handle_work().
				1982	*
				1983	* For some window of time, we have a zombie context with
				1984	* ctx_state = MASKED and not ZOMBIE
				1985	*/
				1986	ctx->ctx_fl_going_zombie = 1;
				1987
				1988	/*
				1989	* force task to wake up from MASKED state
				1990	*/
				1991	up(&ctx->ctx_restart_sem);
				1992
				1993	DPRINT(("waking up ctx_state=%d\n", state));
				1994
				1995	/*
				1996	* put ourself to sleep waiting for the other
				1997	* task to report completion
				1998	*
				1999	* the context is protected by mutex, therefore there
				2000	* is no risk of being notified of completion before
				2001	* begin actually on the waitq.
				2002	*/
				2003	set_current_state(TASK_INTERRUPTIBLE);
				2004	add_wait_queue(&ctx->ctx_zombieq, &wait);
				2005
				2006	UNPROTECT_CTX(ctx, flags);
				2007
				2008	/*
				2009	* XXX: check for signals :
				2010	* - ok for explicit close
				2011	* - not ok when coming from exit_files()
				2012	*/
				2013	schedule();
				2014
				2015
				2016	PROTECT_CTX(ctx, flags);
				2017
				2018
				2019	remove_wait_queue(&ctx->ctx_zombieq, &wait);
				2020	set_current_state(TASK_RUNNING);
				2021
				2022	/*
				2023	* context is unloaded at this point
				2024	*/
				2025	DPRINT(("after zombie wakeup ctx_state=%d for\n", state));
				2026	}
				2027	else if (task != current) {
				2028	#ifdef CONFIG_SMP
				2029	/*
				2030	* switch context to zombie state
				2031	*/
				2032	ctx->ctx_state = PFM_CTX_ZOMBIE;
				2033
				2034	DPRINT(("zombie ctx for [%d]\n", task->pid));
				2035	/*
				2036	* cannot free the context on the spot. deferred until
				2037	* the task notices the ZOMBIE state
				2038	*/
				2039	free_possible = 0;
				2040	#else
				2041	pfm_context_unload(ctx, NULL, 0, regs);
				2042	#endif
				2043	}
				2044
				2045	doit:
				2046	/* reload state, may have changed during opening of critical section */
				2047	state = ctx->ctx_state;
				2048
				2049	/*
				2050	* the context is still attached to a task (possibly current)
				2051	* we cannot destroy it right now
				2052	*/
				2053
				2054	/*
				2055	* we must free the sampling buffer right here because
				2056	* we cannot rely on it being cleaned up later by the
				2057	* monitored task. It is not possible to free vmalloc'ed
				2058	* memory in pfm_load_regs(). Instead, we remove the buffer
				2059	* now. should there be subsequent PMU overflow originally
				2060	* meant for sampling, the will be converted to spurious
				2061	* and that's fine because the monitoring tools is gone anyway.
				2062	*/
				2063	if (ctx->ctx_smpl_hdr) {
				2064	smpl_buf_addr = ctx->ctx_smpl_hdr;
				2065	smpl_buf_size = ctx->ctx_smpl_size;
				2066	/* no more sampling */
				2067	ctx->ctx_smpl_hdr = NULL;
				2068	ctx->ctx_fl_is_sampling = 0;
				2069	}
				2070
				2071	DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n",
				2072	state,
				2073	free_possible,
				2074	smpl_buf_addr,
				2075	smpl_buf_size));
				2076
				2077	if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt);
				2078
				2079	/*
				2080	* UNLOADED that the session has already been unreserved.
				2081	*/
				2082	if (state == PFM_CTX_ZOMBIE) {
				2083	pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu);
				2084	}
				2085
				2086	/*
				2087	* disconnect file descriptor from context must be done
				2088	* before we unlock.
				2089	*/
				2090	filp->private_data = NULL;
				2091
				2092	/*
				2093	* if we free on the spot, the context is now completely unreacheable
				2094	* from the callers side. The monitored task side is also cut, so we
				2095	* can freely cut.
				2096	*
				2097	* If we have a deferred free, only the caller side is disconnected.
				2098	*/
				2099	UNPROTECT_CTX(ctx, flags);
				2100
				2101	/*
				2102	* All memory free operations (especially for vmalloc'ed memory)
				2103	* MUST be done with interrupts ENABLED.
				2104	*/
				2105	if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
				2106
				2107	/*
				2108	* return the memory used by the context
				2109	*/
				2110	if (free_possible) pfm_context_free(ctx);
				2111
				2112	return 0;
				2113	}
				2114
				2115	static int
				2116	pfm_no_open(struct inode irrelevant, struct file dontcare)
				2117	{
				2118	DPRINT(("pfm_no_open called\n"));
				2119	return -ENXIO;
				2120	}
				2121
				2122
				2123
				2124	static struct file_operations pfm_file_ops = {
				2125	.llseek = no_llseek,
				2126	.read = pfm_read,
				2127	.write = pfm_write,
				2128	.poll = pfm_poll,
				2129	.ioctl = pfm_ioctl,
				2130	.open = pfm_no_open, /* special open code to disallow open via /proc */
				2131	.fasync = pfm_fasync,
				2132	.release = pfm_close,
				2133	.flush = pfm_flush
				2134	};
				2135
				2136	static int
				2137	pfmfs_delete_dentry(struct dentry *dentry)
				2138	{
				2139	return 1;
				2140	}
				2141
				2142	static struct dentry_operations pfmfs_dentry_operations = {
				2143	.d_delete = pfmfs_delete_dentry,
				2144	};
				2145
				2146
				2147	static int
				2148	pfm_alloc_fd(struct file **cfile)
				2149	{
				2150	int fd, ret = 0;
				2151	struct file *file = NULL;
				2152	struct inode * inode;
				2153	char name[32];
				2154	struct qstr this;
				2155
				2156	fd = get_unused_fd();
				2157	if (fd < 0) return -ENFILE;
				2158
				2159	ret = -ENFILE;
				2160
				2161	file = get_empty_filp();
				2162	if (!file) goto out;
				2163
				2164	/*
				2165	* allocate a new inode
				2166	*/
				2167	inode = new_inode(pfmfs_mnt->mnt_sb);
				2168	if (!inode) goto out;
				2169
				2170	DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
				2171
				2172	inode->i_mode = S_IFCHR\|S_IRUGO;
				2173	inode->i_uid = current->fsuid;
				2174	inode->i_gid = current->fsgid;
				2175
				2176	sprintf(name, "[%lu]", inode->i_ino);
				2177	this.name = name;
				2178	this.len = strlen(name);
				2179	this.hash = inode->i_ino;
				2180
				2181	ret = -ENOMEM;
				2182
				2183	/*
				2184	* allocate a new dcache entry
				2185	*/
				2186	file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
				2187	if (!file->f_dentry) goto out;
				2188
				2189	file->f_dentry->d_op = &pfmfs_dentry_operations;
				2190
				2191	d_add(file->f_dentry, inode);
				2192	file->f_vfsmnt = mntget(pfmfs_mnt);
				2193	file->f_mapping = inode->i_mapping;
				2194
				2195	file->f_op = &pfm_file_ops;
				2196	file->f_mode = FMODE_READ;
				2197	file->f_flags = O_RDONLY;
				2198	file->f_pos = 0;
				2199
				2200	/*
				2201	* may have to delay until context is attached?
				2202	*/
				2203	fd_install(fd, file);
				2204
				2205	/*
				2206	* the file structure we will use
				2207	*/
				2208	*cfile = file;
				2209
				2210	return fd;
				2211	out:
				2212	if (file) put_filp(file);
				2213	put_unused_fd(fd);
				2214	return ret;
				2215	}
				2216
				2217	static void
				2218	pfm_free_fd(int fd, struct file *file)
				2219	{
				2220	struct files_struct *files = current->files;
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame^]	2221	struct fdtable *fdt = files_fdtable(files);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2222
				2223	/*
				2224	* there ie no fd_uninstall(), so we do it here
				2225	*/
				2226	spin_lock(&files->file_lock);
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame^]	2227	rcu_assign_pointer(fdt->fd[fd], NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2228	spin_unlock(&files->file_lock);
				2229
Dipankar Sarma	badf166	2005-09-09 13:04:10 -0700	[diff] [blame^]	2230	if (file)
				2231	put_filp(file);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2232	put_unused_fd(fd);
				2233	}
				2234
				2235	static int
				2236	pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
				2237	{
				2238	DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
				2239
				2240	while (size > 0) {
				2241	unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;
				2242
				2243
				2244	if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY))
				2245	return -ENOMEM;
				2246
				2247	addr += PAGE_SIZE;
				2248	buf += PAGE_SIZE;
				2249	size -= PAGE_SIZE;
				2250	}
				2251	return 0;
				2252	}
				2253
				2254	/*
				2255	* allocate a sampling buffer and remaps it into the user address space of the task
				2256	*/
				2257	static int
				2258	pfm_smpl_buffer_alloc(struct task_struct task, pfm_context_t ctx, unsigned long rsize, void **user_vaddr)
				2259	{
				2260	struct mm_struct *mm = task->mm;
				2261	struct vm_area_struct *vma = NULL;
				2262	unsigned long size;
				2263	void *smpl_buf;
				2264
				2265
				2266	/*
				2267	* the fixed header + requested size and align to page boundary
				2268	*/
				2269	size = PAGE_ALIGN(rsize);
				2270
				2271	DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size));
				2272
				2273	/*
				2274	* check requested size to avoid Denial-of-service attacks
				2275	* XXX: may have to refine this test
				2276	* Check against address space limit.
				2277	*
				2278	* if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
				2279	* return -ENOMEM;
				2280	*/
				2281	if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
				2282	return -ENOMEM;
				2283
				2284	/*
				2285	* We do the easy to undo allocations first.
				2286	*
				2287	* pfm_rvmalloc(), clears the buffer, so there is no leak
				2288	*/
				2289	smpl_buf = pfm_rvmalloc(size);
				2290	if (smpl_buf == NULL) {
				2291	DPRINT(("Can't allocate sampling buffer\n"));
				2292	return -ENOMEM;
				2293	}
				2294
				2295	DPRINT(("smpl_buf @%p\n", smpl_buf));
				2296
				2297	/* allocate vma */
				2298	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
				2299	if (!vma) {
				2300	DPRINT(("Cannot allocate vma\n"));
				2301	goto error_kmem;
				2302	}
				2303	memset(vma, 0, sizeof(*vma));
				2304
				2305	/*
				2306	* partially initialize the vma for the sampling buffer
				2307	*/
				2308	vma->vm_mm = mm;
				2309	vma->vm_flags = VM_READ\| VM_MAYREAD \|VM_RESERVED;
				2310	vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */
				2311
				2312	/*
				2313	* Now we have everything we need and we can initialize
				2314	* and connect all the data structures
				2315	*/
				2316
				2317	ctx->ctx_smpl_hdr = smpl_buf;
				2318	ctx->ctx_smpl_size = size; /* aligned size */
				2319
				2320	/*
				2321	* Let's do the difficult operations next.
				2322	*
				2323	* now we atomically find some area in the address space and
				2324	* remap the buffer in it.
				2325	*/
				2326	down_write(&task->mm->mmap_sem);
				2327
				2328	/* find some free area in address space, must have mmap sem held */
				2329	vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE\|MAP_ANONYMOUS, 0);
				2330	if (vma->vm_start == 0UL) {
				2331	DPRINT(("Cannot find unmapped area for size %ld\n", size));
				2332	up_write(&task->mm->mmap_sem);
				2333	goto error;
				2334	}
				2335	vma->vm_end = vma->vm_start + size;
				2336	vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
				2337
				2338	DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));
				2339
				2340	/* can only be applied to current task, need to have the mm semaphore held when called */
				2341	if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
				2342	DPRINT(("Can't remap buffer\n"));
				2343	up_write(&task->mm->mmap_sem);
				2344	goto error;
				2345	}
				2346
				2347	/*
				2348	* now insert the vma in the vm list for the process, must be
				2349	* done with mmap lock held
				2350	*/
				2351	insert_vm_struct(mm, vma);
				2352
				2353	mm->total_vm += size >> PAGE_SHIFT;
				2354	vm_stat_account(vma);
				2355	up_write(&task->mm->mmap_sem);
				2356
				2357	/*
				2358	* keep track of user level virtual address
				2359	*/
				2360	ctx->ctx_smpl_vaddr = (void *)vma->vm_start;
				2361	(unsigned long )user_vaddr = vma->vm_start;
				2362
				2363	return 0;
				2364
				2365	error:
				2366	kmem_cache_free(vm_area_cachep, vma);
				2367	error_kmem:
				2368	pfm_rvfree(smpl_buf, size);
				2369
				2370	return -ENOMEM;
				2371	}
				2372
				2373	/*
				2374	* XXX: do something better here
				2375	*/
				2376	static int
				2377	pfm_bad_permissions(struct task_struct *task)
				2378	{
				2379	/* inspired by ptrace_attach() */
				2380	DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
				2381	current->uid,
				2382	current->gid,
				2383	task->euid,
				2384	task->suid,
				2385	task->uid,
				2386	task->egid,
				2387	task->sgid));
				2388
				2389	return ((current->uid != task->euid)
				2390	\|\| (current->uid != task->suid)
				2391	\|\| (current->uid != task->uid)
				2392	\|\| (current->gid != task->egid)
				2393	\|\| (current->gid != task->sgid)
				2394	\|\| (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
				2395	}
				2396
				2397	static int
				2398	pfarg_is_sane(struct task_struct task, pfarg_context_t pfx)
				2399	{
				2400	int ctx_flags;
				2401
				2402	/* valid signal */
				2403
				2404	ctx_flags = pfx->ctx_flags;
				2405
				2406	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
				2407
				2408	/*
				2409	* cannot block in this mode
				2410	*/
				2411	if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
				2412	DPRINT(("cannot use blocking mode when in system wide monitoring\n"));
				2413	return -EINVAL;
				2414	}
				2415	} else {
				2416	}
				2417	/* probably more to add here */
				2418
				2419	return 0;
				2420	}
				2421
				2422	static int
				2423	pfm_setup_buffer_fmt(struct task_struct task, pfm_context_t ctx, unsigned int ctx_flags,
				2424	unsigned int cpu, pfarg_context_t *arg)
				2425	{
				2426	pfm_buffer_fmt_t *fmt = NULL;
				2427	unsigned long size = 0UL;
				2428	void *uaddr = NULL;
				2429	void *fmt_arg = NULL;
				2430	int ret = 0;
				2431	#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1)
				2432
				2433	/* invoke and lock buffer format, if found */
				2434	fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id);
				2435	if (fmt == NULL) {
				2436	DPRINT(("[%d] cannot find buffer format\n", task->pid));
				2437	return -EINVAL;
				2438	}
				2439
				2440	/*
				2441	* buffer argument MUST be contiguous to pfarg_context_t
				2442	*/
				2443	if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg);
				2444
				2445	ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg);
				2446
				2447	DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret));
				2448
				2449	if (ret) goto error;
				2450
				2451	/* link buffer format and context */
				2452	ctx->ctx_buf_fmt = fmt;
				2453
				2454	/*
				2455	* check if buffer format wants to use perfmon buffer allocation/mapping service
				2456	*/
				2457	ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size);
				2458	if (ret) goto error;
				2459
				2460	if (size) {
				2461	/*
				2462	* buffer is always remapped into the caller's address space
				2463	*/
				2464	ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr);
				2465	if (ret) goto error;
				2466
				2467	/* keep track of user address of buffer */
				2468	arg->ctx_smpl_vaddr = uaddr;
				2469	}
				2470	ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg);
				2471
				2472	error:
				2473	return ret;
				2474	}
				2475
				2476	static void
				2477	pfm_reset_pmu_state(pfm_context_t *ctx)
				2478	{
				2479	int i;
				2480
				2481	/*
				2482	* install reset values for PMC.
				2483	*/
				2484	for (i=1; PMC_IS_LAST(i) == 0; i++) {
				2485	if (PMC_IS_IMPL(i) == 0) continue;
				2486	ctx->ctx_pmcs[i] = PMC_DFL_VAL(i);
				2487	DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i]));
				2488	}
				2489	/*
				2490	* PMD registers are set to 0UL when the context in memset()
				2491	*/
				2492
				2493	/*
				2494	* On context switched restore, we must restore ALL pmc and ALL pmd even
				2495	* when they are not actively used by the task. In UP, the incoming process
				2496	* may otherwise pick up left over PMC, PMD state from the previous process.
				2497	* As opposed to PMD, stale PMC can cause harm to the incoming
				2498	* process because they may change what is being measured.
				2499	* Therefore, we must systematically reinstall the entire
				2500	* PMC state. In SMP, the same thing is possible on the
				2501	* same CPU but also on between 2 CPUs.
				2502	*
				2503	* The problem with PMD is information leaking especially
				2504	* to user level when psr.sp=0
				2505	*
				2506	* There is unfortunately no easy way to avoid this problem
				2507	* on either UP or SMP. This definitively slows down the
				2508	* pfm_load_regs() function.
				2509	*/
				2510
				2511	/*
				2512	* bitmask of all PMCs accessible to this context
				2513	*
				2514	* PMC0 is treated differently.
				2515	*/
				2516	ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;
				2517
				2518	/*
				2519	* bitmask of all PMDs that are accesible to this context
				2520	*/
				2521	ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];
				2522
				2523	DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0]));
				2524
				2525	/*
				2526	* useful in case of re-enable after disable
				2527	*/
				2528	ctx->ctx_used_ibrs[0] = 0UL;
				2529	ctx->ctx_used_dbrs[0] = 0UL;
				2530	}
				2531
				2532	static int
				2533	pfm_ctx_getsize(void arg, size_t sz)
				2534	{
				2535	pfarg_context_t req = (pfarg_context_t )arg;
				2536	pfm_buffer_fmt_t *fmt;
				2537
				2538	*sz = 0;
				2539
				2540	if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0;
				2541
				2542	fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id);
				2543	if (fmt == NULL) {
				2544	DPRINT(("cannot find buffer format\n"));
				2545	return -EINVAL;
				2546	}
				2547	/* get just enough to copy in user parameters */
				2548	*sz = fmt->fmt_arg_size;
				2549	DPRINT(("arg_size=%lu\n", *sz));
				2550
				2551	return 0;
				2552	}
				2553
				2554
				2555
				2556	/*
				2557	* cannot attach if :
				2558	* - kernel task
				2559	* - task not owned by caller
				2560	* - task incompatible with context mode
				2561	*/
				2562	static int
				2563	pfm_task_incompatible(pfm_context_t ctx, struct task_struct task)
				2564	{
				2565	/*
				2566	* no kernel task or task not owner by caller
				2567	*/
				2568	if (task->mm == NULL) {
				2569	DPRINT(("task [%d] has not memory context (kernel thread)\n", task->pid));
				2570	return -EPERM;
				2571	}
				2572	if (pfm_bad_permissions(task)) {
				2573	DPRINT(("no permission to attach to [%d]\n", task->pid));
				2574	return -EPERM;
				2575	}
				2576	/*
				2577	* cannot block in self-monitoring mode
				2578	*/
				2579	if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) {
				2580	DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid));
				2581	return -EINVAL;
				2582	}
				2583
				2584	if (task->exit_state == EXIT_ZOMBIE) {
				2585	DPRINT(("cannot attach to zombie task [%d]\n", task->pid));
				2586	return -EBUSY;
				2587	}
				2588
				2589	/*
				2590	* always ok for self
				2591	*/
				2592	if (task == current) return 0;
				2593
				2594	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
				2595	DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
				2596	return -EBUSY;
				2597	}
				2598	/*
				2599	* make sure the task is off any CPU
				2600	*/
				2601	wait_task_inactive(task);
				2602
				2603	/* more to come... */
				2604
				2605	return 0;
				2606	}
				2607
				2608	static int
				2609	pfm_get_task(pfm_context_t ctx, pid_t pid, struct task_struct *task)
				2610	{
				2611	struct task_struct *p = current;
				2612	int ret;
				2613
				2614	/* XXX: need to add more checks here */
				2615	if (pid < 2) return -EPERM;
				2616
				2617	if (pid != current->pid) {
				2618
				2619	read_lock(&tasklist_lock);
				2620
				2621	p = find_task_by_pid(pid);
				2622
				2623	/* make sure task cannot go away while we operate on it */
				2624	if (p) get_task_struct(p);
				2625
				2626	read_unlock(&tasklist_lock);
				2627
				2628	if (p == NULL) return -ESRCH;
				2629	}
				2630
				2631	ret = pfm_task_incompatible(ctx, p);
				2632	if (ret == 0) {
				2633	*task = p;
				2634	} else if (p != current) {
				2635	pfm_put_task(p);
				2636	}
				2637	return ret;
				2638	}
				2639
				2640
				2641
				2642	static int
				2643	pfm_context_create(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				2644	{
				2645	pfarg_context_t req = (pfarg_context_t )arg;
				2646	struct file *filp;
				2647	int ctx_flags;
				2648	int ret;
				2649
				2650	/* let's check the arguments first */
				2651	ret = pfarg_is_sane(current, req);
				2652	if (ret < 0) return ret;
				2653
				2654	ctx_flags = req->ctx_flags;
				2655
				2656	ret = -ENOMEM;
				2657
				2658	ctx = pfm_context_alloc();
				2659	if (!ctx) goto error;
				2660
				2661	ret = pfm_alloc_fd(&filp);
				2662	if (ret < 0) goto error_file;
				2663
				2664	req->ctx_fd = ctx->ctx_fd = ret;
				2665
				2666	/*
				2667	* attach context to file
				2668	*/
				2669	filp->private_data = ctx;
				2670
				2671	/*
				2672	* does the user want to sample?
				2673	*/
				2674	if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) {
				2675	ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req);
				2676	if (ret) goto buffer_error;
				2677	}
				2678
				2679	/*
				2680	* init context protection lock
				2681	*/
				2682	spin_lock_init(&ctx->ctx_lock);
				2683
				2684	/*
				2685	* context is unloaded
				2686	*/
				2687	ctx->ctx_state = PFM_CTX_UNLOADED;
				2688
				2689	/*
				2690	* initialization of context's flags
				2691	*/
				2692	ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
				2693	ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
				2694	ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */
				2695	ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
				2696	/*
				2697	* will move to set properties
				2698	* ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
				2699	*/
				2700
				2701	/*
				2702	* init restart semaphore to locked
				2703	*/
				2704	sema_init(&ctx->ctx_restart_sem, 0);
				2705
				2706	/*
				2707	* activation is used in SMP only
				2708	*/
				2709	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
				2710	SET_LAST_CPU(ctx, -1);
				2711
				2712	/*
				2713	* initialize notification message queue
				2714	*/
				2715	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
				2716	init_waitqueue_head(&ctx->ctx_msgq_wait);
				2717	init_waitqueue_head(&ctx->ctx_zombieq);
				2718
				2719	DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n",
				2720	ctx,
				2721	ctx_flags,
				2722	ctx->ctx_fl_system,
				2723	ctx->ctx_fl_block,
				2724	ctx->ctx_fl_excl_idle,
				2725	ctx->ctx_fl_no_msg,
				2726	ctx->ctx_fd));
				2727
				2728	/*
				2729	* initialize soft PMU state
				2730	*/
				2731	pfm_reset_pmu_state(ctx);
				2732
				2733	return 0;
				2734
				2735	buffer_error:
				2736	pfm_free_fd(ctx->ctx_fd, filp);
				2737
				2738	if (ctx->ctx_buf_fmt) {
				2739	pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs);
				2740	}
				2741	error_file:
				2742	pfm_context_free(ctx);
				2743
				2744	error:
				2745	return ret;
				2746	}
				2747
				2748	static inline unsigned long
				2749	pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
				2750	{
				2751	unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
				2752	unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
				2753	extern unsigned long carta_random32 (unsigned long seed);
				2754
				2755	if (reg->flags & PFM_REGFL_RANDOM) {
				2756	new_seed = carta_random32(old_seed);
				2757	val -= (old_seed & mask); /* counter values are negative numbers! */
				2758	if ((mask >> 32) != 0)
				2759	/* construct a full 64-bit random value: */
				2760	new_seed \|= carta_random32(old_seed >> 32) << 32;
				2761	reg->seed = new_seed;
				2762	}
				2763	reg->lval = val;
				2764	return val;
				2765	}
				2766
				2767	static void
				2768	pfm_reset_regs_masked(pfm_context_t ctx, unsigned long ovfl_regs, int is_long_reset)
				2769	{
				2770	unsigned long mask = ovfl_regs[0];
				2771	unsigned long reset_others = 0UL;
				2772	unsigned long val;
				2773	int i;
				2774
				2775	/*
				2776	* now restore reset value on sampling overflowed counters
				2777	*/
				2778	mask >>= PMU_FIRST_COUNTER;
				2779	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
				2780
				2781	if ((mask & 0x1UL) == 0UL) continue;
				2782
				2783	ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
				2784	reset_others \|= ctx->ctx_pmds[i].reset_pmds[0];
				2785
				2786	DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
				2787	}
				2788
				2789	/*
				2790	* Now take care of resetting the other registers
				2791	*/
				2792	for(i = 0; reset_others; i++, reset_others >>= 1) {
				2793
				2794	if ((reset_others & 0x1) == 0) continue;
				2795
				2796	ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
				2797
				2798	DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
				2799	is_long_reset ? "long" : "short", i, val));
				2800	}
				2801	}
				2802
				2803	static void
				2804	pfm_reset_regs(pfm_context_t ctx, unsigned long ovfl_regs, int is_long_reset)
				2805	{
				2806	unsigned long mask = ovfl_regs[0];
				2807	unsigned long reset_others = 0UL;
				2808	unsigned long val;
				2809	int i;
				2810
				2811	DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset));
				2812
				2813	if (ctx->ctx_state == PFM_CTX_MASKED) {
				2814	pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset);
				2815	return;
				2816	}
				2817
				2818	/*
				2819	* now restore reset value on sampling overflowed counters
				2820	*/
				2821	mask >>= PMU_FIRST_COUNTER;
				2822	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
				2823
				2824	if ((mask & 0x1UL) == 0UL) continue;
				2825
				2826	val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
				2827	reset_others \|= ctx->ctx_pmds[i].reset_pmds[0];
				2828
				2829	DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
				2830
				2831	pfm_write_soft_counter(ctx, i, val);
				2832	}
				2833
				2834	/*
				2835	* Now take care of resetting the other registers
				2836	*/
				2837	for(i = 0; reset_others; i++, reset_others >>= 1) {
				2838
				2839	if ((reset_others & 0x1) == 0) continue;
				2840
				2841	val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
				2842
				2843	if (PMD_IS_COUNTING(i)) {
				2844	pfm_write_soft_counter(ctx, i, val);
				2845	} else {
				2846	ia64_set_pmd(i, val);
				2847	}
				2848	DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
				2849	is_long_reset ? "long" : "short", i, val));
				2850	}
				2851	ia64_srlz_d();
				2852	}
				2853
				2854	static int
				2855	pfm_write_pmcs(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				2856	{
				2857	struct thread_struct *thread = NULL;
				2858	struct task_struct *task;
				2859	pfarg_reg_t req = (pfarg_reg_t )arg;
				2860	unsigned long value, pmc_pm;
				2861	unsigned long smpl_pmds, reset_pmds, impl_pmds;
				2862	unsigned int cnum, reg_flags, flags, pmc_type;
				2863	int i, can_access_pmu = 0, is_loaded, is_system, expert_mode;
				2864	int is_monitor, is_counting, state;
				2865	int ret = -EINVAL;
				2866	pfm_reg_check_t wr_func;
				2867	#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
				2868
				2869	state = ctx->ctx_state;
				2870	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				2871	is_system = ctx->ctx_fl_system;
				2872	task = ctx->ctx_task;
				2873	impl_pmds = pmu_conf->impl_pmds[0];
				2874
				2875	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
				2876
				2877	if (is_loaded) {
				2878	thread = &task->thread;
				2879	/*
				2880	* In system wide and when the context is loaded, access can only happen
				2881	* when the caller is running on the CPU being monitored by the session.
				2882	* It does not have to be the owner (ctx_task) of the context per se.
				2883	*/
				2884	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				2885	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				2886	return -EBUSY;
				2887	}
				2888	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				2889	}
				2890	expert_mode = pfm_sysctl.expert_mode;
				2891
				2892	for (i = 0; i < count; i++, req++) {
				2893
				2894	cnum = req->reg_num;
				2895	reg_flags = req->reg_flags;
				2896	value = req->reg_value;
				2897	smpl_pmds = req->reg_smpl_pmds[0];
				2898	reset_pmds = req->reg_reset_pmds[0];
				2899	flags = 0;
				2900
				2901
				2902	if (cnum >= PMU_MAX_PMCS) {
				2903	DPRINT(("pmc%u is invalid\n", cnum));
				2904	goto error;
				2905	}
				2906
				2907	pmc_type = pmu_conf->pmc_desc[cnum].type;
				2908	pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1;
				2909	is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0;
				2910	is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0;
				2911
				2912	/*
				2913	* we reject all non implemented PMC as well
				2914	* as attempts to modify PMC[0-3] which are used
				2915	* as status registers by the PMU
				2916	*/
				2917	if ((pmc_type & PFM_REG_IMPL) == 0 \|\| (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) {
				2918	DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type));
				2919	goto error;
				2920	}
				2921	wr_func = pmu_conf->pmc_desc[cnum].write_check;
				2922	/*
				2923	* If the PMC is a monitor, then if the value is not the default:
				2924	* - system-wide session: PMCx.pm=1 (privileged monitor)
				2925	* - per-task : PMCx.pm=0 (user monitor)
				2926	*/
				2927	if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) {
				2928	DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n",
				2929	cnum,
				2930	pmc_pm,
				2931	is_system));
				2932	goto error;
				2933	}
				2934
				2935	if (is_counting) {
				2936	/*
				2937	* enforce generation of overflow interrupt. Necessary on all
				2938	* CPUs.
				2939	*/
				2940	value \|= 1 << PMU_PMC_OI;
				2941
				2942	if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
				2943	flags \|= PFM_REGFL_OVFL_NOTIFY;
				2944	}
				2945
				2946	if (reg_flags & PFM_REGFL_RANDOM) flags \|= PFM_REGFL_RANDOM;
				2947
				2948	/* verify validity of smpl_pmds */
				2949	if ((smpl_pmds & impl_pmds) != smpl_pmds) {
				2950	DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum));
				2951	goto error;
				2952	}
				2953
				2954	/* verify validity of reset_pmds */
				2955	if ((reset_pmds & impl_pmds) != reset_pmds) {
				2956	DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
				2957	goto error;
				2958	}
				2959	} else {
				2960	if (reg_flags & (PFM_REGFL_OVFL_NOTIFY\|PFM_REGFL_RANDOM)) {
				2961	DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum));
				2962	goto error;
				2963	}
				2964	/* eventid on non-counting monitors are ignored */
				2965	}
				2966
				2967	/*
				2968	* execute write checker, if any
				2969	*/
				2970	if (likely(expert_mode == 0 && wr_func)) {
				2971	ret = (*wr_func)(task, ctx, cnum, &value, regs);
				2972	if (ret) goto error;
				2973	ret = -EINVAL;
				2974	}
				2975
				2976	/*
				2977	* no error on this register
				2978	*/
				2979	PFM_REG_RETFLAG_SET(req->reg_flags, 0);
				2980
				2981	/*
				2982	* Now we commit the changes to the software state
				2983	*/
				2984
				2985	/*
				2986	* update overflow information
				2987	*/
				2988	if (is_counting) {
				2989	/*
				2990	* full flag update each time a register is programmed
				2991	*/
				2992	ctx->ctx_pmds[cnum].flags = flags;
				2993
				2994	ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds;
				2995	ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds;
				2996	ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid;
				2997
				2998	/*
				2999	* Mark all PMDS to be accessed as used.
				3000	*
				3001	* We do not keep track of PMC because we have to
				3002	* systematically restore ALL of them.
				3003	*
				3004	* We do not update the used_monitors mask, because
				3005	* if we have not programmed them, then will be in
				3006	* a quiescent state, therefore we will not need to
				3007	* mask/restore then when context is MASKED.
				3008	*/
				3009	CTX_USED_PMD(ctx, reset_pmds);
				3010	CTX_USED_PMD(ctx, smpl_pmds);
				3011	/*
				3012	* make sure we do not try to reset on
				3013	* restart because we have established new values
				3014	*/
				3015	if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
				3016	}
				3017	/*
				3018	* Needed in case the user does not initialize the equivalent
				3019	* PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no
				3020	* possible leak here.
				3021	*/
				3022	CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]);
				3023
				3024	/*
				3025	* keep track of the monitor PMC that we are using.
				3026	* we save the value of the pmc in ctx_pmcs[] and if
				3027	* the monitoring is not stopped for the context we also
				3028	* place it in the saved state area so that it will be
				3029	* picked up later by the context switch code.
				3030	*
				3031	* The value in ctx_pmcs[] can only be changed in pfm_write_pmcs().
				3032	*
				3033	* The value in thread->pmcs[] may be modified on overflow, i.e., when
				3034	* monitoring needs to be stopped.
				3035	*/
				3036	if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum);
				3037
				3038	/*
				3039	* update context state
				3040	*/
				3041	ctx->ctx_pmcs[cnum] = value;
				3042
				3043	if (is_loaded) {
				3044	/*
				3045	* write thread state
				3046	*/
				3047	if (is_system == 0) thread->pmcs[cnum] = value;
				3048
				3049	/*
				3050	* write hardware register if we can
				3051	*/
				3052	if (can_access_pmu) {
				3053	ia64_set_pmc(cnum, value);
				3054	}
				3055	#ifdef CONFIG_SMP
				3056	else {
				3057	/*
				3058	* per-task SMP only here
				3059	*
				3060	* we are guaranteed that the task is not running on the other CPU,
				3061	* we indicate that this PMD will need to be reloaded if the task
				3062	* is rescheduled on the CPU it ran last on.
				3063	*/
				3064	ctx->ctx_reload_pmcs[0] \|= 1UL << cnum;
				3065	}
				3066	#endif
				3067	}
				3068
				3069	DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
				3070	cnum,
				3071	value,
				3072	is_loaded,
				3073	can_access_pmu,
				3074	flags,
				3075	ctx->ctx_all_pmcs[0],
				3076	ctx->ctx_used_pmds[0],
				3077	ctx->ctx_pmds[cnum].eventid,
				3078	smpl_pmds,
				3079	reset_pmds,
				3080	ctx->ctx_reload_pmcs[0],
				3081	ctx->ctx_used_monitors[0],
				3082	ctx->ctx_ovfl_regs[0]));
				3083	}
				3084
				3085	/*
				3086	* make sure the changes are visible
				3087	*/
				3088	if (can_access_pmu) ia64_srlz_d();
				3089
				3090	return 0;
				3091	error:
				3092	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				3093	return ret;
				3094	}
				3095
				3096	static int
				3097	pfm_write_pmds(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3098	{
				3099	struct thread_struct *thread = NULL;
				3100	struct task_struct *task;
				3101	pfarg_reg_t req = (pfarg_reg_t )arg;
				3102	unsigned long value, hw_value, ovfl_mask;
				3103	unsigned int cnum;
				3104	int i, can_access_pmu = 0, state;
				3105	int is_counting, is_loaded, is_system, expert_mode;
				3106	int ret = -EINVAL;
				3107	pfm_reg_check_t wr_func;
				3108
				3109
				3110	state = ctx->ctx_state;
				3111	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				3112	is_system = ctx->ctx_fl_system;
				3113	ovfl_mask = pmu_conf->ovfl_val;
				3114	task = ctx->ctx_task;
				3115
				3116	if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL;
				3117
				3118	/*
				3119	* on both UP and SMP, we can only write to the PMC when the task is
				3120	* the owner of the local PMU.
				3121	*/
				3122	if (likely(is_loaded)) {
				3123	thread = &task->thread;
				3124	/*
				3125	* In system wide and when the context is loaded, access can only happen
				3126	* when the caller is running on the CPU being monitored by the session.
				3127	* It does not have to be the owner (ctx_task) of the context per se.
				3128	*/
				3129	if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
				3130	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3131	return -EBUSY;
				3132	}
				3133	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				3134	}
				3135	expert_mode = pfm_sysctl.expert_mode;
				3136
				3137	for (i = 0; i < count; i++, req++) {
				3138
				3139	cnum = req->reg_num;
				3140	value = req->reg_value;
				3141
				3142	if (!PMD_IS_IMPL(cnum)) {
				3143	DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum));
				3144	goto abort_mission;
				3145	}
				3146	is_counting = PMD_IS_COUNTING(cnum);
				3147	wr_func = pmu_conf->pmd_desc[cnum].write_check;
				3148
				3149	/*
				3150	* execute write checker, if any
				3151	*/
				3152	if (unlikely(expert_mode == 0 && wr_func)) {
				3153	unsigned long v = value;
				3154
				3155	ret = (*wr_func)(task, ctx, cnum, &v, regs);
				3156	if (ret) goto abort_mission;
				3157
				3158	value = v;
				3159	ret = -EINVAL;
				3160	}
				3161
				3162	/*
				3163	* no error on this register
				3164	*/
				3165	PFM_REG_RETFLAG_SET(req->reg_flags, 0);
				3166
				3167	/*
				3168	* now commit changes to software state
				3169	*/
				3170	hw_value = value;
				3171
				3172	/*
				3173	* update virtualized (64bits) counter
				3174	*/
				3175	if (is_counting) {
				3176	/*
				3177	* write context state
				3178	*/
				3179	ctx->ctx_pmds[cnum].lval = value;
				3180
				3181	/*
				3182	* when context is load we use the split value
				3183	*/
				3184	if (is_loaded) {
				3185	hw_value = value & ovfl_mask;
				3186	value = value & ~ovfl_mask;
				3187	}
				3188	}
				3189	/*
				3190	* update reset values (not just for counters)
				3191	*/
				3192	ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset;
				3193	ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset;
				3194
				3195	/*
				3196	* update randomization parameters (not just for counters)
				3197	*/
				3198	ctx->ctx_pmds[cnum].seed = req->reg_random_seed;
				3199	ctx->ctx_pmds[cnum].mask = req->reg_random_mask;
				3200
				3201	/*
				3202	* update context value
				3203	*/
				3204	ctx->ctx_pmds[cnum].val = value;
				3205
				3206	/*
				3207	* Keep track of what we use
				3208	*
				3209	* We do not keep track of PMC because we have to
				3210	* systematically restore ALL of them.
				3211	*/
				3212	CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum));
				3213
				3214	/*
				3215	* mark this PMD register used as well
				3216	*/
				3217	CTX_USED_PMD(ctx, RDEP(cnum));
				3218
				3219	/*
				3220	* make sure we do not try to reset on
				3221	* restart because we have established new values
				3222	*/
				3223	if (is_counting && state == PFM_CTX_MASKED) {
				3224	ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
				3225	}
				3226
				3227	if (is_loaded) {
				3228	/*
				3229	* write thread state
				3230	*/
				3231	if (is_system == 0) thread->pmds[cnum] = hw_value;
				3232
				3233	/*
				3234	* write hardware register if we can
				3235	*/
				3236	if (can_access_pmu) {
				3237	ia64_set_pmd(cnum, hw_value);
				3238	} else {
				3239	#ifdef CONFIG_SMP
				3240	/*
				3241	* we are guaranteed that the task is not running on the other CPU,
				3242	* we indicate that this PMD will need to be reloaded if the task
				3243	* is rescheduled on the CPU it ran last on.
				3244	*/
				3245	ctx->ctx_reload_pmds[0] \|= 1UL << cnum;
				3246	#endif
				3247	}
				3248	}
				3249
				3250	DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx "
				3251	"long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
				3252	cnum,
				3253	value,
				3254	is_loaded,
				3255	can_access_pmu,
				3256	hw_value,
				3257	ctx->ctx_pmds[cnum].val,
				3258	ctx->ctx_pmds[cnum].short_reset,
				3259	ctx->ctx_pmds[cnum].long_reset,
				3260	PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
				3261	ctx->ctx_pmds[cnum].seed,
				3262	ctx->ctx_pmds[cnum].mask,
				3263	ctx->ctx_used_pmds[0],
				3264	ctx->ctx_pmds[cnum].reset_pmds[0],
				3265	ctx->ctx_reload_pmds[0],
				3266	ctx->ctx_all_pmds[0],
				3267	ctx->ctx_ovfl_regs[0]));
				3268	}
				3269
				3270	/*
				3271	* make changes visible
				3272	*/
				3273	if (can_access_pmu) ia64_srlz_d();
				3274
				3275	return 0;
				3276
				3277	abort_mission:
				3278	/*
				3279	* for now, we have only one possibility for error
				3280	*/
				3281	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				3282	return ret;
				3283	}
				3284
				3285	/*
				3286	* By the way of PROTECT_CONTEXT(), interrupts are masked while we are in this function.
				3287	* Therefore we know, we do not have to worry about the PMU overflow interrupt. If an
				3288	* interrupt is delivered during the call, it will be kept pending until we leave, making
				3289	* it appears as if it had been generated at the UNPROTECT_CONTEXT(). At least we are
				3290	* guaranteed to return consistent data to the user, it may simply be old. It is not
				3291	* trivial to treat the overflow while inside the call because you may end up in
				3292	* some module sampling buffer code causing deadlocks.
				3293	*/
				3294	static int
				3295	pfm_read_pmds(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3296	{
				3297	struct thread_struct *thread = NULL;
				3298	struct task_struct *task;
				3299	unsigned long val = 0UL, lval, ovfl_mask, sval;
				3300	pfarg_reg_t req = (pfarg_reg_t )arg;
				3301	unsigned int cnum, reg_flags = 0;
				3302	int i, can_access_pmu = 0, state;
				3303	int is_loaded, is_system, is_counting, expert_mode;
				3304	int ret = -EINVAL;
				3305	pfm_reg_check_t rd_func;
				3306
				3307	/*
				3308	* access is possible when loaded only for
				3309	* self-monitoring tasks or in UP mode
				3310	*/
				3311
				3312	state = ctx->ctx_state;
				3313	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				3314	is_system = ctx->ctx_fl_system;
				3315	ovfl_mask = pmu_conf->ovfl_val;
				3316	task = ctx->ctx_task;
				3317
				3318	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
				3319
				3320	if (likely(is_loaded)) {
				3321	thread = &task->thread;
				3322	/*
				3323	* In system wide and when the context is loaded, access can only happen
				3324	* when the caller is running on the CPU being monitored by the session.
				3325	* It does not have to be the owner (ctx_task) of the context per se.
				3326	*/
				3327	if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
				3328	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3329	return -EBUSY;
				3330	}
				3331	/*
				3332	* this can be true when not self-monitoring only in UP
				3333	*/
				3334	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				3335
				3336	if (can_access_pmu) ia64_srlz_d();
				3337	}
				3338	expert_mode = pfm_sysctl.expert_mode;
				3339
				3340	DPRINT(("ld=%d apmu=%d ctx_state=%d\n",
				3341	is_loaded,
				3342	can_access_pmu,
				3343	state));
				3344
				3345	/*
				3346	* on both UP and SMP, we can only read the PMD from the hardware register when
				3347	* the task is the owner of the local PMU.
				3348	*/
				3349
				3350	for (i = 0; i < count; i++, req++) {
				3351
				3352	cnum = req->reg_num;
				3353	reg_flags = req->reg_flags;
				3354
				3355	if (unlikely(!PMD_IS_IMPL(cnum))) goto error;
				3356	/*
				3357	* we can only read the register that we use. That includes
				3358	* the one we explicitely initialize AND the one we want included
				3359	* in the sampling buffer (smpl_regs).
				3360	*
				3361	* Having this restriction allows optimization in the ctxsw routine
				3362	* without compromising security (leaks)
				3363	*/
				3364	if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error;
				3365
				3366	sval = ctx->ctx_pmds[cnum].val;
				3367	lval = ctx->ctx_pmds[cnum].lval;
				3368	is_counting = PMD_IS_COUNTING(cnum);
				3369
				3370	/*
				3371	* If the task is not the current one, then we check if the
				3372	* PMU state is still in the local live register due to lazy ctxsw.
				3373	* If true, then we read directly from the registers.
				3374	*/
				3375	if (can_access_pmu){
				3376	val = ia64_get_pmd(cnum);
				3377	} else {
				3378	/*
				3379	* context has been saved
				3380	* if context is zombie, then task does not exist anymore.
				3381	* In this case, we use the full value saved in the context (pfm_flush_regs()).
				3382	*/
				3383	val = is_loaded ? thread->pmds[cnum] : 0UL;
				3384	}
				3385	rd_func = pmu_conf->pmd_desc[cnum].read_check;
				3386
				3387	if (is_counting) {
				3388	/*
				3389	* XXX: need to check for overflow when loaded
				3390	*/
				3391	val &= ovfl_mask;
				3392	val += sval;
				3393	}
				3394
				3395	/*
				3396	* execute read checker, if any
				3397	*/
				3398	if (unlikely(expert_mode == 0 && rd_func)) {
				3399	unsigned long v = val;
				3400	ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs);
				3401	if (ret) goto error;
				3402	val = v;
				3403	ret = -EINVAL;
				3404	}
				3405
				3406	PFM_REG_RETFLAG_SET(reg_flags, 0);
				3407
				3408	DPRINT(("pmd[%u]=0x%lx\n", cnum, val));
				3409
				3410	/*
				3411	* update register return value, abort all if problem during copy.
				3412	* we only modify the reg_flags field. no check mode is fine because
				3413	* access has been verified upfront in sys_perfmonctl().
				3414	*/
				3415	req->reg_value = val;
				3416	req->reg_flags = reg_flags;
				3417	req->reg_last_reset_val = lval;
				3418	}
				3419
				3420	return 0;
				3421
				3422	error:
				3423	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				3424	return ret;
				3425	}
				3426
				3427	int
				3428	pfm_mod_write_pmcs(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3429	{
				3430	pfm_context_t *ctx;
				3431
				3432	if (req == NULL) return -EINVAL;
				3433
				3434	ctx = GET_PMU_CTX();
				3435
				3436	if (ctx == NULL) return -EINVAL;
				3437
				3438	/*
				3439	* for now limit to current task, which is enough when calling
				3440	* from overflow handler
				3441	*/
				3442	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3443
				3444	return pfm_write_pmcs(ctx, req, nreq, regs);
				3445	}
				3446	EXPORT_SYMBOL(pfm_mod_write_pmcs);
				3447
				3448	int
				3449	pfm_mod_read_pmds(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3450	{
				3451	pfm_context_t *ctx;
				3452
				3453	if (req == NULL) return -EINVAL;
				3454
				3455	ctx = GET_PMU_CTX();
				3456
				3457	if (ctx == NULL) return -EINVAL;
				3458
				3459	/*
				3460	* for now limit to current task, which is enough when calling
				3461	* from overflow handler
				3462	*/
				3463	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3464
				3465	return pfm_read_pmds(ctx, req, nreq, regs);
				3466	}
				3467	EXPORT_SYMBOL(pfm_mod_read_pmds);
				3468
				3469	/*
				3470	* Only call this function when a process it trying to
				3471	* write the debug registers (reading is always allowed)
				3472	*/
				3473	int
				3474	pfm_use_debug_registers(struct task_struct *task)
				3475	{
				3476	pfm_context_t *ctx = task->thread.pfm_context;
				3477	unsigned long flags;
				3478	int ret = 0;
				3479
				3480	if (pmu_conf->use_rr_dbregs == 0) return 0;
				3481
				3482	DPRINT(("called for [%d]\n", task->pid));
				3483
				3484	/*
				3485	* do it only once
				3486	*/
				3487	if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;
				3488
				3489	/*
				3490	* Even on SMP, we do not need to use an atomic here because
				3491	* the only way in is via ptrace() and this is possible only when the
				3492	* process is stopped. Even in the case where the ctxsw out is not totally
				3493	* completed by the time we come here, there is no way the 'stopped' process
				3494	* could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
				3495	* So this is always safe.
				3496	*/
				3497	if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
				3498
				3499	LOCK_PFS(flags);
				3500
				3501	/*
				3502	* We cannot allow setting breakpoints when system wide monitoring
				3503	* sessions are using the debug registers.
				3504	*/
				3505	if (pfm_sessions.pfs_sys_use_dbregs> 0)
				3506	ret = -1;
				3507	else
				3508	pfm_sessions.pfs_ptrace_use_dbregs++;
				3509
				3510	DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n",
				3511	pfm_sessions.pfs_ptrace_use_dbregs,
				3512	pfm_sessions.pfs_sys_use_dbregs,
				3513	task->pid, ret));
				3514
				3515	UNLOCK_PFS(flags);
				3516
				3517	return ret;
				3518	}
				3519
				3520	/*
				3521	* This function is called for every task that exits with the
				3522	* IA64_THREAD_DBG_VALID set. This indicates a task which was
				3523	* able to use the debug registers for debugging purposes via
				3524	* ptrace(). Therefore we know it was not using them for
				3525	* perfmormance monitoring, so we only decrement the number
				3526	* of "ptraced" debug register users to keep the count up to date
				3527	*/
				3528	int
				3529	pfm_release_debug_registers(struct task_struct *task)
				3530	{
				3531	unsigned long flags;
				3532	int ret;
				3533
				3534	if (pmu_conf->use_rr_dbregs == 0) return 0;
				3535
				3536	LOCK_PFS(flags);
				3537	if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
				3538	printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid);
				3539	ret = -1;
				3540	} else {
				3541	pfm_sessions.pfs_ptrace_use_dbregs--;
				3542	ret = 0;
				3543	}
				3544	UNLOCK_PFS(flags);
				3545
				3546	return ret;
				3547	}
				3548
				3549	static int
				3550	pfm_restart(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3551	{
				3552	struct task_struct *task;
				3553	pfm_buffer_fmt_t *fmt;
				3554	pfm_ovfl_ctrl_t rst_ctrl;
				3555	int state, is_system;
				3556	int ret = 0;
				3557
				3558	state = ctx->ctx_state;
				3559	fmt = ctx->ctx_buf_fmt;
				3560	is_system = ctx->ctx_fl_system;
				3561	task = PFM_CTX_TASK(ctx);
				3562
				3563	switch(state) {
				3564	case PFM_CTX_MASKED:
				3565	break;
				3566	case PFM_CTX_LOADED:
				3567	if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break;
				3568	/* fall through */
				3569	case PFM_CTX_UNLOADED:
				3570	case PFM_CTX_ZOMBIE:
				3571	DPRINT(("invalid state=%d\n", state));
				3572	return -EBUSY;
				3573	default:
				3574	DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state));
				3575	return -EINVAL;
				3576	}
				3577
				3578	/*
				3579	* In system wide and when the context is loaded, access can only happen
				3580	* when the caller is running on the CPU being monitored by the session.
				3581	* It does not have to be the owner (ctx_task) of the context per se.
				3582	*/
				3583	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				3584	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3585	return -EBUSY;
				3586	}
				3587
				3588	/* sanity check */
				3589	if (unlikely(task == NULL)) {
				3590	printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid);
				3591	return -EINVAL;
				3592	}
				3593
				3594	if (task == current \|\| is_system) {
				3595
				3596	fmt = ctx->ctx_buf_fmt;
				3597
				3598	DPRINT(("restarting self %d ovfl=0x%lx\n",
				3599	task->pid,
				3600	ctx->ctx_ovfl_regs[0]));
				3601
				3602	if (CTX_HAS_SMPL(ctx)) {
				3603
				3604	prefetch(ctx->ctx_smpl_hdr);
				3605
				3606	rst_ctrl.bits.mask_monitoring = 0;
				3607	rst_ctrl.bits.reset_ovfl_pmds = 0;
				3608
				3609	if (state == PFM_CTX_LOADED)
				3610	ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				3611	else
				3612	ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				3613	} else {
				3614	rst_ctrl.bits.mask_monitoring = 0;
				3615	rst_ctrl.bits.reset_ovfl_pmds = 1;
				3616	}
				3617
				3618	if (ret == 0) {
				3619	if (rst_ctrl.bits.reset_ovfl_pmds)
				3620	pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
				3621
				3622	if (rst_ctrl.bits.mask_monitoring == 0) {
				3623	DPRINT(("resuming monitoring for [%d]\n", task->pid));
				3624
				3625	if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task);
				3626	} else {
				3627	DPRINT(("keeping monitoring stopped for [%d]\n", task->pid));
				3628
				3629	// cannot use pfm_stop_monitoring(task, regs);
				3630	}
				3631	}
				3632	/*
				3633	* clear overflowed PMD mask to remove any stale information
				3634	*/
				3635	ctx->ctx_ovfl_regs[0] = 0UL;
				3636
				3637	/*
				3638	* back to LOADED state
				3639	*/
				3640	ctx->ctx_state = PFM_CTX_LOADED;
				3641
				3642	/*
				3643	* XXX: not really useful for self monitoring
				3644	*/
				3645	ctx->ctx_fl_can_restart = 0;
				3646
				3647	return 0;
				3648	}
				3649
				3650	/*
				3651	* restart another task
				3652	*/
				3653
				3654	/*
				3655	* When PFM_CTX_MASKED, we cannot issue a restart before the previous
				3656	* one is seen by the task.
				3657	*/
				3658	if (state == PFM_CTX_MASKED) {
				3659	if (ctx->ctx_fl_can_restart == 0) return -EINVAL;
				3660	/*
				3661	* will prevent subsequent restart before this one is
				3662	* seen by other task
				3663	*/
				3664	ctx->ctx_fl_can_restart = 0;
				3665	}
				3666
				3667	/*
				3668	* if blocking, then post the semaphore is PFM_CTX_MASKED, i.e.
				3669	* the task is blocked or on its way to block. That's the normal
				3670	* restart path. If the monitoring is not masked, then the task
				3671	* can be actively monitoring and we cannot directly intervene.
				3672	* Therefore we use the trap mechanism to catch the task and
				3673	* force it to reset the buffer/reset PMDs.
				3674	*
				3675	* if non-blocking, then we ensure that the task will go into
				3676	* pfm_handle_work() before returning to user mode.
				3677	*
				3678	* We cannot explicitely reset another task, it MUST always
				3679	* be done by the task itself. This works for system wide because
				3680	* the tool that is controlling the session is logically doing
				3681	* "self-monitoring".
				3682	*/
				3683	if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) {
				3684	DPRINT(("unblocking [%d] \n", task->pid));
				3685	up(&ctx->ctx_restart_sem);
				3686	} else {
				3687	DPRINT(("[%d] armed exit trap\n", task->pid));
				3688
				3689	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
				3690
				3691	PFM_SET_WORK_PENDING(task, 1);
				3692
				3693	pfm_set_task_notify(task);
				3694
				3695	/*
				3696	* XXX: send reschedule if task runs on another CPU
				3697	*/
				3698	}
				3699	return 0;
				3700	}
				3701
				3702	static int
				3703	pfm_debug(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3704	{
				3705	unsigned int m = (unsigned int )arg;
				3706
				3707	pfm_sysctl.debug = m == 0 ? 0 : 1;
				3708
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3709	printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off");
				3710
				3711	if (m == 0) {
				3712	memset(pfm_stats, 0, sizeof(pfm_stats));
				3713	for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL;
				3714	}
				3715	return 0;
				3716	}
				3717
				3718	/*
				3719	* arg can be NULL and count can be zero for this function
				3720	*/
				3721	static int
				3722	pfm_write_ibr_dbr(int mode, pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3723	{
				3724	struct thread_struct *thread = NULL;
				3725	struct task_struct *task;
				3726	pfarg_dbreg_t req = (pfarg_dbreg_t )arg;
				3727	unsigned long flags;
				3728	dbreg_t dbreg;
				3729	unsigned int rnum;
				3730	int first_time;
				3731	int ret = 0, state;
				3732	int i, can_access_pmu = 0;
				3733	int is_system, is_loaded;
				3734
				3735	if (pmu_conf->use_rr_dbregs == 0) return -EINVAL;
				3736
				3737	state = ctx->ctx_state;
				3738	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
				3739	is_system = ctx->ctx_fl_system;
				3740	task = ctx->ctx_task;
				3741
				3742	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
				3743
				3744	/*
				3745	* on both UP and SMP, we can only write to the PMC when the task is
				3746	* the owner of the local PMU.
				3747	*/
				3748	if (is_loaded) {
				3749	thread = &task->thread;
				3750	/*
				3751	* In system wide and when the context is loaded, access can only happen
				3752	* when the caller is running on the CPU being monitored by the session.
				3753	* It does not have to be the owner (ctx_task) of the context per se.
				3754	*/
				3755	if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
				3756	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				3757	return -EBUSY;
				3758	}
				3759	can_access_pmu = GET_PMU_OWNER() == task \|\| is_system ? 1 : 0;
				3760	}
				3761
				3762	/*
				3763	* we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w
				3764	* ensuring that no real breakpoint can be installed via this call.
				3765	*
				3766	* IMPORTANT: regs can be NULL in this function
				3767	*/
				3768
				3769	first_time = ctx->ctx_fl_using_dbreg == 0;
				3770
				3771	/*
				3772	* don't bother if we are loaded and task is being debugged
				3773	*/
				3774	if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) {
				3775	DPRINT(("debug registers already in use for [%d]\n", task->pid));
				3776	return -EBUSY;
				3777	}
				3778
				3779	/*
				3780	* check for debug registers in system wide mode
				3781	*
				3782	* If though a check is done in pfm_context_load(),
				3783	* we must repeat it here, in case the registers are
				3784	* written after the context is loaded
				3785	*/
				3786	if (is_loaded) {
				3787	LOCK_PFS(flags);
				3788
				3789	if (first_time && is_system) {
				3790	if (pfm_sessions.pfs_ptrace_use_dbregs)
				3791	ret = -EBUSY;
				3792	else
				3793	pfm_sessions.pfs_sys_use_dbregs++;
				3794	}
				3795	UNLOCK_PFS(flags);
				3796	}
				3797
				3798	if (ret != 0) return ret;
				3799
				3800	/*
				3801	* mark ourself as user of the debug registers for
				3802	* perfmon purposes.
				3803	*/
				3804	ctx->ctx_fl_using_dbreg = 1;
				3805
				3806	/*
				3807	* clear hardware registers to make sure we don't
				3808	* pick up stale state.
				3809	*
				3810	* for a system wide session, we do not use
				3811	* thread.dbr, thread.ibr because this process
				3812	* never leaves the current CPU and the state
				3813	* is shared by all processes running on it
				3814	*/
				3815	if (first_time && can_access_pmu) {
				3816	DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid));
				3817	for (i=0; i < pmu_conf->num_ibrs; i++) {
				3818	ia64_set_ibr(i, 0UL);
				3819	ia64_dv_serialize_instruction();
				3820	}
				3821	ia64_srlz_i();
				3822	for (i=0; i < pmu_conf->num_dbrs; i++) {
				3823	ia64_set_dbr(i, 0UL);
				3824	ia64_dv_serialize_data();
				3825	}
				3826	ia64_srlz_d();
				3827	}
				3828
				3829	/*
				3830	* Now install the values into the registers
				3831	*/
				3832	for (i = 0; i < count; i++, req++) {
				3833
				3834	rnum = req->dbreg_num;
				3835	dbreg.val = req->dbreg_value;
				3836
				3837	ret = -EINVAL;
				3838
				3839	if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) \|\| ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) {
				3840	DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
				3841	rnum, dbreg.val, mode, i, count));
				3842
				3843	goto abort_mission;
				3844	}
				3845
				3846	/*
				3847	* make sure we do not install enabled breakpoint
				3848	*/
				3849	if (rnum & 0x1) {
				3850	if (mode == PFM_CODE_RR)
				3851	dbreg.ibr.ibr_x = 0;
				3852	else
				3853	dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
				3854	}
				3855
				3856	PFM_REG_RETFLAG_SET(req->dbreg_flags, 0);
				3857
				3858	/*
				3859	* Debug registers, just like PMC, can only be modified
				3860	* by a kernel call. Moreover, perfmon() access to those
				3861	* registers are centralized in this routine. The hardware
				3862	* does not modify the value of these registers, therefore,
				3863	* if we save them as they are written, we can avoid having
				3864	* to save them on context switch out. This is made possible
				3865	* by the fact that when perfmon uses debug registers, ptrace()
				3866	* won't be able to modify them concurrently.
				3867	*/
				3868	if (mode == PFM_CODE_RR) {
				3869	CTX_USED_IBR(ctx, rnum);
				3870
				3871	if (can_access_pmu) {
				3872	ia64_set_ibr(rnum, dbreg.val);
				3873	ia64_dv_serialize_instruction();
				3874	}
				3875
				3876	ctx->ctx_ibrs[rnum] = dbreg.val;
				3877
				3878	DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n",
				3879	rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu));
				3880	} else {
				3881	CTX_USED_DBR(ctx, rnum);
				3882
				3883	if (can_access_pmu) {
				3884	ia64_set_dbr(rnum, dbreg.val);
				3885	ia64_dv_serialize_data();
				3886	}
				3887	ctx->ctx_dbrs[rnum] = dbreg.val;
				3888
				3889	DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n",
				3890	rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu));
				3891	}
				3892	}
				3893
				3894	return 0;
				3895
				3896	abort_mission:
				3897	/*
				3898	* in case it was our first attempt, we undo the global modifications
				3899	*/
				3900	if (first_time) {
				3901	LOCK_PFS(flags);
				3902	if (ctx->ctx_fl_system) {
				3903	pfm_sessions.pfs_sys_use_dbregs--;
				3904	}
				3905	UNLOCK_PFS(flags);
				3906	ctx->ctx_fl_using_dbreg = 0;
				3907	}
				3908	/*
				3909	* install error return flag
				3910	*/
				3911	PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL);
				3912
				3913	return ret;
				3914	}
				3915
				3916	static int
				3917	pfm_write_ibrs(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3918	{
				3919	return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs);
				3920	}
				3921
				3922	static int
				3923	pfm_write_dbrs(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3924	{
				3925	return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs);
				3926	}
				3927
				3928	int
				3929	pfm_mod_write_ibrs(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3930	{
				3931	pfm_context_t *ctx;
				3932
				3933	if (req == NULL) return -EINVAL;
				3934
				3935	ctx = GET_PMU_CTX();
				3936
				3937	if (ctx == NULL) return -EINVAL;
				3938
				3939	/*
				3940	* for now limit to current task, which is enough when calling
				3941	* from overflow handler
				3942	*/
				3943	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3944
				3945	return pfm_write_ibrs(ctx, req, nreq, regs);
				3946	}
				3947	EXPORT_SYMBOL(pfm_mod_write_ibrs);
				3948
				3949	int
				3950	pfm_mod_write_dbrs(struct task_struct task, void req, unsigned int nreq, struct pt_regs *regs)
				3951	{
				3952	pfm_context_t *ctx;
				3953
				3954	if (req == NULL) return -EINVAL;
				3955
				3956	ctx = GET_PMU_CTX();
				3957
				3958	if (ctx == NULL) return -EINVAL;
				3959
				3960	/*
				3961	* for now limit to current task, which is enough when calling
				3962	* from overflow handler
				3963	*/
				3964	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
				3965
				3966	return pfm_write_dbrs(ctx, req, nreq, regs);
				3967	}
				3968	EXPORT_SYMBOL(pfm_mod_write_dbrs);
				3969
				3970
				3971	static int
				3972	pfm_get_features(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3973	{
				3974	pfarg_features_t req = (pfarg_features_t )arg;
				3975
				3976	req->ft_version = PFM_VERSION;
				3977	return 0;
				3978	}
				3979
				3980	static int
				3981	pfm_stop(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				3982	{
				3983	struct pt_regs *tregs;
				3984	struct task_struct *task = PFM_CTX_TASK(ctx);
				3985	int state, is_system;
				3986
				3987	state = ctx->ctx_state;
				3988	is_system = ctx->ctx_fl_system;
				3989
				3990	/*
				3991	* context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE)
				3992	*/
				3993	if (state == PFM_CTX_UNLOADED) return -EINVAL;
				3994
				3995	/*
				3996	* In system wide and when the context is loaded, access can only happen
				3997	* when the caller is running on the CPU being monitored by the session.
				3998	* It does not have to be the owner (ctx_task) of the context per se.
				3999	*/
				4000	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				4001	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				4002	return -EBUSY;
				4003	}
				4004	DPRINT(("task [%d] ctx_state=%d is_system=%d\n",
				4005	PFM_CTX_TASK(ctx)->pid,
				4006	state,
				4007	is_system));
				4008	/*
				4009	* in system mode, we need to update the PMU directly
				4010	* and the user level state of the caller, which may not
				4011	* necessarily be the creator of the context.
				4012	*/
				4013	if (is_system) {
				4014	/*
				4015	* Update local PMU first
				4016	*
				4017	* disable dcr pp
				4018	*/
				4019	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
				4020	ia64_srlz_i();
				4021
				4022	/*
				4023	* update local cpuinfo
				4024	*/
				4025	PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
				4026
				4027	/*
				4028	* stop monitoring, does srlz.i
				4029	*/
				4030	pfm_clear_psr_pp();
				4031
				4032	/*
				4033	* stop monitoring in the caller
				4034	*/
				4035	ia64_psr(regs)->pp = 0;
				4036
				4037	return 0;
				4038	}
				4039	/*
				4040	* per-task mode
				4041	*/
				4042
				4043	if (task == current) {
				4044	/* stop monitoring at kernel level */
				4045	pfm_clear_psr_up();
				4046
				4047	/*
				4048	* stop monitoring at the user level
				4049	*/
				4050	ia64_psr(regs)->up = 0;
				4051	} else {
				4052	tregs = ia64_task_regs(task);
				4053
				4054	/*
				4055	* stop monitoring at the user level
				4056	*/
				4057	ia64_psr(tregs)->up = 0;
				4058
				4059	/*
				4060	* monitoring disabled in kernel at next reschedule
				4061	*/
				4062	ctx->ctx_saved_psr_up = 0;
				4063	DPRINT(("task=[%d]\n", task->pid));
				4064	}
				4065	return 0;
				4066	}
				4067
				4068
				4069	static int
				4070	pfm_start(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4071	{
				4072	struct pt_regs *tregs;
				4073	int state, is_system;
				4074
				4075	state = ctx->ctx_state;
				4076	is_system = ctx->ctx_fl_system;
				4077
				4078	if (state != PFM_CTX_LOADED) return -EINVAL;
				4079
				4080	/*
				4081	* In system wide and when the context is loaded, access can only happen
				4082	* when the caller is running on the CPU being monitored by the session.
				4083	* It does not have to be the owner (ctx_task) of the context per se.
				4084	*/
				4085	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
				4086	DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
				4087	return -EBUSY;
				4088	}
				4089
				4090	/*
				4091	* in system mode, we need to update the PMU directly
				4092	* and the user level state of the caller, which may not
				4093	* necessarily be the creator of the context.
				4094	*/
				4095	if (is_system) {
				4096
				4097	/*
				4098	* set user level psr.pp for the caller
				4099	*/
				4100	ia64_psr(regs)->pp = 1;
				4101
				4102	/*
				4103	* now update the local PMU and cpuinfo
				4104	*/
				4105	PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
				4106
				4107	/*
				4108	* start monitoring at kernel level
				4109	*/
				4110	pfm_set_psr_pp();
				4111
				4112	/* enable dcr pp */
				4113	ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) \| IA64_DCR_PP);
				4114	ia64_srlz_i();
				4115
				4116	return 0;
				4117	}
				4118
				4119	/*
				4120	* per-process mode
				4121	*/
				4122
				4123	if (ctx->ctx_task == current) {
				4124
				4125	/* start monitoring at kernel level */
				4126	pfm_set_psr_up();
				4127
				4128	/*
				4129	* activate monitoring at user level
				4130	*/
				4131	ia64_psr(regs)->up = 1;
				4132
				4133	} else {
				4134	tregs = ia64_task_regs(ctx->ctx_task);
				4135
				4136	/*
				4137	* start monitoring at the kernel level the next
				4138	* time the task is scheduled
				4139	*/
				4140	ctx->ctx_saved_psr_up = IA64_PSR_UP;
				4141
				4142	/*
				4143	* activate monitoring at user level
				4144	*/
				4145	ia64_psr(tregs)->up = 1;
				4146	}
				4147	return 0;
				4148	}
				4149
				4150	static int
				4151	pfm_get_pmc_reset(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4152	{
				4153	pfarg_reg_t req = (pfarg_reg_t )arg;
				4154	unsigned int cnum;
				4155	int i;
				4156	int ret = -EINVAL;
				4157
				4158	for (i = 0; i < count; i++, req++) {
				4159
				4160	cnum = req->reg_num;
				4161
				4162	if (!PMC_IS_IMPL(cnum)) goto abort_mission;
				4163
				4164	req->reg_value = PMC_DFL_VAL(cnum);
				4165
				4166	PFM_REG_RETFLAG_SET(req->reg_flags, 0);
				4167
				4168	DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value));
				4169	}
				4170	return 0;
				4171
				4172	abort_mission:
				4173	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
				4174	return ret;
				4175	}
				4176
				4177	static int
				4178	pfm_check_task_exist(pfm_context_t *ctx)
				4179	{
				4180	struct task_struct g, t;
				4181	int ret = -ESRCH;
				4182
				4183	read_lock(&tasklist_lock);
				4184
				4185	do_each_thread (g, t) {
				4186	if (t->thread.pfm_context == ctx) {
				4187	ret = 0;
				4188	break;
				4189	}
				4190	} while_each_thread (g, t);
				4191
				4192	read_unlock(&tasklist_lock);
				4193
				4194	DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx));
				4195
				4196	return ret;
				4197	}
				4198
				4199	static int
				4200	pfm_context_load(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4201	{
				4202	struct task_struct *task;
				4203	struct thread_struct *thread;
				4204	struct pfm_context_t *old;
				4205	unsigned long flags;
				4206	#ifndef CONFIG_SMP
				4207	struct task_struct *owner_task = NULL;
				4208	#endif
				4209	pfarg_load_t req = (pfarg_load_t )arg;
				4210	unsigned long pmcs_source, pmds_source;
				4211	int the_cpu;
				4212	int ret = 0;
				4213	int state, is_system, set_dbregs = 0;
				4214
				4215	state = ctx->ctx_state;
				4216	is_system = ctx->ctx_fl_system;
				4217	/*
				4218	* can only load from unloaded or terminated state
				4219	*/
				4220	if (state != PFM_CTX_UNLOADED) {
				4221	DPRINT(("cannot load to [%d], invalid ctx_state=%d\n",
				4222	req->load_pid,
				4223	ctx->ctx_state));
stephane eranian	a5a70b7	2005-04-18 11:42:00 -0700	[diff] [blame]	4224	return -EBUSY;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4225	}
				4226
				4227	DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg));
				4228
				4229	if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) {
				4230	DPRINT(("cannot use blocking mode on self\n"));
				4231	return -EINVAL;
				4232	}
				4233
				4234	ret = pfm_get_task(ctx, req->load_pid, &task);
				4235	if (ret) {
				4236	DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret));
				4237	return ret;
				4238	}
				4239
				4240	ret = -EINVAL;
				4241
				4242	/*
				4243	* system wide is self monitoring only
				4244	*/
				4245	if (is_system && task != current) {
				4246	DPRINT(("system wide is self monitoring only load_pid=%d\n",
				4247	req->load_pid));
				4248	goto error;
				4249	}
				4250
				4251	thread = &task->thread;
				4252
				4253	ret = 0;
				4254	/*
				4255	* cannot load a context which is using range restrictions,
				4256	* into a task that is being debugged.
				4257	*/
				4258	if (ctx->ctx_fl_using_dbreg) {
				4259	if (thread->flags & IA64_THREAD_DBG_VALID) {
				4260	ret = -EBUSY;
				4261	DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid));
				4262	goto error;
				4263	}
				4264	LOCK_PFS(flags);
				4265
				4266	if (is_system) {
				4267	if (pfm_sessions.pfs_ptrace_use_dbregs) {
				4268	DPRINT(("cannot load [%d] dbregs in use\n", task->pid));
				4269	ret = -EBUSY;
				4270	} else {
				4271	pfm_sessions.pfs_sys_use_dbregs++;
				4272	DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs));
				4273	set_dbregs = 1;
				4274	}
				4275	}
				4276
				4277	UNLOCK_PFS(flags);
				4278
				4279	if (ret) goto error;
				4280	}
				4281
				4282	/*
				4283	* SMP system-wide monitoring implies self-monitoring.
				4284	*
				4285	* The programming model expects the task to
				4286	* be pinned on a CPU throughout the session.
				4287	* Here we take note of the current CPU at the
				4288	* time the context is loaded. No call from
				4289	* another CPU will be allowed.
				4290	*
				4291	* The pinning via shed_setaffinity()
				4292	* must be done by the calling task prior
				4293	* to this call.
				4294	*
				4295	* systemwide: keep track of CPU this session is supposed to run on
				4296	*/
				4297	the_cpu = ctx->ctx_cpu = smp_processor_id();
				4298
				4299	ret = -EBUSY;
				4300	/*
				4301	* now reserve the session
				4302	*/
				4303	ret = pfm_reserve_session(current, is_system, the_cpu);
				4304	if (ret) goto error;
				4305
				4306	/*
				4307	* task is necessarily stopped at this point.
				4308	*
				4309	* If the previous context was zombie, then it got removed in
				4310	* pfm_save_regs(). Therefore we should not see it here.
				4311	* If we see a context, then this is an active context
				4312	*
				4313	* XXX: needs to be atomic
				4314	*/
				4315	DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n",
				4316	thread->pfm_context, ctx));
				4317
stephane.eranian@hp.com	6bf11e8	2005-07-28 05:18:00 -0700	[diff] [blame]	4318	ret = -EBUSY;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4319	old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *));
				4320	if (old != NULL) {
				4321	DPRINT(("load_pid [%d] already has a context\n", req->load_pid));
				4322	goto error_unres;
				4323	}
				4324
				4325	pfm_reset_msgq(ctx);
				4326
				4327	ctx->ctx_state = PFM_CTX_LOADED;
				4328
				4329	/*
				4330	* link context to task
				4331	*/
				4332	ctx->ctx_task = task;
				4333
				4334	if (is_system) {
				4335	/*
				4336	* we load as stopped
				4337	*/
				4338	PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
				4339	PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
				4340
				4341	if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
				4342	} else {
				4343	thread->flags \|= IA64_THREAD_PM_VALID;
				4344	}
				4345
				4346	/*
				4347	* propagate into thread-state
				4348	*/
				4349	pfm_copy_pmds(task, ctx);
				4350	pfm_copy_pmcs(task, ctx);
				4351
				4352	pmcs_source = thread->pmcs;
				4353	pmds_source = thread->pmds;
				4354
				4355	/*
				4356	* always the case for system-wide
				4357	*/
				4358	if (task == current) {
				4359
				4360	if (is_system == 0) {
				4361
				4362	/* allow user level control */
				4363	ia64_psr(regs)->sp = 0;
				4364	DPRINT(("clearing psr.sp for [%d]\n", task->pid));
				4365
				4366	SET_LAST_CPU(ctx, smp_processor_id());
				4367	INC_ACTIVATION();
				4368	SET_ACTIVATION(ctx);
				4369	#ifndef CONFIG_SMP
				4370	/*
				4371	* push the other task out, if any
				4372	*/
				4373	owner_task = GET_PMU_OWNER();
				4374	if (owner_task) pfm_lazy_save_regs(owner_task);
				4375	#endif
				4376	}
				4377	/*
				4378	* load all PMD from ctx to PMU (as opposed to thread state)
				4379	* restore all PMC from ctx to PMU
				4380	*/
				4381	pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]);
				4382	pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]);
				4383
				4384	ctx->ctx_reload_pmcs[0] = 0UL;
				4385	ctx->ctx_reload_pmds[0] = 0UL;
				4386
				4387	/*
				4388	* guaranteed safe by earlier check against DBG_VALID
				4389	*/
				4390	if (ctx->ctx_fl_using_dbreg) {
				4391	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				4392	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				4393	}
				4394	/*
				4395	* set new ownership
				4396	*/
				4397	SET_PMU_OWNER(task, ctx);
				4398
				4399	DPRINT(("context loaded on PMU for [%d]\n", task->pid));
				4400	} else {
				4401	/*
				4402	* when not current, task MUST be stopped, so this is safe
				4403	*/
				4404	regs = ia64_task_regs(task);
				4405
				4406	/* force a full reload */
				4407	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
				4408	SET_LAST_CPU(ctx, -1);
				4409
				4410	/* initial saved psr (stopped) */
				4411	ctx->ctx_saved_psr_up = 0UL;
				4412	ia64_psr(regs)->up = ia64_psr(regs)->pp = 0;
				4413	}
				4414
				4415	ret = 0;
				4416
				4417	error_unres:
				4418	if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu);
				4419	error:
				4420	/*
				4421	* we must undo the dbregs setting (for system-wide)
				4422	*/
				4423	if (ret && set_dbregs) {
				4424	LOCK_PFS(flags);
				4425	pfm_sessions.pfs_sys_use_dbregs--;
				4426	UNLOCK_PFS(flags);
				4427	}
				4428	/*
				4429	* release task, there is now a link with the context
				4430	*/
				4431	if (is_system == 0 && task != current) {
				4432	pfm_put_task(task);
				4433
				4434	if (ret == 0) {
				4435	ret = pfm_check_task_exist(ctx);
				4436	if (ret) {
				4437	ctx->ctx_state = PFM_CTX_UNLOADED;
				4438	ctx->ctx_task = NULL;
				4439	}
				4440	}
				4441	}
				4442	return ret;
				4443	}
				4444
				4445	/*
				4446	* in this function, we do not need to increase the use count
				4447	* for the task via get_task_struct(), because we hold the
				4448	* context lock. If the task were to disappear while having
				4449	* a context attached, it would go through pfm_exit_thread()
				4450	* which also grabs the context lock and would therefore be blocked
				4451	* until we are here.
				4452	*/
				4453	static void pfm_flush_pmds(struct task_struct , pfm_context_t ctx);
				4454
				4455	static int
				4456	pfm_context_unload(pfm_context_t ctx, void arg, int count, struct pt_regs *regs)
				4457	{
				4458	struct task_struct *task = PFM_CTX_TASK(ctx);
				4459	struct pt_regs *tregs;
				4460	int prev_state, is_system;
				4461	int ret;
				4462
				4463	DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
				4464
				4465	prev_state = ctx->ctx_state;
				4466	is_system = ctx->ctx_fl_system;
				4467
				4468	/*
				4469	* unload only when necessary
				4470	*/
				4471	if (prev_state == PFM_CTX_UNLOADED) {
				4472	DPRINT(("ctx_state=%d, nothing to do\n", prev_state));
				4473	return 0;
				4474	}
				4475
				4476	/*
				4477	* clear psr and dcr bits
				4478	*/
				4479	ret = pfm_stop(ctx, NULL, 0, regs);
				4480	if (ret) return ret;
				4481
				4482	ctx->ctx_state = PFM_CTX_UNLOADED;
				4483
				4484	/*
				4485	* in system mode, we need to update the PMU directly
				4486	* and the user level state of the caller, which may not
				4487	* necessarily be the creator of the context.
				4488	*/
				4489	if (is_system) {
				4490
				4491	/*
				4492	* Update cpuinfo
				4493	*
				4494	* local PMU is taken care of in pfm_stop()
				4495	*/
				4496	PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
				4497	PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
				4498
				4499	/*
				4500	* save PMDs in context
				4501	* release ownership
				4502	*/
				4503	pfm_flush_pmds(current, ctx);
				4504
				4505	/*
				4506	* at this point we are done with the PMU
				4507	* so we can unreserve the resource.
				4508	*/
				4509	if (prev_state != PFM_CTX_ZOMBIE)
				4510	pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu);
				4511
				4512	/*
				4513	* disconnect context from task
				4514	*/
				4515	task->thread.pfm_context = NULL;
				4516	/*
				4517	* disconnect task from context
				4518	*/
				4519	ctx->ctx_task = NULL;
				4520
				4521	/*
				4522	* There is nothing more to cleanup here.
				4523	*/
				4524	return 0;
				4525	}
				4526
				4527	/*
				4528	* per-task mode
				4529	*/
				4530	tregs = task == current ? regs : ia64_task_regs(task);
				4531
				4532	if (task == current) {
				4533	/*
				4534	* cancel user level control
				4535	*/
				4536	ia64_psr(regs)->sp = 1;
				4537
				4538	DPRINT(("setting psr.sp for [%d]\n", task->pid));
				4539	}
				4540	/*
				4541	* save PMDs to context
				4542	* release ownership
				4543	*/
				4544	pfm_flush_pmds(task, ctx);
				4545
				4546	/*
				4547	* at this point we are done with the PMU
				4548	* so we can unreserve the resource.
				4549	*
				4550	* when state was ZOMBIE, we have already unreserved.
				4551	*/
				4552	if (prev_state != PFM_CTX_ZOMBIE)
				4553	pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu);
				4554
				4555	/*
				4556	* reset activation counter and psr
				4557	*/
				4558	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
				4559	SET_LAST_CPU(ctx, -1);
				4560
				4561	/*
				4562	* PMU state will not be restored
				4563	*/
				4564	task->thread.flags &= ~IA64_THREAD_PM_VALID;
				4565
				4566	/*
				4567	* break links between context and task
				4568	*/
				4569	task->thread.pfm_context = NULL;
				4570	ctx->ctx_task = NULL;
				4571
				4572	PFM_SET_WORK_PENDING(task, 0);
				4573
				4574	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
				4575	ctx->ctx_fl_can_restart = 0;
				4576	ctx->ctx_fl_going_zombie = 0;
				4577
				4578	DPRINT(("disconnected [%d] from context\n", task->pid));
				4579
				4580	return 0;
				4581	}
				4582
				4583
				4584	/*
				4585	* called only from exit_thread(): task == current
				4586	* we come here only if current has a context attached (loaded or masked)
				4587	*/
				4588	void
				4589	pfm_exit_thread(struct task_struct *task)
				4590	{
				4591	pfm_context_t *ctx;
				4592	unsigned long flags;
				4593	struct pt_regs *regs = ia64_task_regs(task);
				4594	int ret, state;
				4595	int free_ok = 0;
				4596
				4597	ctx = PFM_GET_CTX(task);
				4598
				4599	PROTECT_CTX(ctx, flags);
				4600
				4601	DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid));
				4602
				4603	state = ctx->ctx_state;
				4604	switch(state) {
				4605	case PFM_CTX_UNLOADED:
				4606	/*
				4607	* only comes to thios function if pfm_context is not NULL, i.e., cannot
				4608	* be in unloaded state
				4609	*/
				4610	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid);
				4611	break;
				4612	case PFM_CTX_LOADED:
				4613	case PFM_CTX_MASKED:
				4614	ret = pfm_context_unload(ctx, NULL, 0, regs);
				4615	if (ret) {
				4616	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
				4617	}
				4618	DPRINT(("ctx unloaded for current state was %d\n", state));
				4619
				4620	pfm_end_notify_user(ctx);
				4621	break;
				4622	case PFM_CTX_ZOMBIE:
				4623	ret = pfm_context_unload(ctx, NULL, 0, regs);
				4624	if (ret) {
				4625	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
				4626	}
				4627	free_ok = 1;
				4628	break;
				4629	default:
				4630	printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state);
				4631	break;
				4632	}
				4633	UNPROTECT_CTX(ctx, flags);
				4634
				4635	{ u64 psr = pfm_get_psr();
				4636	BUG_ON(psr & (IA64_PSR_UP\|IA64_PSR_PP));
				4637	BUG_ON(GET_PMU_OWNER());
				4638	BUG_ON(ia64_psr(regs)->up);
				4639	BUG_ON(ia64_psr(regs)->pp);
				4640	}
				4641
				4642	/*
				4643	* All memory free operations (especially for vmalloc'ed memory)
				4644	* MUST be done with interrupts ENABLED.
				4645	*/
				4646	if (free_ok) pfm_context_free(ctx);
				4647	}
				4648
				4649	/*
				4650	* functions MUST be listed in the increasing order of their index (see permfon.h)
				4651	*/
				4652	#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz }
				4653	#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL }
				4654	#define PFM_CMD_PCLRWS (PFM_CMD_FD\|PFM_CMD_ARG_RW\|PFM_CMD_STOP)
				4655	#define PFM_CMD_PCLRW (PFM_CMD_FD\|PFM_CMD_ARG_RW)
				4656	#define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL}
				4657
				4658	static pfm_cmd_desc_t pfm_cmd_tab[]={
				4659	/* 0 */PFM_CMD_NONE,
				4660	/* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4661	/* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4662	/* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4663	/* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS),
				4664	/* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS),
				4665	/* 6 */PFM_CMD_NONE,
				4666	/* 7 */PFM_CMD_NONE,
				4667	/* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize),
				4668	/* 9 */PFM_CMD_NONE,
				4669	/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW),
				4670	/* 11 */PFM_CMD_NONE,
				4671	/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL),
				4672	/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL),
				4673	/* 14 */PFM_CMD_NONE,
				4674	/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
				4675	/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL),
				4676	/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS),
				4677	/* 18 */PFM_CMD_NONE,
				4678	/* 19 */PFM_CMD_NONE,
				4679	/* 20 */PFM_CMD_NONE,
				4680	/* 21 */PFM_CMD_NONE,
				4681	/* 22 */PFM_CMD_NONE,
				4682	/* 23 */PFM_CMD_NONE,
				4683	/* 24 */PFM_CMD_NONE,
				4684	/* 25 */PFM_CMD_NONE,
				4685	/* 26 */PFM_CMD_NONE,
				4686	/* 27 */PFM_CMD_NONE,
				4687	/* 28 */PFM_CMD_NONE,
				4688	/* 29 */PFM_CMD_NONE,
				4689	/* 30 */PFM_CMD_NONE,
				4690	/* 31 */PFM_CMD_NONE,
				4691	/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL),
				4692	/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL)
				4693	};
				4694	#define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
				4695
				4696	static int
				4697	pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
				4698	{
				4699	struct task_struct *task;
				4700	int state, old_state;
				4701
				4702	recheck:
				4703	state = ctx->ctx_state;
				4704	task = ctx->ctx_task;
				4705
				4706	if (task == NULL) {
				4707	DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state));
				4708	return 0;
				4709	}
				4710
				4711	DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
				4712	ctx->ctx_fd,
				4713	state,
				4714	task->pid,
				4715	task->state, PFM_CMD_STOPPED(cmd)));
				4716
				4717	/*
				4718	* self-monitoring always ok.
				4719	*
				4720	* for system-wide the caller can either be the creator of the
				4721	* context (to one to which the context is attached to) OR
				4722	* a task running on the same CPU as the session.
				4723	*/
				4724	if (task == current \|\| ctx->ctx_fl_system) return 0;
				4725
				4726	/*
stephane eranian	a5a70b7	2005-04-18 11:42:00 -0700	[diff] [blame]	4727	* we are monitoring another thread
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4728	*/
stephane eranian	a5a70b7	2005-04-18 11:42:00 -0700	[diff] [blame]	4729	switch(state) {
				4730	case PFM_CTX_UNLOADED:
				4731	/*
				4732	* if context is UNLOADED we are safe to go
				4733	*/
				4734	return 0;
				4735	case PFM_CTX_ZOMBIE:
				4736	/*
				4737	* no command can operate on a zombie context
				4738	*/
				4739	DPRINT(("cmd %d state zombie cannot operate on context\n", cmd));
				4740	return -EINVAL;
				4741	case PFM_CTX_MASKED:
				4742	/*
				4743	* PMU state has been saved to software even though
				4744	* the thread may still be running.
				4745	*/
				4746	if (cmd != PFM_UNLOAD_CONTEXT) return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4747	}
				4748
				4749	/*
				4750	* context is LOADED or MASKED. Some commands may need to have
				4751	* the task stopped.
				4752	*
				4753	* We could lift this restriction for UP but it would mean that
				4754	* the user has no guarantee the task would not run between
				4755	* two successive calls to perfmonctl(). That's probably OK.
				4756	* If this user wants to ensure the task does not run, then
				4757	* the task must be stopped.
				4758	*/
				4759	if (PFM_CMD_STOPPED(cmd)) {
				4760	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
				4761	DPRINT(("[%d] task not in stopped state\n", task->pid));
				4762	return -EBUSY;
				4763	}
				4764	/*
				4765	* task is now stopped, wait for ctxsw out
				4766	*
				4767	* This is an interesting point in the code.
				4768	* We need to unprotect the context because
				4769	* the pfm_save_regs() routines needs to grab
				4770	* the same lock. There are danger in doing
				4771	* this because it leaves a window open for
				4772	* another task to get access to the context
				4773	* and possibly change its state. The one thing
				4774	* that is not possible is for the context to disappear
				4775	* because we are protected by the VFS layer, i.e.,
				4776	* get_fd()/put_fd().
				4777	*/
				4778	old_state = state;
				4779
				4780	UNPROTECT_CTX(ctx, flags);
				4781
				4782	wait_task_inactive(task);
				4783
				4784	PROTECT_CTX(ctx, flags);
				4785
				4786	/*
				4787	* we must recheck to verify if state has changed
				4788	*/
				4789	if (ctx->ctx_state != old_state) {
				4790	DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state));
				4791	goto recheck;
				4792	}
				4793	}
				4794	return 0;
				4795	}
				4796
				4797	/*
				4798	* system-call entry point (must return long)
				4799	*/
				4800	asmlinkage long
				4801	sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
				4802	{
				4803	struct file *file = NULL;
				4804	pfm_context_t *ctx = NULL;
				4805	unsigned long flags = 0UL;
				4806	void *args_k = NULL;
				4807	long ret; /* will expand int return types */
				4808	size_t base_sz, sz, xtra_sz = 0;
				4809	int narg, completed_args = 0, call_made = 0, cmd_flags;
				4810	int (func)(pfm_context_t ctx, void arg, int count, struct pt_regs regs);
				4811	int (getsize)(void arg, size_t *sz);
				4812	#define PFM_MAX_ARGSIZE 4096
				4813
				4814	/*
				4815	* reject any call if perfmon was disabled at initialization
				4816	*/
				4817	if (unlikely(pmu_conf == NULL)) return -ENOSYS;
				4818
				4819	if (unlikely(cmd < 0 \|\| cmd >= PFM_CMD_COUNT)) {
				4820	DPRINT(("invalid cmd=%d\n", cmd));
				4821	return -EINVAL;
				4822	}
				4823
				4824	func = pfm_cmd_tab[cmd].cmd_func;
				4825	narg = pfm_cmd_tab[cmd].cmd_narg;
				4826	base_sz = pfm_cmd_tab[cmd].cmd_argsize;
				4827	getsize = pfm_cmd_tab[cmd].cmd_getsize;
				4828	cmd_flags = pfm_cmd_tab[cmd].cmd_flags;
				4829
				4830	if (unlikely(func == NULL)) {
				4831	DPRINT(("invalid cmd=%d\n", cmd));
				4832	return -EINVAL;
				4833	}
				4834
				4835	DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n",
				4836	PFM_CMD_NAME(cmd),
				4837	cmd,
				4838	narg,
				4839	base_sz,
				4840	count));
				4841
				4842	/*
				4843	* check if number of arguments matches what the command expects
				4844	*/
				4845	if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) \|\| (narg > 0 && narg != count)))
				4846	return -EINVAL;
				4847
				4848	restart_args:
				4849	sz = xtra_sz + base_sz*count;
				4850	/*
				4851	* limit abuse to min page size
				4852	*/
				4853	if (unlikely(sz > PFM_MAX_ARGSIZE)) {
				4854	printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz);
				4855	return -E2BIG;
				4856	}
				4857
				4858	/*
				4859	* allocate default-sized argument buffer
				4860	*/
				4861	if (likely(count && args_k == NULL)) {
				4862	args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL);
				4863	if (args_k == NULL) return -ENOMEM;
				4864	}
				4865
				4866	ret = -EFAULT;
				4867
				4868	/*
				4869	* copy arguments
				4870	*
				4871	* assume sz = 0 for command without parameters
				4872	*/
				4873	if (sz && copy_from_user(args_k, arg, sz)) {
				4874	DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg));
				4875	goto error_args;
				4876	}
				4877
				4878	/*
				4879	* check if command supports extra parameters
				4880	*/
				4881	if (completed_args == 0 && getsize) {
				4882	/*
				4883	* get extra parameters size (based on main argument)
				4884	*/
				4885	ret = (*getsize)(args_k, &xtra_sz);
				4886	if (ret) goto error_args;
				4887
				4888	completed_args = 1;
				4889
				4890	DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz));
				4891
				4892	/* retry if necessary */
				4893	if (likely(xtra_sz)) goto restart_args;
				4894	}
				4895
				4896	if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd;
				4897
				4898	ret = -EBADF;
				4899
				4900	file = fget(fd);
				4901	if (unlikely(file == NULL)) {
				4902	DPRINT(("invalid fd %d\n", fd));
				4903	goto error_args;
				4904	}
				4905	if (unlikely(PFM_IS_FILE(file) == 0)) {
				4906	DPRINT(("fd %d not related to perfmon\n", fd));
				4907	goto error_args;
				4908	}
				4909
				4910	ctx = (pfm_context_t *)file->private_data;
				4911	if (unlikely(ctx == NULL)) {
				4912	DPRINT(("no context for fd %d\n", fd));
				4913	goto error_args;
				4914	}
				4915	prefetch(&ctx->ctx_state);
				4916
				4917	PROTECT_CTX(ctx, flags);
				4918
				4919	/*
				4920	* check task is stopped
				4921	*/
				4922	ret = pfm_check_task_state(ctx, cmd, flags);
				4923	if (unlikely(ret)) goto abort_locked;
				4924
				4925	skip_fd:
				4926	ret = (*func)(ctx, args_k, count, ia64_task_regs(current));
				4927
				4928	call_made = 1;
				4929
				4930	abort_locked:
				4931	if (likely(ctx)) {
				4932	DPRINT(("context unlocked\n"));
				4933	UNPROTECT_CTX(ctx, flags);
				4934	fput(file);
				4935	}
				4936
				4937	/* copy argument back to user, if needed */
				4938	if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
				4939
				4940	error_args:
				4941	if (args_k) kfree(args_k);
				4942
				4943	DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret));
				4944
				4945	return ret;
				4946	}
				4947
				4948	static void
				4949	pfm_resume_after_ovfl(pfm_context_t ctx, unsigned long ovfl_regs, struct pt_regs regs)
				4950	{
				4951	pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt;
				4952	pfm_ovfl_ctrl_t rst_ctrl;
				4953	int state;
				4954	int ret = 0;
				4955
				4956	state = ctx->ctx_state;
				4957	/*
				4958	* Unlock sampling buffer and reset index atomically
				4959	* XXX: not really needed when blocking
				4960	*/
				4961	if (CTX_HAS_SMPL(ctx)) {
				4962
				4963	rst_ctrl.bits.mask_monitoring = 0;
				4964	rst_ctrl.bits.reset_ovfl_pmds = 0;
				4965
				4966	if (state == PFM_CTX_LOADED)
				4967	ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				4968	else
				4969	ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
				4970	} else {
				4971	rst_ctrl.bits.mask_monitoring = 0;
				4972	rst_ctrl.bits.reset_ovfl_pmds = 1;
				4973	}
				4974
				4975	if (ret == 0) {
				4976	if (rst_ctrl.bits.reset_ovfl_pmds) {
				4977	pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET);
				4978	}
				4979	if (rst_ctrl.bits.mask_monitoring == 0) {
				4980	DPRINT(("resuming monitoring\n"));
				4981	if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current);
				4982	} else {
				4983	DPRINT(("stopping monitoring\n"));
				4984	//pfm_stop_monitoring(current, regs);
				4985	}
				4986	ctx->ctx_state = PFM_CTX_LOADED;
				4987	}
				4988	}
				4989
				4990	/*
				4991	* context MUST BE LOCKED when calling
				4992	* can only be called for current
				4993	*/
				4994	static void
				4995	pfm_context_force_terminate(pfm_context_t ctx, struct pt_regs regs)
				4996	{
				4997	int ret;
				4998
				4999	DPRINT(("entering for [%d]\n", current->pid));
				5000
				5001	ret = pfm_context_unload(ctx, NULL, 0, regs);
				5002	if (ret) {
				5003	printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret);
				5004	}
				5005
				5006	/*
				5007	* and wakeup controlling task, indicating we are now disconnected
				5008	*/
				5009	wake_up_interruptible(&ctx->ctx_zombieq);
				5010
				5011	/*
				5012	* given that context is still locked, the controlling
				5013	* task will only get access when we return from
				5014	* pfm_handle_work().
				5015	*/
				5016	}
				5017
				5018	static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5019	/*
				5020	* pfm_handle_work() can be called with interrupts enabled
				5021	* (TIF_NEED_RESCHED) or disabled. The down_interruptible
				5022	* call may sleep, therefore we must re-enable interrupts
				5023	* to avoid deadlocks. It is safe to do so because this function
				5024	* is called ONLY when returning to user level (PUStk=1), in which case
				5025	* there is no risk of kernel stack overflow due to deep
				5026	* interrupt nesting.
				5027	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5028	void
				5029	pfm_handle_work(void)
				5030	{
				5031	pfm_context_t *ctx;
				5032	struct pt_regs *regs;
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5033	unsigned long flags, dummy_flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5034	unsigned long ovfl_regs;
				5035	unsigned int reason;
				5036	int ret;
				5037
				5038	ctx = PFM_GET_CTX(current);
				5039	if (ctx == NULL) {
				5040	printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
				5041	return;
				5042	}
				5043
				5044	PROTECT_CTX(ctx, flags);
				5045
				5046	PFM_SET_WORK_PENDING(current, 0);
				5047
				5048	pfm_clear_task_notify();
				5049
				5050	regs = ia64_task_regs(current);
				5051
				5052	/*
				5053	* extract reason for being here and clear
				5054	*/
				5055	reason = ctx->ctx_fl_trap_reason;
				5056	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
				5057	ovfl_regs = ctx->ctx_ovfl_regs[0];
				5058
				5059	DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state));
				5060
				5061	/*
				5062	* must be done before we check for simple-reset mode
				5063	*/
				5064	if (ctx->ctx_fl_going_zombie \|\| ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie;
				5065
				5066
				5067	//if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
				5068	if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
				5069
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5070	/*
				5071	* restore interrupt mask to what it was on entry.
				5072	* Could be enabled/diasbled.
				5073	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5074	UNPROTECT_CTX(ctx, flags);
				5075
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5076	/*
				5077	* force interrupt enable because of down_interruptible()
				5078	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5079	local_irq_enable();
				5080
				5081	DPRINT(("before block sleeping\n"));
				5082
				5083	/*
				5084	* may go through without blocking on SMP systems
				5085	* if restart has been received already by the time we call down()
				5086	*/
				5087	ret = down_interruptible(&ctx->ctx_restart_sem);
				5088
				5089	DPRINT(("after block sleeping ret=%d\n", ret));
				5090
				5091	/*
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5092	* lock context and mask interrupts again
				5093	* We save flags into a dummy because we may have
				5094	* altered interrupts mask compared to entry in this
				5095	* function.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5096	*/
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5097	PROTECT_CTX(ctx, dummy_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5098
				5099	/*
				5100	* we need to read the ovfl_regs only after wake-up
				5101	* because we may have had pfm_write_pmds() in between
				5102	* and that can changed PMD values and therefore
				5103	* ovfl_regs is reset for these new PMD values.
				5104	*/
				5105	ovfl_regs = ctx->ctx_ovfl_regs[0];
				5106
				5107	if (ctx->ctx_fl_going_zombie) {
				5108	do_zombie:
				5109	DPRINT(("context is zombie, bailing out\n"));
				5110	pfm_context_force_terminate(ctx, regs);
				5111	goto nothing_to_do;
				5112	}
				5113	/*
				5114	* in case of interruption of down() we don't restart anything
				5115	*/
				5116	if (ret < 0) goto nothing_to_do;
				5117
				5118	skip_blocking:
				5119	pfm_resume_after_ovfl(ctx, ovfl_regs, regs);
				5120	ctx->ctx_ovfl_regs[0] = 0UL;
				5121
				5122	nothing_to_do:
Stephane Eranian	4944930	2005-04-25 13:08:30 -0700	[diff] [blame]	5123	/*
				5124	* restore flags as they were upon entry
				5125	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5126	UNPROTECT_CTX(ctx, flags);
				5127	}
				5128
				5129	static int
				5130	pfm_notify_user(pfm_context_t ctx, pfm_msg_t msg)
				5131	{
				5132	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
				5133	DPRINT(("ignoring overflow notification, owner is zombie\n"));
				5134	return 0;
				5135	}
				5136
				5137	DPRINT(("waking up somebody\n"));
				5138
				5139	if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait);
				5140
				5141	/*
				5142	* safe, we are not in intr handler, nor in ctxsw when
				5143	* we come here
				5144	*/
				5145	kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN);
				5146
				5147	return 0;
				5148	}
				5149
				5150	static int
				5151	pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds)
				5152	{
				5153	pfm_msg_t *msg = NULL;
				5154
				5155	if (ctx->ctx_fl_no_msg == 0) {
				5156	msg = pfm_get_new_msg(ctx);
				5157	if (msg == NULL) {
				5158	printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n");
				5159	return -1;
				5160	}
				5161
				5162	msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL;
				5163	msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd;
				5164	msg->pfm_ovfl_msg.msg_active_set = 0;
				5165	msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds;
				5166	msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL;
				5167	msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL;
				5168	msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL;
				5169	msg->pfm_ovfl_msg.msg_tstamp = 0UL;
				5170	}
				5171
				5172	DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n",
				5173	msg,
				5174	ctx->ctx_fl_no_msg,
				5175	ctx->ctx_fd,
				5176	ovfl_pmds));
				5177
				5178	return pfm_notify_user(ctx, msg);
				5179	}
				5180
				5181	static int
				5182	pfm_end_notify_user(pfm_context_t *ctx)
				5183	{
				5184	pfm_msg_t *msg;
				5185
				5186	msg = pfm_get_new_msg(ctx);
				5187	if (msg == NULL) {
				5188	printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n");
				5189	return -1;
				5190	}
				5191	/* no leak */
				5192	memset(msg, 0, sizeof(*msg));
				5193
				5194	msg->pfm_end_msg.msg_type = PFM_MSG_END;
				5195	msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd;
				5196	msg->pfm_ovfl_msg.msg_tstamp = 0UL;
				5197
				5198	DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n",
				5199	msg,
				5200	ctx->ctx_fl_no_msg,
				5201	ctx->ctx_fd));
				5202
				5203	return pfm_notify_user(ctx, msg);
				5204	}
				5205
				5206	/*
				5207	* main overflow processing routine.
				5208	* it can be called from the interrupt path or explicitely during the context switch code
				5209	*/
				5210	static void
				5211	pfm_overflow_handler(struct task_struct task, pfm_context_t ctx, u64 pmc0, struct pt_regs *regs)
				5212	{
				5213	pfm_ovfl_arg_t *ovfl_arg;
				5214	unsigned long mask;
				5215	unsigned long old_val, ovfl_val, new_val;
				5216	unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds;
				5217	unsigned long tstamp;
				5218	pfm_ovfl_ctrl_t ovfl_ctrl;
				5219	unsigned int i, has_smpl;
				5220	int must_notify = 0;
				5221
				5222	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring;
				5223
				5224	/*
				5225	* sanity test. Should never happen
				5226	*/
				5227	if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check;
				5228
				5229	tstamp = ia64_get_itc();
				5230	mask = pmc0 >> PMU_FIRST_COUNTER;
				5231	ovfl_val = pmu_conf->ovfl_val;
				5232	has_smpl = CTX_HAS_SMPL(ctx);
				5233
				5234	DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s "
				5235	"used_pmds=0x%lx\n",
				5236	pmc0,
				5237	task ? task->pid: -1,
				5238	(regs ? regs->cr_iip : 0),
				5239	CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
				5240	ctx->ctx_used_pmds[0]));
				5241
				5242
				5243	/*
				5244	* first we update the virtual counters
				5245	* assume there was a prior ia64_srlz_d() issued
				5246	*/
				5247	for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
				5248
				5249	/* skip pmd which did not overflow */
				5250	if ((mask & 0x1) == 0) continue;
				5251
				5252	/*
				5253	* Note that the pmd is not necessarily 0 at this point as qualified events
				5254	* may have happened before the PMU was frozen. The residual count is not
				5255	* taken into consideration here but will be with any read of the pmd via
				5256	* pfm_read_pmds().
				5257	*/
				5258	old_val = new_val = ctx->ctx_pmds[i].val;
				5259	new_val += 1 + ovfl_val;
				5260	ctx->ctx_pmds[i].val = new_val;
				5261
				5262	/*
				5263	* check for overflow condition
				5264	*/
				5265	if (likely(old_val > new_val)) {
				5266	ovfl_pmds \|= 1UL << i;
				5267	if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify \|= 1UL << i;
				5268	}
				5269
				5270	DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
				5271	i,
				5272	new_val,
				5273	old_val,
				5274	ia64_get_pmd(i) & ovfl_val,
				5275	ovfl_pmds,
				5276	ovfl_notify));
				5277	}
				5278
				5279	/*
				5280	* there was no 64-bit overflow, nothing else to do
				5281	*/
				5282	if (ovfl_pmds == 0UL) return;
				5283
				5284	/*
				5285	* reset all control bits
				5286	*/
				5287	ovfl_ctrl.val = 0;
				5288	reset_pmds = 0UL;
				5289
				5290	/*
				5291	* if a sampling format module exists, then we "cache" the overflow by
				5292	* calling the module's handler() routine.
				5293	*/
				5294	if (has_smpl) {
				5295	unsigned long start_cycles, end_cycles;
				5296	unsigned long pmd_mask;
				5297	int j, k, ret = 0;
				5298	int this_cpu = smp_processor_id();
				5299
				5300	pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
				5301	ovfl_arg = &ctx->ctx_ovfl_arg;
				5302
				5303	prefetch(ctx->ctx_smpl_hdr);
				5304
				5305	for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) {
				5306
				5307	mask = 1UL << i;
				5308
				5309	if ((pmd_mask & 0x1) == 0) continue;
				5310
				5311	ovfl_arg->ovfl_pmd = (unsigned char )i;
				5312	ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0;
				5313	ovfl_arg->active_set = 0;
				5314	ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */
				5315	ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
				5316
				5317	ovfl_arg->pmd_value = ctx->ctx_pmds[i].val;
				5318	ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval;
				5319	ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid;
				5320
				5321	/*
				5322	* copy values of pmds of interest. Sampling format may copy them
				5323	* into sampling buffer.
				5324	*/
				5325	if (smpl_pmds) {
				5326	for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) {
				5327	if ((smpl_pmds & 0x1) == 0) continue;
				5328	ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
				5329	DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1]));
				5330	}
				5331	}
				5332
				5333	pfm_stats[this_cpu].pfm_smpl_handler_calls++;
				5334
				5335	start_cycles = ia64_get_itc();
				5336
				5337	/*
				5338	* call custom buffer format record (handler) routine
				5339	*/
				5340	ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp);
				5341
				5342	end_cycles = ia64_get_itc();
				5343
				5344	/*
				5345	* For those controls, we take the union because they have
				5346	* an all or nothing behavior.
				5347	*/
				5348	ovfl_ctrl.bits.notify_user \|= ovfl_arg->ovfl_ctrl.bits.notify_user;
				5349	ovfl_ctrl.bits.block_task \|= ovfl_arg->ovfl_ctrl.bits.block_task;
				5350	ovfl_ctrl.bits.mask_monitoring \|= ovfl_arg->ovfl_ctrl.bits.mask_monitoring;
				5351	/*
				5352	* build the bitmask of pmds to reset now
				5353	*/
				5354	if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds \|= mask;
				5355
				5356	pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles;
				5357	}
				5358	/*
				5359	* when the module cannot handle the rest of the overflows, we abort right here
				5360	*/
				5361	if (ret && pmd_mask) {
				5362	DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n",
				5363	pmd_mask<<PMU_FIRST_COUNTER));
				5364	}
				5365	/*
				5366	* remove the pmds we reset now from the set of pmds to reset in pfm_restart()
				5367	*/
				5368	ovfl_pmds &= ~reset_pmds;
				5369	} else {
				5370	/*
				5371	* when no sampling module is used, then the default
				5372	* is to notify on overflow if requested by user
				5373	*/
				5374	ovfl_ctrl.bits.notify_user = ovfl_notify ? 1 : 0;
				5375	ovfl_ctrl.bits.block_task = ovfl_notify ? 1 : 0;
				5376	ovfl_ctrl.bits.mask_monitoring = ovfl_notify ? 1 : 0; /* XXX: change for saturation */
				5377	ovfl_ctrl.bits.reset_ovfl_pmds = ovfl_notify ? 0 : 1;
				5378	/*
				5379	* if needed, we reset all overflowed pmds
				5380	*/
				5381	if (ovfl_notify == 0) reset_pmds = ovfl_pmds;
				5382	}
				5383
				5384	DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds));
				5385
				5386	/*
				5387	* reset the requested PMD registers using the short reset values
				5388	*/
				5389	if (reset_pmds) {
				5390	unsigned long bm = reset_pmds;
				5391	pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET);
				5392	}
				5393
				5394	if (ovfl_notify && ovfl_ctrl.bits.notify_user) {
				5395	/*
				5396	* keep track of what to reset when unblocking
				5397	*/
				5398	ctx->ctx_ovfl_regs[0] = ovfl_pmds;
				5399
				5400	/*
				5401	* check for blocking context
				5402	*/
				5403	if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) {
				5404
				5405	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK;
				5406
				5407	/*
				5408	* set the perfmon specific checking pending work for the task
				5409	*/
				5410	PFM_SET_WORK_PENDING(task, 1);
				5411
				5412	/*
				5413	* when coming from ctxsw, current still points to the
				5414	* previous task, therefore we must work with task and not current.
				5415	*/
				5416	pfm_set_task_notify(task);
				5417	}
				5418	/*
				5419	* defer until state is changed (shorten spin window). the context is locked
				5420	* anyway, so the signal receiver would come spin for nothing.
				5421	*/
				5422	must_notify = 1;
				5423	}
				5424
				5425	DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n",
				5426	GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1,
				5427	PFM_GET_WORK_PENDING(task),
				5428	ctx->ctx_fl_trap_reason,
				5429	ovfl_pmds,
				5430	ovfl_notify,
				5431	ovfl_ctrl.bits.mask_monitoring ? 1 : 0));
				5432	/*
				5433	* in case monitoring must be stopped, we toggle the psr bits
				5434	*/
				5435	if (ovfl_ctrl.bits.mask_monitoring) {
				5436	pfm_mask_monitoring(task);
				5437	ctx->ctx_state = PFM_CTX_MASKED;
				5438	ctx->ctx_fl_can_restart = 1;
				5439	}
				5440
				5441	/*
				5442	* send notification now
				5443	*/
				5444	if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify);
				5445
				5446	return;
				5447
				5448	sanity_check:
				5449	printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n",
				5450	smp_processor_id(),
				5451	task ? task->pid : -1,
				5452	pmc0);
				5453	return;
				5454
				5455	stop_monitoring:
				5456	/*
				5457	* in SMP, zombie context is never restored but reclaimed in pfm_load_regs().
				5458	* Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can
				5459	* come here as zombie only if the task is the current task. In which case, we
				5460	* can access the PMU hardware directly.
				5461	*
				5462	* Note that zombies do have PM_VALID set. So here we do the minimal.
				5463	*
				5464	* In case the context was zombified it could not be reclaimed at the time
				5465	* the monitoring program exited. At this point, the PMU reservation has been
				5466	* returned, the sampiing buffer has been freed. We must convert this call
				5467	* into a spurious interrupt. However, we must also avoid infinite overflows
				5468	* by stopping monitoring for this task. We can only come here for a per-task
				5469	* context. All we need to do is to stop monitoring using the psr bits which
				5470	* are always task private. By re-enabling secure montioring, we ensure that
				5471	* the monitored task will not be able to re-activate monitoring.
				5472	* The task will eventually be context switched out, at which point the context
				5473	* will be reclaimed (that includes releasing ownership of the PMU).
				5474	*
				5475	* So there might be a window of time where the number of per-task session is zero
				5476	* yet one PMU might have a owner and get at most one overflow interrupt for a zombie
				5477	* context. This is safe because if a per-task session comes in, it will push this one
				5478	* out and by the virtue on pfm_save_regs(), this one will disappear. If a system wide
				5479	* session is force on that CPU, given that we use task pinning, pfm_save_regs() will
				5480	* also push our zombie context out.
				5481	*
				5482	* Overall pretty hairy stuff....
				5483	*/
				5484	DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1));
				5485	pfm_clear_psr_up();
				5486	ia64_psr(regs)->up = 0;
				5487	ia64_psr(regs)->sp = 1;
				5488	return;
				5489	}
				5490
				5491	static int
				5492	pfm_do_interrupt_handler(int irq, void arg, struct pt_regs regs)
				5493	{
				5494	struct task_struct *task;
				5495	pfm_context_t *ctx;
				5496	unsigned long flags;
				5497	u64 pmc0;
				5498	int this_cpu = smp_processor_id();
				5499	int retval = 0;
				5500
				5501	pfm_stats[this_cpu].pfm_ovfl_intr_count++;
				5502
				5503	/*
				5504	* srlz.d done before arriving here
				5505	*/
				5506	pmc0 = ia64_get_pmc(0);
				5507
				5508	task = GET_PMU_OWNER();
				5509	ctx = GET_PMU_CTX();
				5510
				5511	/*
				5512	* if we have some pending bits set
				5513	* assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1
				5514	*/
				5515	if (PMC0_HAS_OVFL(pmc0) && task) {
				5516	/*
				5517	* we assume that pmc0.fr is always set here
				5518	*/
				5519
				5520	/* sanity check */
				5521	if (!ctx) goto report_spurious1;
				5522
				5523	if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0)
				5524	goto report_spurious2;
				5525
				5526	PROTECT_CTX_NOPRINT(ctx, flags);
				5527
				5528	pfm_overflow_handler(task, ctx, pmc0, regs);
				5529
				5530	UNPROTECT_CTX_NOPRINT(ctx, flags);
				5531
				5532	} else {
				5533	pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++;
				5534	retval = -1;
				5535	}
				5536	/*
				5537	* keep it unfrozen at all times
				5538	*/
				5539	pfm_unfreeze_pmu();
				5540
				5541	return retval;
				5542
				5543	report_spurious1:
				5544	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n",
				5545	this_cpu, task->pid);
				5546	pfm_unfreeze_pmu();
				5547	return -1;
				5548	report_spurious2:
				5549	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n",
				5550	this_cpu,
				5551	task->pid);
				5552	pfm_unfreeze_pmu();
				5553	return -1;
				5554	}
				5555
				5556	static irqreturn_t
				5557	pfm_interrupt_handler(int irq, void arg, struct pt_regs regs)
				5558	{
				5559	unsigned long start_cycles, total_cycles;
				5560	unsigned long min, max;
				5561	int this_cpu;
				5562	int ret;
				5563
				5564	this_cpu = get_cpu();
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5565	if (likely(!pfm_alt_intr_handler)) {
				5566	min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
				5567	max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5568
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5569	start_cycles = ia64_get_itc();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5570
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5571	ret = pfm_do_interrupt_handler(irq, arg, regs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5572
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5573	total_cycles = ia64_get_itc();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5574
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5575	/*
				5576	* don't measure spurious interrupts
				5577	*/
				5578	if (likely(ret == 0)) {
				5579	total_cycles -= start_cycles;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5580
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5581	if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
				5582	if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5583
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5584	pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles;
				5585	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5586	}
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	5587	else {
				5588	(*pfm_alt_intr_handler->handler)(irq, arg, regs);
				5589	}
				5590
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5591	put_cpu_no_resched();
				5592	return IRQ_HANDLED;
				5593	}
				5594
				5595	/*
				5596	* /proc/perfmon interface, for debug only
				5597	*/
				5598
				5599	#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1)
				5600
				5601	static void *
				5602	pfm_proc_start(struct seq_file m, loff_t pos)
				5603	{
				5604	if (*pos == 0) {
				5605	return PFM_PROC_SHOW_HEADER;
				5606	}
				5607
				5608	while (*pos <= NR_CPUS) {
				5609	if (cpu_online(*pos - 1)) {
				5610	return (void )pos;
				5611	}
				5612	++*pos;
				5613	}
				5614	return NULL;
				5615	}
				5616
				5617	static void *
				5618	pfm_proc_next(struct seq_file m, void v, loff_t *pos)
				5619	{
				5620	++*pos;
				5621	return pfm_proc_start(m, pos);
				5622	}
				5623
				5624	static void
				5625	pfm_proc_stop(struct seq_file m, void v)
				5626	{
				5627	}
				5628
				5629	static void
				5630	pfm_proc_show_header(struct seq_file *m)
				5631	{
				5632	struct list_head * pos;
				5633	pfm_buffer_fmt_t * entry;
				5634	unsigned long flags;
				5635
				5636	seq_printf(m,
				5637	"perfmon version : %u.%u\n"
				5638	"model : %s\n"
				5639	"fastctxsw : %s\n"
				5640	"expert mode : %s\n"
				5641	"ovfl_mask : 0x%lx\n"
				5642	"PMU flags : 0x%x\n",
				5643	PFM_VERSION_MAJ, PFM_VERSION_MIN,
				5644	pmu_conf->pmu_name,
				5645	pfm_sysctl.fastctxsw > 0 ? "Yes": "No",
				5646	pfm_sysctl.expert_mode > 0 ? "Yes": "No",
				5647	pmu_conf->ovfl_val,
				5648	pmu_conf->flags);
				5649
				5650	LOCK_PFS(flags);
				5651
				5652	seq_printf(m,
				5653	"proc_sessions : %u\n"
				5654	"sys_sessions : %u\n"
				5655	"sys_use_dbregs : %u\n"
				5656	"ptrace_use_dbregs : %u\n",
				5657	pfm_sessions.pfs_task_sessions,
				5658	pfm_sessions.pfs_sys_sessions,
				5659	pfm_sessions.pfs_sys_use_dbregs,
				5660	pfm_sessions.pfs_ptrace_use_dbregs);
				5661
				5662	UNLOCK_PFS(flags);
				5663
				5664	spin_lock(&pfm_buffer_fmt_lock);
				5665
				5666	list_for_each(pos, &pfm_buffer_fmt_list) {
				5667	entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
				5668	seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
				5669	entry->fmt_uuid[0],
				5670	entry->fmt_uuid[1],
				5671	entry->fmt_uuid[2],
				5672	entry->fmt_uuid[3],
				5673	entry->fmt_uuid[4],
				5674	entry->fmt_uuid[5],
				5675	entry->fmt_uuid[6],
				5676	entry->fmt_uuid[7],
				5677	entry->fmt_uuid[8],
				5678	entry->fmt_uuid[9],
				5679	entry->fmt_uuid[10],
				5680	entry->fmt_uuid[11],
				5681	entry->fmt_uuid[12],
				5682	entry->fmt_uuid[13],
				5683	entry->fmt_uuid[14],
				5684	entry->fmt_uuid[15],
				5685	entry->fmt_name);
				5686	}
				5687	spin_unlock(&pfm_buffer_fmt_lock);
				5688
				5689	}
				5690
				/*
				 * seq_file show callback for the perfmon proc entry.
				 *
				 * v is an iterator cookie rather than a real object pointer:
				 * PFM_PROC_SHOW_HEADER requests the global header, any other value is
				 * (CPU number + 1) and selects the per-CPU statistics of that CPU.
				 *
				 * Always returns 0.
				 */
				5691	static int
				5692	pfm_proc_show(struct seq_file *m, void *v)
				5693	{
				5694	unsigned long psr;
				5695	unsigned int i;
				5696	int cpu;
				5697
				5698	if (v == PFM_PROC_SHOW_HEADER) {
				5699	pfm_proc_show_header(m);
				5700	return 0;
				5701	}
				5702
				5703	/* show info for CPU (v - 1) */
				5704
				5705	cpu = (long)v - 1;
				5706	seq_printf(m,
				5707	"CPU%-2d overflow intrs : %lu\n"
				5708	"CPU%-2d overflow cycles : %lu\n"
				5709	"CPU%-2d overflow min : %lu\n"
				5710	"CPU%-2d overflow max : %lu\n"
				5711	"CPU%-2d smpl handler calls : %lu\n"
				5712	"CPU%-2d smpl handler cycles : %lu\n"
				5713	"CPU%-2d spurious intrs : %lu\n"
				5714	"CPU%-2d replay intrs : %lu\n"
				5715	"CPU%-2d syst_wide : %d\n"
				5716	"CPU%-2d dcr_pp : %d\n"
				5717	"CPU%-2d exclude idle : %d\n"
				5718	"CPU%-2d owner : %d\n"
				5719	"CPU%-2d context : %p\n"
				5720	"CPU%-2d activations : %lu\n",
				5721	cpu, pfm_stats[cpu].pfm_ovfl_intr_count,
				5722	cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles,
				5723	cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min,
				5724	cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max,
				5725	cpu, pfm_stats[cpu].pfm_smpl_handler_calls,
				5726	cpu, pfm_stats[cpu].pfm_smpl_handler_cycles,
				5727	cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count,
				5728	cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count,
				5729	cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0,
				5730	cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0,
				5731	cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0,
				5732	cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1,
				5733	cpu, pfm_get_cpu_data(pmu_ctx, cpu),
				5734	cpu, pfm_get_cpu_data(pmu_activation_number, cpu));
				5735
				/*
				 * raw register dump: only when a single CPU is online and
				 * perfmon debugging is enabled
				 */
				5736	if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) {
				5737
				5738	psr = pfm_get_psr();
				5739
				5740	ia64_srlz_d();
				5741
				5742	seq_printf(m,
				5743	"CPU%-2d psr : 0x%lx\n"
				5744	"CPU%-2d pmc0 : 0x%lx\n",
				5745	cpu, psr,
				5746	cpu, ia64_get_pmc(0));
				5747
				/* only PMC/PMD pairs flagged as counting are printed */
				5748	for (i=0; PMC_IS_LAST(i) == 0; i++) {
				5749	if (PMC_IS_COUNTING(i) == 0) continue;
				5750	seq_printf(m,
				5751	"CPU%-2d pmc%u : 0x%lx\n"
				5752	"CPU%-2d pmd%u : 0x%lx\n",
				5753	cpu, i, ia64_get_pmc(i),
				5754	cpu, i, ia64_get_pmd(i));
				5755	}
				5756	}
				5757	return 0;
				5758	}
				5759
				/* seq_file iterator callbacks handed to seq_open() by pfm_proc_open() */
				5760	struct seq_operations pfm_seq_ops = {
				5761	.start = pfm_proc_start,
				5762	.next = pfm_proc_next,
				5763	.stop = pfm_proc_stop,
				5764	.show = pfm_proc_show
				5765	};
				5766
				/*
				 * open handler of the perfmon proc entry: attaches the pfm_seq_ops
				 * iterator to file and returns the result of seq_open().
				 * inode is not used.
				 */
				5767	static int
				5768	pfm_proc_open(struct inode *inode, struct file *file)
				5769	{
				5770	return seq_open(file, &pfm_seq_ops);
				5771	}
				5772
				5773
				5774	/*
				5775	* we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens
				5776	* during pfm_enable() hence before pfm_start(). We cannot assume monitoring
				5777	* is active or inactive based on mode. We must rely on the value in
				5778	* local_cpu_data->pfm_syst_info
				*
				* task       : task being switched in or out
				* info       : PFM_CPUINFO_* flags; only PFM_CPUINFO_DCR_PP and
				*              PFM_CPUINFO_EXCL_IDLE are examined here
				* is_ctxswin : non-zero when task is switched in, zero when switched out
				*
				* For any task other than an excluded idle task, only the psr.pp bit
				* in the task's saved registers is updated. For the idle task with
				* PFM_CPUINFO_EXCL_IDLE set, dcr.pp and psr.pp of the CPU are cleared
				* on switch in and set again on switch out, but only while
				* PFM_CPUINFO_DCR_PP is set.
				5779	*/
				5780	void
				5781	pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
				5782	{
				5783	struct pt_regs *regs;
				5784	unsigned long dcr;
				5785	unsigned long dcr_pp;
				5786
				5787	dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
				5788
				5789	/*
				5790	* pid 0 is guaranteed to be the idle task. There is one such task with pid 0
				5791	* on every CPU, so we can rely on the pid to identify the idle task.
				5792	*/
				5793	if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
				5794	regs = ia64_task_regs(task);
				5795	ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
				5796	return;
				5797	}
				5798	/*
				5799	* if monitoring has started
				5800	*/
				5801	if (dcr_pp) {
				5802	dcr = ia64_getreg(_IA64_REG_CR_DCR);
				5803	/*
				5804	* context switching in?
				5805	*/
				5806	if (is_ctxswin) {
				5807	/* mask monitoring for the idle task */
				5808	ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
				5809	pfm_clear_psr_pp();
				5810	ia64_srlz_i();
				5811	return;
				5812	}
				5813	/*
				5814	* context switching out
				5815	* restore monitoring for next task
				5816	*
				5817	* Due to inlining this odd if-then-else construction generates
				5818	* better code.
				5819	*/
				5820	ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP);
				5821	pfm_set_psr_pp();
				5822	ia64_srlz_i();
				5823	}
				5824	}
				5825
				5826	#ifdef CONFIG_SMP
				5827
				/*
				 * Forcibly detach ctx from its task (ctx->ctx_task).
				 *
				 * Clears psr.up and sets psr.sp in regs, drops PMU ownership if the
				 * task currently owns the PMU, clears the pending-work flag and cuts
				 * the task -> context link. The context itself is not freed here; the
				 * callers visible in this file (the PFM_CTX_ZOMBIE paths of
				 * pfm_save_regs() and pfm_load_regs()) call pfm_context_free()
				 * afterwards.
				 */
				5828	static void
				5829	pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
				5830	{
				5831	struct task_struct *task = ctx->ctx_task;
				5832
				5833	ia64_psr(regs)->up = 0;
				5834	ia64_psr(regs)->sp = 1;
				5835
				5836	if (GET_PMU_OWNER() == task) {
				5837	DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
				5838	SET_PMU_OWNER(NULL, NULL);
				5839	}
				5840
				5841	/*
				5842	* disconnect the task from the context and vice-versa
				5843	*/
				5844	PFM_SET_WORK_PENDING(task, 0);
				5845
				5846	task->thread.pfm_context = NULL;
				5847	task->thread.flags &= ~IA64_THREAD_PM_VALID;
				5848
				5849	DPRINT(("force cleanup for [%d]\n", task->pid));
				5850	}
				5851
				5852
				5853	/*
				5854	* in 2.6, interrupts are masked when we come here and the runqueue lock is held
				*
				* SMP variant of the save path: stops user-level monitoring (psr.up),
				* records psr.up in the context, releases PMU ownership and saves the
				* used PMDs and pmc0 into task->thread.
				*
				* Returns immediately if task has no perfmon context. If the context is
				* in PFM_CTX_ZOMBIE state it is detached from the task and freed
				* instead of being saved.
				5855	*/
				5856	void
				5857	pfm_save_regs(struct task_struct *task)
				5858	{
				5859	pfm_context_t *ctx;
				5860	struct thread_struct *t;
				5861	unsigned long flags;
				5862	u64 psr;
				5863
				5864
				5865	ctx = PFM_GET_CTX(task);
				5866	if (ctx == NULL) return;
				5867	t = &task->thread;
				5868
				5869	/*
				5870	* we always come here with interrupts ALREADY disabled by
				5871	* the scheduler. So we simply need to protect against concurrent
				5872	* access, not CPU concurrency.
				5873	*/
				5874	flags = pfm_protect_ctx_ctxsw(ctx);
				5875
				/*
				 * zombie context: tear it down here rather than saving state.
				 * The context lock is released before the context is freed.
				 */
				5876	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
				5877	struct pt_regs *regs = ia64_task_regs(task);
				5878
				5879	pfm_clear_psr_up();
				5880
				5881	pfm_force_cleanup(ctx, regs);
				5882
				5883	BUG_ON(ctx->ctx_smpl_hdr);
				5884
				5885	pfm_unprotect_ctx_ctxsw(ctx, flags);
				5886
				5887	pfm_context_free(ctx);
				5888	return;
				5889	}
				5890
				5891	/*
				5892	* save current PSR: needed because we modify it
				5893	*/
				5894	ia64_srlz_d();
				5895	psr = pfm_get_psr();
				5896
				/* interrupts must be masked at this point (psr.i clear) */
				5897	BUG_ON(psr & (IA64_PSR_I));
				5898
				5899	/*
				5900	* stop monitoring:
				5901	* This is the last instruction which may generate an overflow
				5902	*
				5903	* We do not need to set psr.sp because, it is irrelevant in kernel.
				5904	* It will be restored from ipsr when going back to user level
				5905	*/
				5906	pfm_clear_psr_up();
				5907
				5908	/*
				5909	* keep a copy of psr.up (for reload)
				5910	*/
				5911	ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
				5912
				5913	/*
				5914	* release ownership of this PMU.
				5915	* PM interrupts are masked, so nothing
				5916	* can happen.
				5917	*/
				5918	SET_PMU_OWNER(NULL, NULL);
				5919
				5920	/*
				5921	* we systematically save the PMD as we have no
				5922	* guarantee we will be scheduled on that same
				5923	* CPU again.
				5924	*/
				5925	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
				5926
				5927	/*
				5928	* save pmc0 ia64_srlz_d() done in pfm_save_pmds()
				5929	* we will need it on the restore path to check
				5930	* for pending overflow.
				5931	*/
				5932	t->pmcs[0] = ia64_get_pmc(0);
				5933
				5934	/*
				5935	* unfreeze PMU if had pending overflows
				* (any bit other than bit 0 set in the saved pmc0)
				5936	*/
				5937	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
				5938
				5939	/*
				5940	* finally, allow context access.
				5941	* interrupts will still be masked after this call.
				5942	*/
				5943	pfm_unprotect_ctx_ctxsw(ctx, flags);
				5944	}
				5945
				5946	#else /* !CONFIG_SMP */
				/*
				 * Non-SMP variant of the save path: only stops user-level monitoring
				 * and remembers psr.up in the context. No PMD/PMC is saved here; in
				 * this configuration the register state is saved by
				 * pfm_lazy_save_regs() when pfm_load_regs() finds another owner.
				 *
				 * Returns immediately if task has no perfmon context.
				 */
				5947	void
				5948	pfm_save_regs(struct task_struct *task)
				5949	{
				5950	pfm_context_t *ctx;
				5951	u64 psr;
				5952
				5953	ctx = PFM_GET_CTX(task);
				5954	if (ctx == NULL) return;
				5955
				5956	/*
				5957	* save current PSR: needed because we modify it
				5958	*/
				5959	psr = pfm_get_psr();
				5960
				/* interrupts must be masked at this point (psr.i clear) */
				5961	BUG_ON(psr & (IA64_PSR_I));
				5962
				5963	/*
				5964	* stop monitoring:
				5965	* This is the last instruction which may generate an overflow
				5966	*
				5967	* We do not need to set psr.sp because, it is irrelevant in kernel.
				5968	* It will be restored from ipsr when going back to user level
				5969	*/
				5970	pfm_clear_psr_up();
				5971
				5972	/*
				5973	* keep a copy of psr.up (for reload)
				5974	*/
				5975	ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
				5976	}
				5977
				/*
				 * Non-SMP helper: push the PMU state of the previous owner (task) out
				 * to its thread structure. Called from pfm_load_regs() when another
				 * task still owns the PMU. Upon return the PMU has no owner.
				 *
				 * psr.up must already be cleared when this is called (BUG otherwise).
				 *
				 * NOTE(review): ctx is used without a NULL check; presumably a task
				 * recorded as PMU owner always has a context - confirm.
				 */
				5978	static void
				5979	pfm_lazy_save_regs (struct task_struct *task)
				5980	{
				5981	pfm_context_t *ctx;
				5982	struct thread_struct *t;
				5983	unsigned long flags;
				5984
				5985	{ u64 psr = pfm_get_psr();
				5986	BUG_ON(psr & IA64_PSR_UP);
				5987	}
				5988
				5989	ctx = PFM_GET_CTX(task);
				5990	t = &task->thread;
				5991
				5992	/*
				5993	* we need to mask PMU overflow here to
				5994	* make sure that we maintain pmc0 until
				5995	* we save it. overflow interrupts are
				5996	* treated as spurious if there is no
				5997	* owner.
				5998	*
				5999	* XXX: I don't think this is necessary
				6000	*/
				6001	PROTECT_CTX(ctx,flags);
				6002
				6003	/*
				6004	* release ownership of this PMU.
				6005	* must be done before we save the registers.
				6006	*
				6007	* after this call any PMU interrupt is treated
				6008	* as spurious.
				6009	*/
				6010	SET_PMU_OWNER(NULL, NULL);
				6011
				6012	/*
				6013	* save all the pmds we use
				6014	*/
				6015	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
				6016
				6017	/*
				6018	* save pmc0 ia64_srlz_d() done in pfm_save_pmds()
				6019	* it is needed to check for pended overflow
				6020	* on the restore path
				6021	*/
				6022	t->pmcs[0] = ia64_get_pmc(0);
				6023
				6024	/*
				6025	* unfreeze PMU if had pending overflows
				* (any bit other than bit 0 set in the saved pmc0)
				6026	*/
				6027	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
				6028
				6029	/*
				6030	* now we can unmask PMU interrupts, they will
				6031	* be treated as purely spurious and we will not
				6032	* lose any information
				6033	*/
				6034	UNPROTECT_CTX(ctx,flags);
				6035	}
				6036	#endif /* CONFIG_SMP */
				6037
				6038	#ifdef CONFIG_SMP
				6039	/*
				6040	* in 2.6, interrupts are masked when we come here and the runqueue lock is held
				*
				* SMP variant of the reload path: restores the PMU state of task from
				* task->thread and its context, replays a pending overflow recorded in
				* the saved pmc0, takes PMU ownership and finally restores psr.up.
				*
				* Returns without touching the PMU if task has no context or if
				* IA64_THREAD_PM_VALID is not set. A context in PFM_CTX_ZOMBIE state is
				* detached from the task and freed instead of being loaded.
				*
				* When this context was the last one loaded on this CPU (same CPU and
				* same activation number) only the registers flagged in
				* ctx_reload_pmcs/ctx_reload_pmds are reloaded.
				6041	*/
				6042	void
				6043	pfm_load_regs (struct task_struct *task)
				6044	{
				6045	pfm_context_t *ctx;
				6046	struct thread_struct *t;
				6047	unsigned long pmc_mask = 0UL, pmd_mask = 0UL;
				6048	unsigned long flags;
				6049	u64 psr, psr_up;
				6050	int need_irq_resend;
				6051
				6052	ctx = PFM_GET_CTX(task);
				6053	if (unlikely(ctx == NULL)) return;
				6054
				/* the PMU must not have an owner when we get here */
				6055	BUG_ON(GET_PMU_OWNER());
				6056
				6057	t = &task->thread;
				6058	/*
				6059	* possible on unload
				6060	*/
				6061	if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return;
				6062
				6063	/*
				6064	* we always come here with interrupts ALREADY disabled by
				6065	* the scheduler. So we simply need to protect against concurrent
				6066	* access, not CPU concurrency.
				6067	*/
				6068	flags = pfm_protect_ctx_ctxsw(ctx);
				6069	psr = pfm_get_psr();
				6070
				6071	need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
				6072
				/* monitoring must be stopped and interrupts masked on entry */
				6073	BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
				6074	BUG_ON(psr & IA64_PSR_I);
				6075
				6076	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) {
				6077	struct pt_regs *regs = ia64_task_regs(task);
				6078
				6079	BUG_ON(ctx->ctx_smpl_hdr);
				6080
				6081	pfm_force_cleanup(ctx, regs);
				6082
				6083	pfm_unprotect_ctx_ctxsw(ctx, flags);
				6084
				6085	/*
				6086	* this one (kmalloc'ed) is fine with interrupts disabled
				6087	*/
				6088	pfm_context_free(ctx);
				6089
				6090	return;
				6091	}
				6092
				6093	/*
				6094	* we restore ALL the debug registers to avoid picking up
				6095	* stale state.
				6096	*/
				6097	if (ctx->ctx_fl_using_dbreg) {
				6098	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				6099	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				6100	}
				6101	/*
				6102	* retrieve saved psr.up
				6103	*/
				6104	psr_up = ctx->ctx_saved_psr_up;
				6105
				6106	/*
				6107	* if we were the last user of the PMU on that CPU,
				6108	* then nothing to do except restore psr
				6109	*/
				6110	if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) {
				6111
				6112	/*
				6113	* retrieve partial reload masks (due to user modifications)
				6114	*/
				6115	pmc_mask = ctx->ctx_reload_pmcs[0];
				6116	pmd_mask = ctx->ctx_reload_pmds[0];
				6117
				6118	} else {
				6119	/*
				6120	* To avoid leaking information to the user level when psr.sp=0,
				6121	* we must reload ALL implemented pmds (even the ones we don't use).
				6122	* In the kernel we only allow PFM_READ_PMDS on registers which
				6123	* we initialized or requested (sampling) so there is no risk there.
				6124	*/
				6125	pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
				6126
				6127	/*
				6128	* ALL accessible PMCs are systematically reloaded, unused registers
				6129	* get their default (from pfm_reset_pmu_state()) values to avoid picking
				6130	* up stale configuration.
				6131	*
				6132	* PMC0 is never in the mask. It is always restored separately.
				6133	*/
				6134	pmc_mask = ctx->ctx_all_pmcs[0];
				6135	}
				6136	/*
				6137	* when context is MASKED, we will restore PMC with plm=0
				6138	* and PMD with stale information, but that's ok, nothing
				6139	* will be captured.
				6140	*
				6141	* XXX: optimize here
				6142	*/
				6143	if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask);
				6144	if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask);
				6145
				6146	/*
				6147	* check for pending overflow at the time the state
				6148	* was saved.
				6149	*/
				6150	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
				6151	/*
				6152	* reload pmc0 with the overflow information
				6153	* On McKinley PMU, this will trigger a PMU interrupt
				6154	*/
				6155	ia64_set_pmc(0, t->pmcs[0]);
				6156	ia64_srlz_d();
				6157	t->pmcs[0] = 0UL;
				6158
				6159	/*
				6160	* will replay the PMU interrupt
				6161	*/
				6162	if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
				6163
				6164	pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
				6165	}
				6166
				6167	/*
				6168	* we just did a reload, so we reset the partial reload fields
				6169	*/
				6170	ctx->ctx_reload_pmcs[0] = 0UL;
				6171	ctx->ctx_reload_pmds[0] = 0UL;
				6172
				6173	SET_LAST_CPU(ctx, smp_processor_id());
				6174
				6175	/*
				6176	* bump activation value for this PMU
				6177	*/
				6178	INC_ACTIVATION();
				6179	/*
				6180	* record current activation for this context
				6181	*/
				6182	SET_ACTIVATION(ctx);
				6183
				6184	/*
				6185	* establish new ownership.
				6186	*/
				6187	SET_PMU_OWNER(task, ctx);
				6188
				6189	/*
				6190	* restore the psr.up bit. measurement
				6191	* is active again.
				6192	* no PMU interrupt can happen at this point
				6193	* because we still have interrupts disabled.
				6194	*/
				6195	if (likely(psr_up)) pfm_set_psr_up();
				6196
				6197	/*
				6198	* allow concurrent access to context
				6199	*/
				6200	pfm_unprotect_ctx_ctxsw(ctx, flags);
				6201	}
				6202	#else /* !CONFIG_SMP */
				6203	/*
				6204	* reload PMU state for UP kernels
				6205	* in 2.5 we come here with interrupts disabled
				*
				* If task is still the PMU owner only psr.up is restored. Otherwise the
				* state of the current owner (if any) is pushed out with
				* pfm_lazy_save_regs(), the PMDs/PMCs of task are reloaded, a pending
				* overflow recorded in the saved pmc0 is replayed, and task becomes the
				* new owner.
				*
				* NOTE(review): unlike the SMP variant, ctx is dereferenced without a
				* NULL check; presumably the caller only invokes this for tasks that
				* have a perfmon context - confirm.
				6206	*/
				6207	void
				6208	pfm_load_regs (struct task_struct *task)
				6209	{
				6210	struct thread_struct *t;
				6211	pfm_context_t *ctx;
				6212	struct task_struct *owner;
				6213	unsigned long pmd_mask, pmc_mask;
				6214	u64 psr, psr_up;
				6215	int need_irq_resend;
				6216
				6217	owner = GET_PMU_OWNER();
				6218	ctx = PFM_GET_CTX(task);
				6219	t = &task->thread;
				6220	psr = pfm_get_psr();
				6221
				/* monitoring must be stopped and interrupts masked on entry */
				6222	BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
				6223	BUG_ON(psr & IA64_PSR_I);
				6224
				6225	/*
				6226	* we restore ALL the debug registers to avoid picking up
				6227	* stale state.
				6228	*
				6229	* This must be done even when the task is still the owner
				6230	* as the registers may have been modified via ptrace()
				6231	* (not perfmon) by the previous task.
				6232	*/
				6233	if (ctx->ctx_fl_using_dbreg) {
				6234	pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
				6235	pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
				6236	}
				6237
				6238	/*
				6239	* retrieve saved psr.up
				6240	*/
				6241	psr_up = ctx->ctx_saved_psr_up;
				6242	need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
				6243
				6244	/*
				6245	* short path, our state is still there, just
				6246	* need to restore psr and we go
				6247	*
				6248	* we do not touch either PMC nor PMD. the psr is not touched
				6249	* by the overflow_handler. So we are safe w.r.t. to interrupt
				6250	* concurrency even without interrupt masking.
				6251	*/
				6252	if (likely(owner == task)) {
				6253	if (likely(psr_up)) pfm_set_psr_up();
				6254	return;
				6255	}
				6256
				6257	/*
				6258	* someone else is still using the PMU, first push it out and
				6259	* then we'll be able to install our stuff !
				6260	*
				6261	* Upon return, there will be no owner for the current PMU
				6262	*/
				6263	if (owner) pfm_lazy_save_regs(owner);
				6264
				6265	/*
				6266	* To avoid leaking information to the user level when psr.sp=0,
				6267	* we must reload ALL implemented pmds (even the ones we don't use).
				6268	* In the kernel we only allow PFM_READ_PMDS on registers which
				6269	* we initialized or requested (sampling) so there is no risk there.
				6270	*/
				6271	pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
				6272
				6273	/*
				6274	* ALL accessible PMCs are systematically reloaded, unused registers
				6275	* get their default (from pfm_reset_pmu_state()) values to avoid picking
				6276	* up stale configuration.
				6277	*
				6278	* PMC0 is never in the mask. It is always restored separately
				6279	*/
				6280	pmc_mask = ctx->ctx_all_pmcs[0];
				6281
				6282	pfm_restore_pmds(t->pmds, pmd_mask);
				6283	pfm_restore_pmcs(t->pmcs, pmc_mask);
				6284
				6285	/*
				6286	* check for pending overflow at the time the state
				6287	* was saved.
				6288	*/
				6289	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
				6290	/*
				6291	* reload pmc0 with the overflow information
				6292	* On McKinley PMU, this will trigger a PMU interrupt
				6293	*/
				6294	ia64_set_pmc(0, t->pmcs[0]);
				6295	ia64_srlz_d();
				6296
				6297	t->pmcs[0] = 0UL;
				6298
				6299	/*
				6300	* will replay the PMU interrupt
				6301	*/
				6302	if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
				6303
				6304	pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
				6305	}
				6306
				6307	/*
				6308	* establish new ownership.
				6309	*/
				6310	SET_PMU_OWNER(task, ctx);
				6311
				6312	/*
				6313	* restore the psr.up bit. measurement
				6314	* is active again.
				6315	* no PMU interrupt can happen at this point
				6316	* because we still have interrupts disabled.
				6317	*/
				6318	if (likely(psr_up)) pfm_set_psr_up();
				6319	}
				6320	#endif /* CONFIG_SMP */
				6321
				6322	/*
				6323	* this function assumes monitoring is stopped
				*
				* Fold the final value of every used PMD into ctx->ctx_pmds[].val.
				*
				* task : task whose PMU state is flushed
				* ctx  : perfmon context the values are accumulated into
				*
				* Values are read from the PMU when it can be accessed from here (see
				* can_access_pmu below), otherwise from the copy saved in task->thread.
				* For counting PMDs the hardware bits are added to the software value
				* already in the context, plus one full counter period when the
				* matching overflow bit is set in pmc0.
				6324	*/
				6325	static void
				6326	pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
				6327	{
				6328	u64 pmc0;
				6329	unsigned long mask2, val, pmd_val, ovfl_val;
				6330	int i, can_access_pmu = 0;
				6331	int is_self;
				6332
				6333	/*
				6334	* is the caller the task being monitored (or which initiated the
				6335	* session for system wide measurements)
				6336	*/
				6337	is_self = ctx->ctx_task == task ? 1 : 0;
				6338
				6339	/*
				6340	* can access PMU if task is the owner of the PMU state on the current CPU
				6341	* or if we are running on the CPU bound to the context in system-wide mode
				6342	* (that is not necessarily the task the context is attached to in this mode).
				6343	* In system-wide we always have can_access_pmu true because a task running on an
				6344	* invalid processor is flagged earlier in the call stack (see pfm_stop).
				6345	*/
				6346	can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id());
				6347	if (can_access_pmu) {
				6348	/*
				6349	* Mark the PMU as not owned
				6350	* This will cause the interrupt handler to do nothing in case an overflow
				6351	* interrupt was in-flight
				6352	* This also guarantees that pmc0 will contain the final state
				6353	* It virtually gives us full control on overflow processing from that point
				6354	* on.
				6355	*/
				6356	SET_PMU_OWNER(NULL, NULL);
				6357	DPRINT(("releasing ownership\n"));
				6358
				6359	/*
				6360	* read current overflow status:
				6361	*
				6362	* we are guaranteed to read the final stable state
				6363	*/
				6364	ia64_srlz_d();
				6365	pmc0 = ia64_get_pmc(0); /* slow */
				6366
				6367	/*
				6368	* reset freeze bit, overflow status information destroyed
				6369	*/
				6370	pfm_unfreeze_pmu();
				6371	} else {
				6372	pmc0 = task->thread.pmcs[0];
				6373	/*
				6374	* clear whatever overflow status bits there were
				6375	*/
				6376	task->thread.pmcs[0] = 0;
				6377	}
				6378	ovfl_val = pmu_conf->ovfl_val;
				6379	/*
				6380	* we save all the used pmds
				6381	* we take care of overflows for counting PMDs
				6382	*
				6383	* XXX: sampling situation is not taken into account here
				6384	*/
				6385	mask2 = ctx->ctx_used_pmds[0];
				6386
				6387	DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2));
				6388
				/* walk the used-PMD bitmask, bit i corresponds to PMD i */
				6389	for (i = 0; mask2; i++, mask2>>=1) {
				6390
				6391	/* skip non used pmds */
				6392	if ((mask2 & 0x1) == 0) continue;
				6393
				6394	/*
				6395	* can access PMU always true in system wide mode
				6396	*/
				6397	val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i];
				6398
				6399	if (PMD_IS_COUNTING(i)) {
				6400	DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n",
				6401	task->pid,
				6402	i,
				6403	ctx->ctx_pmds[i].val,
				6404	val & ovfl_val));
				6405
				6406	/*
				6407	* we rebuild the full 64 bit value of the counter
				6408	*/
				6409	val = ctx->ctx_pmds[i].val + (val & ovfl_val);
				6410
				6411	/*
				6412	* now everything is in ctx_pmds[] and we need
				6413	* to clear the saved context from save_regs() such that
				6414	* pfm_read_pmds() gets the correct value
				6415	*/
				6416	pmd_val = 0UL;
				6417
				6418	/*
				6419	* take care of overflow inline
				6420	*/
				6421	if (pmc0 & (1UL << i)) {
				6422	val += 1 + ovfl_val;
				6423	DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i));
				6424	}
				6425	}
				6426
				6427	DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
				6428
				/* the thread copy is only rewritten when task is the context's own task */
				6429	if (is_self) task->thread.pmds[i] = pmd_val;
				6430
				6431	ctx->ctx_pmds[i].val = val;
				6432	}
				6433	}
				6434
				/*
				 * interrupt descriptor for the PMU overflow interrupt; registered on
				 * IA64_PERFMON_VECTOR by pfm_init_percpu()
				 */
				6435	static struct irqaction perfmon_irqaction = {
				6436	.handler = pfm_interrupt_handler,
				6437	.flags = SA_INTERRUPT,
				6438	.name = "perfmon"
				6439	};
				6440
Tony Luck	a1ecf7f	2005-05-18 16:06:00 -0700	[diff] [blame]	6441	static void
				6442	pfm_alt_save_pmu_state(void *data)
				6443	{
				/*
				 * Per-CPU callback run through on_each_cpu() by
				 * pfm_install_alt_pmu_interrupt(): stops monitoring on the
				 * executing CPU and freezes the PMU before the alternate
				 * interrupt handler takes over. data is not used.
				 */
				6444	struct pt_regs *regs;
				6445
				6446	regs = ia64_task_regs(current);
				6447
				6448	DPRINT(("called\n"));
				6449
				6450	/*
				6451	* should not be necessary but
				6452	* let's take no risk
				6453	*/
				6454	pfm_clear_psr_up();
				6455	pfm_clear_psr_pp();
				6456	ia64_psr(regs)->pp = 0;
				6457
				6458	/*
				6459	* This call is required
				6460	* May cause a spurious interrupt on some processors
				6461	*/
				6462	pfm_freeze_pmu();
				6463
				6464	ia64_srlz_d();
				6465	}
				6466
				/*
				 * Per-CPU callback run through on_each_cpu() by
				 * pfm_remove_alt_pmu_interrupt(): clears the monitoring bits on the
				 * executing CPU and unfreezes the PMU, which is the state perfmon
				 * expects. data is not used.
				 */
				6467	void
				6468	pfm_alt_restore_pmu_state(void *data)
				6469	{
				6470	struct pt_regs *regs;
				6471
				6472	regs = ia64_task_regs(current);
				6473
				6474	DPRINT(("called\n"));
				6475
				6476	/*
				6477	* put PMU back in state expected
				6478	* by perfmon
				6479	*/
				6480	pfm_clear_psr_up();
				6481	pfm_clear_psr_pp();
				6482	ia64_psr(regs)->pp = 0;
				6483
				6484	/*
				6485	* perfmon runs with PMU unfrozen at all times
				6486	*/
				6487	pfm_unfreeze_pmu();
				6488
				6489	ia64_srlz_d();
				6490	}
				6491
				/*
				 * Install hdl as the alternate PMU interrupt handler.
				 *
				 * A session is reserved on every online CPU, the PMU of every CPU is
				 * stopped and frozen through pfm_alt_save_pmu_state(), and only then
				 * is hdl published in pfm_alt_intr_handler.
				 *
				 * Returns 0 on success, -EINVAL if hdl or hdl->handler is NULL, -EBUSY
				 * if a handler is already installed or another install/remove is in
				 * progress, or the error reported by pfm_reserve_session() or
				 * on_each_cpu(). On failure the sessions reserved so far are released.
				 */
				6492	int
				6493	pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
				6494	{
				6495	int ret, i;
				6496	int reserve_cpu;
				6497
				6498	/* some sanity checks */
				6499	if (hdl == NULL || hdl->handler == NULL) return -EINVAL;
				6500
				6501	/* do the easy test first */
				6502	if (pfm_alt_intr_handler) return -EBUSY;
				6503
				6504	/* one at a time in the install or remove, just fail the others */
				6505	if (!spin_trylock(&pfm_alt_install_check)) {
				6506	return -EBUSY;
				6507	}
				6508
				6509	/* reserve our session */
				6510	for_each_online_cpu(reserve_cpu) {
				6511	ret = pfm_reserve_session(NULL, 1, reserve_cpu);
				6512	if (ret) goto cleanup_reserve;
				6513	}
				6514
				6515	/* save the current system wide pmu states */
				6516	ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1);
				6517	if (ret) {
				6518	DPRINT(("on_each_cpu() failed: %d\n", ret));
				6519	goto cleanup_reserve;
				6520	}
				6521
				6522	/* officially change to the alternate interrupt handler */
				6523	pfm_alt_intr_handler = hdl;
				6524
				6525	spin_unlock(&pfm_alt_install_check);
				6526
				6527	return 0;
				6528
				6529	cleanup_reserve:
				/* only CPUs numbered below reserve_cpu hold a reservation */
				6530	for_each_online_cpu(i) {
				6531	/* don't unreserve more than we reserved */
				6532	if (i >= reserve_cpu) break;
				6533
				6534	pfm_unreserve_session(NULL, 1, i);
				6535	}
				6536
				6537	spin_unlock(&pfm_alt_install_check);
				6538
				6539	return ret;
				6540	}
				6541	EXPORT_SYMBOL_GPL(pfm_install_alt_pmu_interrupt);
				6542
				/*
				 * Remove the alternate PMU interrupt handler hdl.
				 *
				 * Unpublishes the handler, puts the PMU of every CPU back in the state
				 * perfmon expects through pfm_alt_restore_pmu_state(), and releases the
				 * session held on each online CPU.
				 *
				 * Returns -EINVAL if hdl is NULL or is not the installed handler,
				 * -EBUSY if another install/remove is in progress, 0 otherwise. A
				 * failure of on_each_cpu() is only reported through DPRINT and does
				 * not change the return value.
				 */
				6543	int
				6544	pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
				6545	{
				6546	int i;
				6547	int ret;
				6548
				6549	if (hdl == NULL) return -EINVAL;
				6550
				6551	/* cannot remove someone else's handler! */
				6552	if (pfm_alt_intr_handler != hdl) return -EINVAL;
				6553
				6554	/* one at a time in the install or remove, just fail the others */
				6555	if (!spin_trylock(&pfm_alt_install_check)) {
				6556	return -EBUSY;
				6557	}
				6558
				6559	pfm_alt_intr_handler = NULL;
				6560
				6561	ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1);
				6562	if (ret) {
				6563	DPRINT(("on_each_cpu() failed: %d\n", ret));
				6564	}
				6565
				6566	for_each_online_cpu(i) {
				6567	pfm_unreserve_session(NULL, 1, i);
				6568	}
				6569
				6570	spin_unlock(&pfm_alt_install_check);
				6571
				6572	return 0;
				6573	}
				6574	EXPORT_SYMBOL_GPL(pfm_remove_alt_pmu_interrupt);
				6575
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	6576	/*
				6577	* perfmon initialization routine, called from the initcall() table
				6578	*/
				6579	static int init_pfm_fs(void);
				6580
				/*
				 * Select the PMU description matching the boot CPU.
				 *
				 * Walks the NULL-terminated pmu_confs[] table. An entry with a probe()
				 * callback matches when probe() returns 0; an entry without one
				 * matches when its pmu_family equals the CPU family or is the 0xff
				 * wildcard. The first match is stored in pmu_conf.
				 *
				 * Returns 0 when a description was found, -1 otherwise.
				 */
				6581	static int __init
				6582	pfm_probe_pmu(void)
				6583	{
				6584	pmu_config_t **p;
				6585	int family;
				6586
				6587	family = local_cpu_data->family;
				6588	p = pmu_confs;
				6589
				6590	while(*p) {
				6591	if ((*p)->probe) {
				6592	if ((*p)->probe() == 0) goto found;
				6593	} else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) {
				6594	goto found;
				6595	}
				6596	p++;
				6597	}
				6598	return -1;
				6599	found:
				6600	pmu_conf = *p;
				6601	return 0;
				6602	}
				6603
				/*
				 * file operations of the /proc/perfmon entry (installed by pfm_init());
				 * everything but open is delegated to the generic seq_file helpers
				 */
				6604	static struct file_operations pfm_proc_fops = {
				6605	.open = pfm_proc_open,
				6606	.read = seq_read,
				6607	.llseek = seq_lseek,
				6608	.release = seq_release,
				6609	};
				6610
				/*
				 * perfmon initialization, registered through __initcall().
				 *
				 * Probes the PMU description, derives the implemented PMC/PMD bitmasks
				 * and counts from the description tables, validates them, creates
				 * /proc/perfmon and the perfmon sysctl table, initializes the global
				 * spinlocks and the perfmon filesystem, and seeds the per-CPU minimum
				 * overflow cycle statistic.
				 *
				 * Returns 0 on success, -ENODEV when no PMU description matches the
				 * processor, and -1 (with pmu_conf reset to NULL) when a sanity check
				 * or the /proc entry creation fails.
				 */
				6611	int __init
				6612	pfm_init(void)
				6613	{
				6614	unsigned int n, n_counters, i;
				6615
				6616	printk("perfmon: version %u.%u IRQ %u\n",
				6617	PFM_VERSION_MAJ,
				6618	PFM_VERSION_MIN,
				6619	IA64_PERFMON_VECTOR);
				6620
				6621	if (pfm_probe_pmu()) {
				6622	printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n",
				6623	local_cpu_data->family);
				6624	return -ENODEV;
				6625	}
				6626
				6627	/*
				6628	* compute the number of implemented PMD/PMC from the
				6629	* description tables
				6630	*/
				6631	n = 0;
				6632	for (i=0; PMC_IS_LAST(i) == 0; i++) {
				6633	if (PMC_IS_IMPL(i) == 0) continue;
				/* 64 registers per bitmask word: word i/64, bit i%64 */
				6634	pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63);
				6635	n++;
				6636	}
				6637	pmu_conf->num_pmcs = n;
				6638
				6639	n = 0; n_counters = 0;
				6640	for (i=0; PMD_IS_LAST(i) == 0; i++) {
				6641	if (PMD_IS_IMPL(i) == 0) continue;
				6642	pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63);
				6643	n++;
				6644	if (PMD_IS_COUNTING(i)) n_counters++;
				6645	}
				6646	pmu_conf->num_pmds = n;
				6647	pmu_conf->num_counters = n_counters;
				6648
				6649	/*
				6650	* sanity checks on the number of debug registers
				6651	*/
				6652	if (pmu_conf->use_rr_dbregs) {
				6653	if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) {
				6654	printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs);
				6655	pmu_conf = NULL;
				6656	return -1;
				6657	}
				6658	if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) {
				/* report the data debug register count that failed the check */
				6659	printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_dbrs);
				6660	pmu_conf = NULL;
				6661	return -1;
				6662	}
				6663	}
				6664
				6665	printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n",
				6666	pmu_conf->pmu_name,
				6667	pmu_conf->num_pmcs,
				6668	pmu_conf->num_pmds,
				6669	pmu_conf->num_counters,
				6670	ffz(pmu_conf->ovfl_val));
				6671
				6672	/* sanity check */
				6673	if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) {
				6674	printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
				6675	pmu_conf = NULL;
				6676	return -1;
				6677	}
				6678
				6679	/*
				6680	* create /proc/perfmon (mostly for debugging purposes)
				6681	*/
				6682	perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL);
				6683	if (perfmon_dir == NULL) {
				6684	printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
				6685	pmu_conf = NULL;
				6686	return -1;
				6687	}
				6688	/*
				6689	* install customized file operations for /proc/perfmon entry
				6690	*/
				6691	perfmon_dir->proc_fops = &pfm_proc_fops;
				6692
				6693	/*
				6694	* create /proc/sys/kernel/perfmon (for debugging purposes)
				6695	*/
				6696	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
				6697
				6698	/*
				6699	* initialize all our spinlocks
				6700	*/
				6701	spin_lock_init(&pfm_sessions.pfs_lock);
				6702	spin_lock_init(&pfm_buffer_fmt_lock);
				6703
				/* NOTE(review): the return value of init_pfm_fs() is ignored here */
				6704	init_pfm_fs();
				6705
				/* start the per-CPU minimum at the largest value so any sample lowers it */
				6706	for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL;
				6707
				6708	return 0;
				6709	}
				6710
				6711	__initcall(pfm_init);
				6712
				6713	/*
				6714	* this function is called before pfm_init()
				*
				* Per-CPU PMU setup: stops any monitoring, unfreezes the PMU and
				* programs the PMU interrupt vector register with
				* IA64_PERFMON_VECTOR. The interrupt action itself is registered only
				* once, by the CPU whose smp_processor_id() is 0.
				6715	*/
				6716	void
				6717	pfm_init_percpu (void)
				6718	{
				6719	/*
				6720	* make sure no measurement is active
				6721	* (may inherit programmed PMCs from EFI).
				6722	*/
				6723	pfm_clear_psr_pp();
				6724	pfm_clear_psr_up();
				6725
				6726	/*
				6727	* we run with the PMU not frozen at all times
				6728	*/
				6729	pfm_unfreeze_pmu();
				6730
				6731	if (smp_processor_id() == 0)
				6732	register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
				6733
				6734	ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
				6735	ia64_srlz_d();
				6736	}
				6737
				6738	/*
				6739	* used for debug purposes only
				*
				* Print the PMU state of the current CPU with printk(); from names the
				* caller in the output. Runs with local interrupts disabled.
				*
				* Nothing is printed when the per-CPU perfmon info is 0 and both the
				* saved psr.pp of current and dcr.pp are clear.
				*
				* Side effect: psr.up and psr.pp in the saved registers of current are
				* cleared after being printed.
				6740	*/
				6741	void
				6742	dump_pmu_state(const char *from)
				6743	{
				6744	struct task_struct *task;
				6745	struct thread_struct *t;
				6746	struct pt_regs *regs;
				6747	pfm_context_t *ctx;
				6748	unsigned long psr, dcr, info, flags;
				6749	int i, this_cpu;
				6750
				6751	local_irq_save(flags);
				6752
				6753	this_cpu = smp_processor_id();
				6754	regs = ia64_task_regs(current);
				6755	info = PFM_CPUINFO_GET();
				6756	dcr = ia64_getreg(_IA64_REG_CR_DCR);
				6757
				6758	if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) {
				6759	local_irq_restore(flags);
				6760	return;
				6761	}
				6762
				6763	printk("CPU%d from %s() current [%d] iip=0x%lx %s\n",
				6764	this_cpu,
				6765	from,
				6766	current->pid,
				6767	regs->cr_iip,
				6768	current->comm);
				6769
				6770	task = GET_PMU_OWNER();
				6771	ctx = GET_PMU_CTX();
				6772
				6773	printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx);
				6774
				6775	psr = pfm_get_psr();
				6776
				6777	printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n",
				6778	this_cpu,
				6779	ia64_get_pmc(0),
				6780	psr & IA64_PSR_PP ? 1 : 0,
				6781	psr & IA64_PSR_UP ? 1 : 0,
				6782	dcr & IA64_DCR_PP ? 1 : 0,
				6783	info,
				6784	ia64_psr(regs)->up,
				6785	ia64_psr(regs)->pp);
				6786
				/* stop monitoring for current once its state has been reported */
				6787	ia64_psr(regs)->up = 0;
				6788	ia64_psr(regs)->pp = 0;
				6789
				6790	t = &current->thread;
				6791
				/* index 0 is skipped: pmc0 was already printed above */
				6792	for (i=1; PMC_IS_LAST(i) == 0; i++) {
				6793	if (PMC_IS_IMPL(i) == 0) continue;
				6794	printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]);
				6795	}
				6796
				6797	for (i=1; PMD_IS_LAST(i) == 0; i++) {
				6798	if (PMD_IS_IMPL(i) == 0) continue;
				6799	printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]);
				6800	}
				6801
				6802	if (ctx) {
				/* labels name the fields actually printed (message queue head/tail) */
				6803	printk("->CPU%d ctx_state=%d vaddr=%p addr=%p msgq_head=%d msgq_tail=%d saved_psr_up=0x%lx\n",
				6804	this_cpu,
				6805	ctx->ctx_state,
				6806	ctx->ctx_smpl_vaddr,
				6807	ctx->ctx_smpl_hdr,
				6808	ctx->ctx_msgq_head,
				6809	ctx->ctx_msgq_tail,
				6810	ctx->ctx_saved_psr_up);
				6811	}
				6812	local_irq_restore(flags);
				6813	}
				6814
				6815	/*
				6816	* called from process.c:copy_thread(). task is new child.
				6817	*/
				6818	void
				6819	pfm_inherit(struct task_struct task, struct pt_regs regs)
				6820	{
				6821	struct thread_struct *thread;
				6822
				6823	DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid));
				6824
				6825	thread = &task->thread;
				6826
				6827	/*
				6828	* cut links inherited from parent (current)
				6829	*/
				6830	thread->pfm_context = NULL;
				6831
				6832	PFM_SET_WORK_PENDING(task, 0);
				6833
				6834	/*
				6835	* the psr bits are already set properly in copy_threads()
				6836	*/
				6837	}
				6838	#else /* !CONFIG_PERFMON */
				6839	asmlinkage long
				6840	sys_perfmonctl (int fd, int cmd, void *arg, int count)
				6841	{
				6842	return -ENOSYS;
				6843	}
				6844	#endif /* CONFIG_PERFMON */