Blame - doc/internals.html - fp2-dev/platform/external/oprofile

blob: 230516877c9ee7ed930336d2b27851c818511961 [file] [log] [blame]

Upstream	cc2ee17	1970-01-12 13:46:40 +0000	[diff] [blame^]	1	<?xml version="1.0" encoding="ISO-8859-1"?>
				2	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
				3	<html xmlns="http://www.w3.org/1999/xhtml">
				4	<head>
				5	<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />
				6	<title>OProfile Internals</title>
				7	<meta name="generator" content="DocBook XSL Stylesheets V1.68.1" />
				8	</head>
				9	<body>
				10	<div class="book" lang="en" xml:lang="en">
				11	<div class="titlepage">
				12	<div>
				13	<div>
				14	<h1 class="title"><a id="oprofile-internals"></a>OProfile Internals</h1>
				15	</div>
				16	<div>
				17	<div class="authorgroup">
				18	<div class="author">
				19	<h3 class="author"><span class="firstname">John</span> <span class="surname">Levon</span></h3>
				20	<div class="affiliation">
				21	<div class="address">
				22	<p>
				23	<code class="email">&lt;<a href="mailto:levon@movementarian.org">levon@movementarian.org</a>&gt;</code>
				24	</p>
				25	</div>
				26	</div>
				27	</div>
				28	</div>
				29	</div>
				30	<div>
				31	<p class="copyright">Copyright © 2003 John Levon</p>
				32	</div>
				33	</div>
				34	<hr />
				35	</div>
				36	<div class="toc">
				37	<p>
				38	<b>Table of Contents</b>
				39	</p>
				40	<dl>
				41	<dt>
				42	<span class="chapter">
				43	<a href="#introduction">1. Introduction</a>
				44	</span>
				45	</dt>
				46	<dd>
				47	<dl>
				48	<dt>
				49	<span class="sect1">
				50	<a href="#overview">1. Overview</a>
				51	</span>
				52	</dt>
				53	<dt>
				54	<span class="sect1">
				55	<a href="#components">2. Components of the OProfile system</a>
				56	</span>
				57	</dt>
				58	<dd>
				59	<dl>
				60	<dt>
				61	<span class="sect2">
				62	<a href="#arch-specific-components">2.1. Architecture-specific components</a>
				63	</span>
				64	</dt>
				65	<dt>
				66	<span class="sect2">
				67	<a href="#filesystem">2.2. oprofilefs</a>
				68	</span>
				69	</dt>
				70	<dt>
				71	<span class="sect2">
				72	<a href="#driver">2.3. Generic kernel driver</a>
				73	</span>
				74	</dt>
				75	<dt>
				76	<span class="sect2">
				77	<a href="#daemon">2.4. The OProfile daemon</a>
				78	</span>
				79	</dt>
				80	<dt>
				81	<span class="sect2">
				82	<a href="#post-profiling">2.5. Post-profiling tools</a>
				83	</span>
				84	</dt>
				85	</dl>
				86	</dd>
				87	</dl>
				88	</dd>
				89	<dt>
				90	<span class="chapter">
				91	<a href="#performance-counters">2. Performance counter management</a>
				92	</span>
				93	</dt>
				94	<dd>
				95	<dl>
				96	<dt>
				97	<span class="sect1">
				98	<a href="#performance-counters-ui">1. Providing a user interface</a>
				99	</span>
				100	</dt>
				101	<dt>
				102	<span class="sect1">
				103	<a href="#performance-counters-programming">2. Programming the performance counter registers</a>
				104	</span>
				105	</dt>
				106	<dd>
				107	<dl>
				108	<dt>
				109	<span class="sect2">
				110	<a href="#performance-counters-start">2.1. Starting and stopping the counters</a>
				111	</span>
				112	</dt>
				113	<dt>
				114	<span class="sect2">
				115	<a href="#id2495021">2.2. IA64 and perfmon</a>
				116	</span>
				117	</dt>
				118	</dl>
				119	</dd>
				120	</dl>
				121	</dd>
				122	<dt>
				123	<span class="chapter">
				124	<a href="#collecting-samples">3. Collecting and processing samples</a>
				125	</span>
				126	</dt>
				127	<dd>
				128	<dl>
				129	<dt>
				130	<span class="sect1">
				131	<a href="#receiving-interrupts">1. Receiving interrupts</a>
				132	</span>
				133	</dt>
				134	<dt>
				135	<span class="sect1">
				136	<a href="#core-structure">2. Core data structures</a>
				137	</span>
				138	</dt>
				139	<dt>
				140	<span class="sect1">
				141	<a href="#logging-sample">3. Logging a sample</a>
				142	</span>
				143	</dt>
				144	<dt>
				145	<span class="sect1">
				146	<a href="#logging-stack">4. Logging stack traces</a>
				147	</span>
				148	</dt>
				149	<dt>
				150	<span class="sect1">
				151	<a href="#synchronising-buffers">5. Synchronising the CPU buffers to the event buffer</a>
				152	</span>
				153	</dt>
				154	<dt>
				155	<span class="sect1">
				156	<a href="#dentry-cookies">6. Identifying binary images</a>
				157	</span>
				158	</dt>
				159	<dt>
				160	<span class="sect1">
				161	<a href="#finding-dentry">7. Finding a sample's binary image and offset</a>
				162	</span>
				163	</dt>
				164	</dl>
				165	</dd>
				166	<dt>
				167	<span class="chapter">
				168	<a href="#sample-files">4. Generating sample files</a>
				169	</span>
				170	</dt>
				171	<dd>
				172	<dl>
				173	<dt>
				174	<span class="sect1">
				175	<a href="#processing-buffer">1. Processing the buffer</a>
				176	</span>
				177	</dt>
				178	<dd>
				179	<dl>
				180	<dt>
				181	<span class="sect2">
				182	<a href="#handling-kernel-samples">1.1. Handling kernel samples</a>
				183	</span>
				184	</dt>
				185	</dl>
				186	</dd>
				187	<dt>
				188	<span class="sect1">
				189	<a href="#sample-file-generation">2. Locating and creating sample files</a>
				190	</span>
				191	</dt>
				192	<dt>
				193	<span class="sect1">
				194	<a href="#sample-file-writing">3. Writing data to a sample file</a>
				195	</span>
				196	</dt>
				197	</dl>
				198	</dd>
				199	<dt>
				200	<span class="chapter">
				201	<a href="#output">5. Generating useful output</a>
				202	</span>
				203	</dt>
				204	<dd>
				205	<dl>
				206	<dt>
				207	<span class="sect1">
				208	<a href="#profile-specification">1. Handling the profile specification</a>
				209	</span>
				210	</dt>
				211	<dt>
				212	<span class="sect1">
				213	<a href="#sample-file-collating">2. Collating the candidate sample files</a>
				214	</span>
				215	</dt>
				216	<dd>
				217	<dl>
				218	<dt>
				219	<span class="sect2">
				220	<a href="#sample-file-classifying">2.1. Classifying sample files</a>
				221	</span>
				222	</dt>
				223	<dt>
				224	<span class="sect2">
				225	<a href="#sample-file-inverting">2.2. Creating inverted profile lists</a>
				226	</span>
				227	</dt>
				228	</dl>
				229	</dd>
				230	<dt>
				231	<span class="sect1">
				232	<a href="#generating-profile-data">3. Generating profile data</a>
				233	</span>
				234	</dt>
				235	<dd>
				236	<dl>
				237	<dt>
				238	<span class="sect2">
				239	<a href="#bfd">3.1. Processing the binary image</a>
				240	</span>
				241	</dt>
				242	<dt>
				243	<span class="sect2">
				244	<a href="#processing-sample-files">3.2. Processing the sample files</a>
				245	</span>
				246	</dt>
				247	</dl>
				248	</dd>
				249	<dt>
				250	<span class="sect1">
				251	<a href="#generating-output">4. Generating output</a>
				252	</span>
				253	</dt>
				254	</dl>
				255	</dd>
				256	<dt>
				257	<span class="glossary">
				258	<a href="#glossary">Glossary of OProfile source concepts and types</a>
				259	</span>
				260	</dt>
				261	</dl>
				262	</div>
				263	<div class="list-of-figures">
				264	<p>
				265	<b>List of Figures</b>
				266	</p>
				267	<dl>
				268	<dt>3.1. <a href="#id2495193">The OProfile buffers</a></dt>
				269	</dl>
				270	</div>
				271	<div class="chapter" lang="en" xml:lang="en">
				272	<div class="titlepage">
				273	<div>
				274	<div>
				275	<h2 class="title"><a id="introduction"></a>Chapter 1. Introduction</h2>
				276	</div>
				277	</div>
				278	</div>
				279	<div class="toc">
				280	<p>
				281	<b>Table of Contents</b>
				282	</p>
				283	<dl>
				284	<dt>
				285	<span class="sect1">
				286	<a href="#overview">1. Overview</a>
				287	</span>
				288	</dt>
				289	<dt>
				290	<span class="sect1">
				291	<a href="#components">2. Components of the OProfile system</a>
				292	</span>
				293	</dt>
				294	<dd>
				295	<dl>
				296	<dt>
				297	<span class="sect2">
				298	<a href="#arch-specific-components">2.1. Architecture-specific components</a>
				299	</span>
				300	</dt>
				301	<dt>
				302	<span class="sect2">
				303	<a href="#filesystem">2.2. oprofilefs</a>
				304	</span>
				305	</dt>
				306	<dt>
				307	<span class="sect2">
				308	<a href="#driver">2.3. Generic kernel driver</a>
				309	</span>
				310	</dt>
				311	<dt>
				312	<span class="sect2">
				313	<a href="#daemon">2.4. The OProfile daemon</a>
				314	</span>
				315	</dt>
				316	<dt>
				317	<span class="sect2">
				318	<a href="#post-profiling">2.5. Post-profiling tools</a>
				319	</span>
				320	</dt>
				321	</dl>
				322	</dd>
				323	</dl>
				324	</div>
				325	<p>
				326	This document is current for OProfile version 0.9.1cvs.
				327	This document provides some details on the internal workings of OProfile for the
				328	interested hacker. This document assumes strong C, working C++, plus some knowledge of
				329	kernel internals and CPU hardware.
				330	</p>
				331	<div class="note" style="margin-left: 0.5in; margin-right: 0.5in;">
				332	<h3 class="title">Note</h3>
				333	<p>
				334	Only the "new" implementation associated with kernel 2.6 and above is covered here. 2.4
				335	uses a very different kernel module implementation and daemon to produce the sample files.
				336	</p>
				337	</div>
				338	<div class="sect1" lang="en" xml:lang="en">
				339	<div class="titlepage">
				340	<div>
				341	<div>
				342	<h2 class="title" style="clear: both"><a id="overview"></a>1. Overview</h2>
				343	</div>
				344	</div>
				345	</div>
				346	<p>
				347	OProfile is a statistical continuous profiler. In other words, profiles are generated by
				348	regularly sampling the current registers on each CPU (from an interrupt handler, the
				349	saved PC value at the time of interrupt is stored), and converting that runtime PC
				350	value into something meaningful to the programmer.
				351	</p>
				352	<p>
				353	OProfile achieves this by taking the stream of sampled PC values, along with the detail
				354	of which task was running at the time of the interrupt, and converting into a file offset
				355	against a particular binary file. Because applications <code class="function">mmap()</code>
				356	the code they run (be it <code class="filename">/bin/bash</code>, <code class="filename">/lib/libfoo.so</code>
				357	or whatever), it's possible to find the relevant binary file and offset by walking
				358	the task's list of mapped memory areas. Each PC value is thus converted into a tuple
				359	of binary-image,offset. This is something that the userspace tools can use directly
				360	to reconstruct where the code came from, including the particular assembly instructions,
				361	symbol, and source line (via the binary's debug information if present).
				362	</p>
				363	<p>
				364	Regularly sampling the PC value like this approximates what actually was executed and
				365	how often - more often than not, this statistical approximation is good enough to
				366	reflect reality. In common operation, the time between each sample interrupt is regulated
				367	by a fixed number of clock cycles. This implies that the results will reflect where
				368	the CPU is spending the most time; this is obviously a very useful information source
				369	for performance analysis.
				370	</p>
				371	<p>
				372	Sometimes though, an application programmer needs different kinds of information: for example,
				373	"which of the source routines cause the most cache misses?". The rise in importance of
				374	such metrics in recent years has led many CPU manufacturers to provide hardware performance
				375	counters capable of measuring these events on the hardware level. Typically, these counters
				376	increment once per event, and generate an interrupt on reaching some pre-defined
				377	number of events. OProfile can use these interrupts to generate samples: then, the
				378	profile results are a statistical approximation of which code caused how many of the
				379	given event.
				380	</p>
				381	<p>
				382	Consider a simplified system that only executes two functions A and B. A
				383	takes one cycle to execute, whereas B takes 99 cycles. Imagine we run at
				384	100 cycles a second, and we've set the performance counter to create an
				385	interrupt after a set number of "events" (in this case an event is one
				386	clock cycle). It should be clear that the chance of the interrupt
				387	occurring in function A is 1/100, and 99/100 for function B. Thus, we
				388	statistically approximate the actual relative performance features of
				389	the two functions over time. This same analysis works for other types of
				390	events, providing that the interrupt is tied to the number of events
				391	occurring (that is, after N events, an interrupt is generated).
				392	</p>
				393	<p>
				394	There is typically more than one of these counters, so it's possible to set up profiling
				395	for several different event types. Using these counters gives us a powerful, low-overhead
				396	way of gaining performance metrics. If OProfile, or the CPU, does not support performance
				397	counters, then a simpler method is used: the kernel timer interrupt feeds samples
				398	into OProfile itself.
				399	</p>
				400	<p>
				401	The rest of this document concerns itself with how we get from receiving samples at
				402	interrupt time to producing user-readable profile information.
				403	</p>
				404	</div>
				405	<div class="sect1" lang="en" xml:lang="en">
				406	<div class="titlepage">
				407	<div>
				408	<div>
				409	<h2 class="title" style="clear: both"><a id="components"></a>2. Components of the OProfile system</h2>
				410	</div>
				411	</div>
				412	</div>
				413	<div class="sect2" lang="en" xml:lang="en">
				414	<div class="titlepage">
				415	<div>
				416	<div>
				417	<h3 class="title"><a id="arch-specific-components"></a>2.1. Architecture-specific components</h3>
				418	</div>
				419	</div>
				420	</div>
				421	<p>
				422	If OProfile supports the hardware performance counters found on
				423	a particular architecture, code for managing the details of setting
				424	up and managing these counters can be found in the kernel source
				425	tree in the relevant <code class="filename">arch/<span class="emphasis"><em>arch</em></span>/oprofile/</code>
				426	directory. The architecture-specific implementation works via
				427	filling in the oprofile_operations structure at init time. This
				428	provides a set of operations such as <code class="function">setup()</code>,
				429	<code class="function">start()</code>, <code class="function">stop()</code>, etc.
				430	that manage the hardware-specific details of fiddling with the
				431	performance counter registers.
				432	</p>
				433	<p>
				434	The other important facility available to the architecture code is
				435	<code class="function">oprofile_add_sample()</code>. This is where a particular sample
				436	taken at interrupt time is fed into the generic OProfile driver code.
				437	</p>
				438	</div>
				439	<div class="sect2" lang="en" xml:lang="en">
				440	<div class="titlepage">
				441	<div>
				442	<div>
				443	<h3 class="title"><a id="filesystem"></a>2.2. oprofilefs</h3>
				444	</div>
				445	</div>
				446	</div>
				447	<p>
				448	OProfile implements a pseudo-filesystem known as "oprofilefs", mounted from
				449	userspace at <code class="filename">/dev/oprofile</code>. This consists of small
				450	files for reporting and receiving configuration from userspace, as well
				451	as the actual character device that the OProfile userspace receives samples
				452	from. At <code class="function">setup()</code> time, the architecture-specific code may
				453	add further configuration files related to the details of the performance
				454	counters. For example, on x86, one numbered directory for each hardware
				455	performance counter is added, with files in each for the event type,
				456	reset value, etc.
				457	</p>
				458	<p>
				459	The filesystem also contains a <code class="filename">stats</code> directory with
				460	a number of useful counters for various OProfile events.
				461	</p>
				462	</div>
				463	<div class="sect2" lang="en" xml:lang="en">
				464	<div class="titlepage">
				465	<div>
				466	<div>
				467	<h3 class="title"><a id="driver"></a>2.3. Generic kernel driver</h3>
				468	</div>
				469	</div>
				470	</div>
				471	<p>
				472	This lives in <code class="filename">drivers/oprofile/</code>, and forms the core of
				473	how OProfile works in the kernel. Its job is to take samples delivered
				474	from the architecture-specific code (via <code class="function">oprofile_add_sample()</code>),
				475	and buffer this data, in a transformed form as described later, until releasing
				476	the data to the userspace daemon via the <code class="filename">/dev/oprofile/buffer</code>
				477	character device.
				478	</p>
				479	</div>
				480	<div class="sect2" lang="en" xml:lang="en">
				481	<div class="titlepage">
				482	<div>
				483	<div>
				484	<h3 class="title"><a id="daemon"></a>2.4. The OProfile daemon</h3>
				485	</div>
				486	</div>
				487	</div>
				488	<p>
				489	The OProfile userspace daemon's job is to take the raw data provided by the
				490	kernel and write it to the disk. It takes the single data stream from the
				491	kernel and logs sample data against a number of sample files (found in
				492	<code class="filename">/var/lib/oprofile/samples/current/</code>). For the benefit
				493	of the "separate" functionality, the names/paths of these sample files
				494	are mangled to reflect where the samples were from: this can include
				495	thread IDs, the binary file path, the event type used, and more.
				496	</p>
				497	<p>
				498	After this final step from interrupt to disk file, the data is now
				499	persistent (that is, changes in the running of the system do not invalidate
				500	stored data). So the post-profiling tools can run on this data at any
				501	time (assuming the original binary files are still available and unchanged,
				502	naturally).
				503	</p>
				504	</div>
				505	<div class="sect2" lang="en" xml:lang="en"><div class="titlepage"><div><div><h3 class="title"><a id="post-profiling"></a>2.5. Post-profiling tools</h3></div></div></div>
				506	So far, we've collected data, but we've yet to present it in a useful form
				507	to the user. This is the job of the post-profiling tools. In general form,
				508	they collate a subset of the available sample files, load and process each one
				509	correlated against the relevant binary file, and finally produce user-readable
				510	information.
				511	</div>
				512	</div>
				513	</div>
				514	<div class="chapter" lang="en" xml:lang="en">
				515	<div class="titlepage">
				516	<div>
				517	<div>
				518	<h2 class="title"><a id="performance-counters"></a>Chapter 2. Performance counter management</h2>
				519	</div>
				520	</div>
				521	</div>
				522	<div class="toc">
				523	<p>
				524	<b>Table of Contents</b>
				525	</p>
				526	<dl>
				527	<dt>
				528	<span class="sect1">
				529	<a href="#performance-counters-ui">1. Providing a user interface</a>
				530	</span>
				531	</dt>
				532	<dt>
				533	<span class="sect1">
				534	<a href="#performance-counters-programming">2. Programming the performance counter registers</a>
				535	</span>
				536	</dt>
				537	<dd>
				538	<dl>
				539	<dt>
				540	<span class="sect2">
				541	<a href="#performance-counters-start">2.1. Starting and stopping the counters</a>
				542	</span>
				543	</dt>
				544	<dt>
				545	<span class="sect2">
				546	<a href="#id2495021">2.2. IA64 and perfmon</a>
				547	</span>
				548	</dt>
				549	</dl>
				550	</dd>
				551	</dl>
				552	</div>
				553	<div class="sect1" lang="en" xml:lang="en">
				554	<div class="titlepage">
				555	<div>
				556	<div>
				557	<h2 class="title" style="clear: both"><a id="performance-counters-ui"></a>1. Providing a user interface</h2>
				558	</div>
				559	</div>
				560	</div>
				561	<p>
				562	The performance counter registers need programming in order to set the
				563	type of event to count, etc. OProfile uses a standard model across all
				564	CPUs for defining these events as follows:
				565	</p>
				566	<div class="informaltable">
				567	<table border="1">
				568	<colgroup>
				569	<col />
				570	<col />
				571	</colgroup>
				572	<tbody>
				573	<tr>
				574	<td>
				575	<code class="option">event</code>
				576	</td>
				577	<td>The event type e.g. DATA_MEM_REFS</td>
				578	</tr>
				579	<tr>
				580	<td>
				581	<code class="option">unit mask</code>
				582	</td>
				583	<td>The sub-events to count (more detailed specification)</td>
				584	</tr>
				585	<tr>
				586	<td>
				587	<code class="option">counter</code>
				588	</td>
				589	<td>The hardware counter(s) that can count this event</td>
				590	</tr>
				591	<tr>
				592	<td>
				593	<code class="option">count</code>
				594	</td>
				595	<td>The reset value (how many events before an interrupt)</td>
				596	</tr>
				597	<tr>
				598	<td>
				599	<code class="option">kernel</code>
				600	</td>
				601	<td>Whether the counter should increment when in kernel space</td>
				602	</tr>
				603	<tr>
				604	<td>
				605	<code class="option">user</code>
				606	</td>
				607	<td>Whether the counter should increment when in user space</td>
				608	</tr>
				609	</tbody>
				610	</table>
				611	</div>
				612	<p>
				613	The term "unit mask" is borrowed from the Intel architectures, and can
				614	further specify exactly when a counter is incremented (for example,
				615	cache-related events can be restricted to particular state transitions
				616	of the cache lines).
				617	</p>
				618	<p>
				619	All of the available hardware events and their details are specified in
				620	the textual files in the <code class="filename">events</code> directory. The
				621	syntax of these files should be fairly obvious. The user specifies the
				622	names and configuration details of the chosen counters via
				623	<span><strong class="command">opcontrol</strong></span>. These are then written to the kernel
				624	module (in numerical form) via <code class="filename">/dev/oprofile/N/</code>
				625	where N is the physical hardware counter (some events can only be used
				626	on specific counters; OProfile hides these details from the user when
				627	possible). On IA64, the perfmon-based interface behaves somewhat
				628	differently, as described later.
				629	</p>
				630	</div>
				631	<div class="sect1" lang="en" xml:lang="en">
				632	<div class="titlepage">
				633	<div>
				634	<div>
				635	<h2 class="title" style="clear: both"><a id="performance-counters-programming"></a>2. Programming the performance counter registers</h2>
				636	</div>
				637	</div>
				638	</div>
				639	<p>
				640	We have described how the user interface fills in the desired
				641	configuration of the counters and transmits the information to the
				642	kernel. It is the job of the <code class="function">-&gt;setup()</code> method
				643	to actually program the performance counter registers. Clearly, the
				644	details of how this is done are architecture-specific; they are also
				645	model-specific on many architectures. For example, i386 provides methods
				646	for each model type that program the counter registers correctly
				647	(see the <code class="filename">op_model_*</code> files in
				648	<code class="filename">arch/i386/oprofile</code> for the details). The method
				649	reads the values stored in the virtual oprofilefs files and programs
				650	the registers appropriately, ready for starting the actual profiling
				651	session.
				652	</p>
				653	<p>
				654	The architecture-specific drivers make sure to save the old register
				655	settings before doing OProfile setup. They are restored when OProfile
				656	shuts down. This is useful, for example, on i386, where the NMI watchdog
				657	uses the same performance counter registers as OProfile; they cannot
				658	run concurrently, but OProfile makes sure to restore the setup it found
				659	before it was running.
				660	</p>
				661	<p>
				662	In addition to programming the counter registers themselves, other setup
				663	is often necessary. For example, on i386, the local APIC needs
				664	programming in order to make the counter's overflow interrupt appear as
				665	an NMI (non-maskable interrupt). This allows sampling (and therefore
				666	profiling) of regions where "normal" interrupts are masked, enabling
				667	more reliable profiles.
				668	</p>
				669	<div class="sect2" lang="en" xml:lang="en">
				670	<div class="titlepage">
				671	<div>
				672	<div>
				673	<h3 class="title"><a id="performance-counters-start"></a>2.1. Starting and stopping the counters</h3>
				674	</div>
				675	</div>
				676	</div>
				677	<p>
				678	Initiating a profiling session is done via writing an ASCII '1'
				679	to the file <code class="filename">/dev/oprofile/enable</code>. This sets up the
				680	core, and calls into the architecture-specific driver to actually
				681	enable each configured counter. Again, the details of how this is
				682	done are model-specific (for example, the Athlon models can disable
				683	or enable on a per-counter basis, unlike the PPro models).
				684	</p>
				685	</div>
				686	<div class="sect2" lang="en" xml:lang="en">
				687	<div class="titlepage">
				688	<div>
				689	<div>
				690	<h3 class="title"><a id="id2495021"></a>2.2. IA64 and perfmon</h3>
				691	</div>
				692	</div>
				693	</div>
				694	<p>
				695	The IA64 architecture provides a different interface from the other
				696	architectures, using the existing perfmon driver. Register programming
				697	is handled entirely in user-space (see
				698	<code class="filename">daemon/opd_perfmon.c</code> for the details). A process
				699	is forked for each CPU, which creates a perfmon context and sets the
				700	counter registers appropriately via the
				701	<code class="function">sys_perfmonctl</code> interface. In addition, the actual
				702	initiation and termination of the profiling session is handled via the
				703	same interface using <code class="constant">PFM_START</code> and
				704	<code class="constant">PFM_STOP</code>. On IA64, then, there are no oprofilefs
				705	files for the performance counters, as the kernel driver does not
				706	program the registers itself.
				707	</p>
				708	<p>
				709	Instead, the perfmon driver for OProfile simply registers with the
				710	OProfile core with an OProfile-specific UUID. During a profiling
				711	session, the perfmon core calls into the OProfile perfmon driver and
				712	samples are registered with the OProfile core itself as usual (with
				713	<code class="function">oprofile_add_sample()</code>).
				714	</p>
				715	</div>
				716	</div>
				717	</div>
				718	<div class="chapter" lang="en" xml:lang="en">
				719	<div class="titlepage">
				720	<div>
				721	<div>
				722	<h2 class="title"><a id="collecting-samples"></a>Chapter 3. Collecting and processing samples</h2>
				723	</div>
				724	</div>
				725	</div>
				726	<div class="toc">
				727	<p>
				728	<b>Table of Contents</b>
				729	</p>
				730	<dl>
				731	<dt>
				732	<span class="sect1">
				733	<a href="#receiving-interrupts">1. Receiving interrupts</a>
				734	</span>
				735	</dt>
				736	<dt>
				737	<span class="sect1">
				738	<a href="#core-structure">2. Core data structures</a>
				739	</span>
				740	</dt>
				741	<dt>
				742	<span class="sect1">
				743	<a href="#logging-sample">3. Logging a sample</a>
				744	</span>
				745	</dt>
				746	<dt>
				747	<span class="sect1">
				748	<a href="#logging-stack">4. Logging stack traces</a>
				749	</span>
				750	</dt>
				751	<dt>
				752	<span class="sect1">
				753	<a href="#synchronising-buffers">5. Synchronising the CPU buffers to the event buffer</a>
				754	</span>
				755	</dt>
				756	<dt>
				757	<span class="sect1">
				758	<a href="#dentry-cookies">6. Identifying binary images</a>
				759	</span>
				760	</dt>
				761	<dt>
				762	<span class="sect1">
				763	<a href="#finding-dentry">7. Finding a sample's binary image and offset</a>
				764	</span>
				765	</dt>
				766	</dl>
				767	</div>
				768	<div class="sect1" lang="en" xml:lang="en">
				769	<div class="titlepage">
				770	<div>
				771	<div>
				772	<h2 class="title" style="clear: both"><a id="receiving-interrupts"></a>1. Receiving interrupts</h2>
				773	</div>
				774	</div>
				775	</div>
				776	<p>
				777	Naturally, how the overflow interrupts are received is specific
				778	to the hardware architecture, unless we are in "timer" mode, where the
				779	logging routine is called directly from the standard kernel timer
				780	interrupt handler.
				781	</p>
				782	<p>
				783	On the i386 architecture, the local APIC is programmed such that when a
				784	counter overflows (that is, it receives an event that causes an integer
				785	overflow of the register value to zero), an NMI is generated. This calls
				786	into the general handler <code class="function">do_nmi()</code>; because OProfile
				787	has registered itself as capable of handling NMI interrupts, this will
				788	call into the OProfile driver code in
				789	<code class="filename">arch/i386/oprofile</code>. Here, the saved PC value (the
				790	CPU saves the register set at the time of interrupt on the stack
				791	available for inspection) is extracted, and the counters are examined to
				792	find out which one generated the interrupt. Also determined is whether
				793	the system was inside kernel or user space at the time of the interrupt.
				794	These three pieces of information are then forwarded onto the OProfile
				795	core via <code class="function">oprofile_add_sample()</code>. Finally, the
				796	counter values are reset to the chosen count value, to ensure another
				797	interrupt happens after another N events have occurred. Other
				798	architectures behave in a similar manner.
				799	</p>
				800	</div>
				801	<div class="sect1" lang="en" xml:lang="en">
				802	<div class="titlepage">
				803	<div>
				804	<div>
				805	<h2 class="title" style="clear: both"><a id="core-structure"></a>2. Core data structures</h2>
				806	</div>
				807	</div>
				808	</div>
				809	<p>
				810	Before considering what happens when we log a sample, we shall digress
				811	for a moment and look at the general structure of the data collection
				812	system.
				813	</p>
				814	<p>
				815	OProfile maintains a small buffer for storing the logged samples for
				816	each CPU on the system. Only this buffer is altered when we actually log
				817	a sample (remember, we may still be in an NMI context, so no locking is
				818	possible). The buffer is managed by a two-handed system; the "head"
				819	iterator dictates where the next sample data should be placed in the
				820	buffer. Of course, overflow of the buffer is possible, in which case
				821	the sample is discarded.
				822	</p>
				823	<p>
				824	It is critical to remember that at this point, the PC value is an
				825	absolute value, and is therefore only meaningful in the context of which
				826	task it was logged against. Thus, these per-CPU buffers also maintain
				827	details of which task each logged sample is for, as described in the
				828	next section. In addition, we store whether the sample was in kernel
				829	space or user space (on some architectures and configurations, the address
				830	space is not sub-divided neatly at a specific PC value, so we must store
				831	this information).
				832	</p>
				833	<p>
				834	As well as these small per-CPU buffers, we have a considerably larger
				835	single buffer. This holds the data that is eventually copied out into
				836	the OProfile daemon. On certain system events, the per-CPU buffers are
				837	processed and entered (in mutated form) into the main buffer, known in
				838	the source as the "event buffer". The "tail" iterator indicates the
				839	point from which the CPU buffer may be read, up to the position of the "head"
				840	iterator. This provides an entirely lock-free method for extracting data
				841	from the CPU buffers. This process is described in detail later in this chapter.
				842	</p>
				843	<div class="figure">
				844	<a id="id2495193"></a>
				845	<p class="title">
				846	<b>Figure 3.1. The OProfile buffers</b>
				847	</p>
				848	<div>
				849	<img src="buffers.png" alt="The OProfile buffers" />
				850	</div>
				851	</div>
				852	</div>
				853	<div class="sect1" lang="en" xml:lang="en">
				854	<div class="titlepage">
				855	<div>
				856	<div>
				857	<h2 class="title" style="clear: both"><a id="logging-sample"></a>3. Logging a sample</h2>
				858	</div>
				859	</div>
				860	</div>
				861	<p>
				862	As mentioned, the sample is logged into the buffer specific to the
				863	current CPU. The CPU buffer is a simple array of pairs of unsigned long
				864	values; for a sample, they hold the PC value and the counter for the
				865	sample. (The counter value is later used to translate back into the relevant
				866	event type the counter was programmed to count.)
				867	</p>
				868	<p>
				869	In addition to logging the sample itself, we also log task switches.
				870	This is simply done by storing the address of the last task to log a
				871	sample on that CPU in a data structure, and writing a task switch entry
				872	into the buffer if the new value of <code class="function">current()</code> has
				873	changed. Note that later we will directly de-reference this pointer;
				874	this imposes certain restrictions on when and how the CPU buffers need
				875	to be processed.
				876	</p>
				877	<p>
				878	Finally, as mentioned, we log whether we have changed between kernel and
				879	userspace using a similar method. Both of these variables
				880	(<code class="varname">last_task</code> and <code class="varname">last_is_kernel</code>) are
				881	reset when the CPU buffer is read.
				882	</p>
				883	</div>
				884	<div class="sect1" lang="en" xml:lang="en">
				885	<div class="titlepage">
				886	<div>
				887	<div>
				888	<h2 class="title" style="clear: both"><a id="logging-stack"></a>4. Logging stack traces</h2>
				889	</div>
				890	</div>
				891	</div>
				892	<p>
				893	OProfile can also provide statistical samples of call chains (on x86). To
				894	do this, at sample time, the frame pointer chain is traversed, recording
				895	the return address for each stack frame. This will only work if the code
				896	was compiled with frame pointers, but we're careful to abort the
				897	traversal if the frame pointer appears bad. We store the set of return
				898	addresses straight into the CPU buffer. Note that, since this traversal
				899	is keyed off the standard sample interrupt, the number of times a
				900	function appears in a stack trace is not an indicator of how many times
				901	the call site was executed: rather, it's related to the number of
				902	samples we took where that call site was involved. Thus, the results for
				903	stack traces are not necessarily proportional to the call counts:
				904	typical programs will have many <code class="function">main()</code> samples.
				905	</p>
				906	</div>
				907	<div class="sect1" lang="en" xml:lang="en">
				908	<div class="titlepage">
				909	<div>
				910	<div>
				911	<h2 class="title" style="clear: both"><a id="synchronising-buffers"></a>5. Synchronising the CPU buffers to the event buffer</h2>
				912	</div>
				913	</div>
				914	</div>
				915	<p>
				916	At some point, we have to process the data in each CPU buffer and enter
				917	it into the main (event) buffer. The file
				918	<code class="filename">buffer_sync.c</code> contains the relevant code. We
				919	periodically (currently every <code class="constant">HZ</code>/4 jiffies) start
				920	the synchronisation process. In addition, we process the buffers on
				921	certain events, such as an application calling
				922	<code class="function">munmap()</code>. This is particularly important for
				923	<code class="function">exit()</code> - because the CPU buffers contain pointers
				924	to the task structure, if we don't process all the buffers before the
				925	task is actually destroyed and the task structure freed, then we could
				926	end up trying to dereference a bogus pointer in one of the CPU buffers.
				927	</p>
				928	<p>
				929	We also add a notification when a kernel module is loaded; this is so
				930	that user-space can re-read <code class="filename">/proc/modules</code> to
				931	determine the load addresses of kernel module text sections. Without
				932	this notification, samples for a newly-loaded module could get lost or
				933	be attributed to the wrong module.
				934	</p>
				935	<p>
				936	The synchronisation itself works in the following manner: first, mutual
				937	exclusion on the event buffer is taken. Remember, we do not need to do
				938	that for each CPU buffer, as we only read from the tail iterator (meanwhile,
				939	interrupts might be arriving at the same buffer, but they will write to
				940	the position of the head iterator, leaving previously written entries
				941	intact). Then, we process each CPU buffer in turn. A CPU switch
				942	notification is added to the buffer first (for
				943	<code class="option">--separate=cpu</code> support). Then the processing of the
				944	actual data starts.
				945	</p>
				946	<p>
				947	As mentioned, the CPU buffer consists of task switch entries and the
				948	actual samples. When the routine <code class="function">sync_buffer()</code> sees
				949	a task switch, the process ID and process group ID are recorded into the
				950	event buffer, along with a dcookie (see below) identifying the
				951	application binary (e.g. <code class="filename">/bin/bash</code>). The
				952	<code class="varname">mmap_sem</code> for the task is then taken, to allow safe
				953	iteration across the task's list of mapped areas. Each sample is then
				954	processed as described in the next section.
				955	</p>
				956	<p>
				957	After a buffer has been read, the tail iterator is updated to reflect
				958	how much of the buffer was processed. Note that when we determined how
				959	much data there was to read in the CPU buffer, we also called
				960	<code class="function">cpu_buffer_reset()</code> to reset
				961	<code class="varname">last_task</code> and <code class="varname">last_is_kernel</code>, as
				962	we've already mentioned. During the processing, more samples may have
				963	been arriving in the CPU buffer; this is OK because we are careful to
				964	only update the tail iterator to how much we actually read - on the next
				965	buffer synchronisation, we will start again from that point.
				966	</p>
				967	</div>
				968	<div class="sect1" lang="en" xml:lang="en">
				969	<div class="titlepage">
				970	<div>
				971	<div>
				972	<h2 class="title" style="clear: both"><a id="dentry-cookies"></a>6. Identifying binary images</h2>
				973	</div>
				974	</div>
				975	</div>
				976	<p>
				977	In order to produce useful profiles, we need to be able to associate a
				978	particular PC value sample with an actual ELF binary on the disk. This
				979	leaves us with the problem of how to export this information to
				980	user-space. We create unique IDs that identify a particular directory
				981	entry (dentry), and write those IDs into the event buffer. Later on,
				982	the user-space daemon can call the <code class="function">lookup_dcookie</code>
				983	system call, which looks up the ID and fills in the full path of
				984	the binary image in the buffer user-space passes in. These IDs are
				985	maintained by the code in <code class="filename">fs/dcookies.c</code>; the
				986	cache lasts for as long as the daemon has the event buffer open.
				987	</p>
				988	</div>
				989	<div class="sect1" lang="en" xml:lang="en">
				990	<div class="titlepage">
				991	<div>
				992	<div>
				993	<h2 class="title" style="clear: both"><a id="finding-dentry"></a>7. Finding a sample's binary image and offset</h2>
				994	</div>
				995	</div>
				996	</div>
				997	<p>
				998	We haven't yet described how we process the absolute PC value into
				999	something usable by the user-space daemon. When we find a sample entered
				1000	into the CPU buffer, we traverse the list of mappings for the task
				1001	(remember, we will have seen a task switch earlier, so we know which
				1002	task's lists to look at). When a mapping is found that contains the PC
				1003	value, we look up the mapped file's dentry in the dcookie cache. This
				1004	gives the dcookie ID that will uniquely identify the mapped file. Then
				1005	we alter the absolute value such that it is an offset from the start of
				1006	the file being mapped (the mapping need not start at the start of the
				1007	actual file, so we have to consider the offset value of the mapping). We
				1008	store this dcookie ID into the event buffer; this identifies which
				1009	binary the samples following it are against.
				1010	In this manner, we have converted a PC value, which has transitory
				1011	meaning only, into a static offset value for later processing by the
				1012	daemon.
				1013	</p>
				1014	<p>
				1015	We also attempt to avoid the relatively expensive lookup of the dentry
				1016	cookie value by storing the cookie value directly into the dentry
				1017	itself; then we can simply derive the cookie value immediately when we
				1018	find the correct mapping.
				1019	</p>
				1020	</div>
				1021	</div>
				1022	<div class="chapter" lang="en" xml:lang="en">
				1023	<div class="titlepage">
				1024	<div>
				1025	<div>
				1026	<h2 class="title"><a id="sample-files"></a>Chapter 4. Generating sample files</h2>
				1027	</div>
				1028	</div>
				1029	</div>
				1030	<div class="toc">
				1031	<p>
				1032	<b>Table of Contents</b>
				1033	</p>
				1034	<dl>
				1035	<dt>
				1036	<span class="sect1">
				1037	<a href="#processing-buffer">1. Processing the buffer</a>
				1038	</span>
				1039	</dt>
				1040	<dd>
				1041	<dl>
				1042	<dt>
				1043	<span class="sect2">
				1044	<a href="#handling-kernel-samples">1.1. Handling kernel samples</a>
				1045	</span>
				1046	</dt>
				1047	</dl>
				1048	</dd>
				1049	<dt>
				1050	<span class="sect1">
				1051	<a href="#sample-file-generation">2. Locating and creating sample files</a>
				1052	</span>
				1053	</dt>
				1054	<dt>
				1055	<span class="sect1">
				1056	<a href="#sample-file-writing">3. Writing data to a sample file</a>
				1057	</span>
				1058	</dt>
				1059	</dl>
				1060	</div>
				1061	<div class="sect1" lang="en" xml:lang="en">
				1062	<div class="titlepage">
				1063	<div>
				1064	<div>
				1065	<h2 class="title" style="clear: both"><a id="processing-buffer"></a>1. Processing the buffer</h2>
				1066	</div>
				1067	</div>
				1068	</div>
				1069	<p>
				1070	Now we can move on to user-space in our description of how raw interrupt
				1071	samples are processed into useful information. As we described in
				1072	previous sections, the kernel OProfile driver creates a large buffer of
				1073	sample data consisting of offset values, interspersed with
				1074	notification of changes in context. These context changes indicate how
				1075	following samples should be attributed, and include task switches, CPU
				1076	changes, and which dcookie the sample value is against. By processing
				1077	this buffer entry-by-entry, we can determine where the samples should
				1078	be accredited to. This is particularly important when using the
				1079	<code class="option">--separate</code> option.
				1080	</p>
				1081	<p>
				1082	The file <code class="filename">daemon/opd_trans.c</code> contains the basic routine
				1083	for the buffer processing. The <code class="varname">struct transient</code>
				1084	structure is used to hold changes in context. Its members are modified
				1085	as we process each entry; it is passed into the routines in
				1086	<code class="filename">daemon/opd_sfile.c</code> for actually logging the sample
				1087	to a particular sample file (which will be held in
				1088	<code class="filename">/var/lib/oprofile/samples/current</code>).
				1089	</p>
				1090	<p>
				1091	The buffer format is designed for conciseness, as high sampling rates
				1092	can easily generate a lot of data. Thus, context changes are prefixed
				1093	by an escape code, identified by <code class="function">is_escape_code()</code>.
				1094	If an escape code is found, the next entry in the buffer identifies
				1095	what type of context change is being read. These are handed off to
				1096	various handlers (see the <code class="varname">handlers</code> array), which
				1097	modify the transient structure as appropriate. If it's not an escape
				1098	code, then it must be a PC offset value, and the very next entry will
				1099	be the numeric hardware counter. These values are read and recorded
				1100	in the transient structure; we then do a lookup to find the correct
				1101	sample file, and log the sample, as described in the next section.
				1102	</p>
				1103	<div class="sect2" lang="en" xml:lang="en">
				1104	<div class="titlepage">
				1105	<div>
				1106	<div>
				1107	<h3 class="title"><a id="handling-kernel-samples"></a>1.1. Handling kernel samples</h3>
				1108	</div>
				1109	</div>
				1110	</div>
				1111	<p>
				1112	Samples from kernel code require a little special handling. Because
				1113	the binary text which the sample is against does not correspond to
				1114	any file that the kernel directly knows about, the OProfile driver
				1115	stores the absolute PC value in the buffer, instead of the file offset.
				1116	Of course, we need an offset against some particular binary. To handle
				1117	this, we keep a list of loaded modules by parsing
				1118	<code class="filename">/proc/modules</code> as needed. When a module is loaded,
				1119	a notification is placed in the OProfile buffer, and this triggers a
				1120	re-read. We store the module name, and the loading address and size.
				1121	This is also done for the main kernel image, as specified by the user.
				1122	The absolute PC value is matched against each address range, and
				1123	modified into an offset when the matching module is found. See
				1124	<code class="filename">daemon/opd_kernel.c</code> for the details.
				1125	</p>
				1126	</div>
				1127	</div>
				1128	<div class="sect1" lang="en" xml:lang="en">
				1129	<div class="titlepage">
				1130	<div>
				1131	<div>
				1132	<h2 class="title" style="clear: both"><a id="sample-file-generation"></a>2. Locating and creating sample files</h2>
				1133	</div>
				1134	</div>
				1135	</div>
				1136	<p>
				1137	We have a sample value and its satellite data stored in a
				1138	<code class="varname">struct transient</code>, and we must locate an
				1139	actual sample file to store the sample in, using the context
				1140	information in the transient structure as a key. The transient data to
				1141	sample file lookup is handled in
				1142	<code class="filename">daemon/opd_sfile.c</code>. A hash is taken of the
				1143	transient values that are relevant (depending upon the setting of
				1144	<code class="option">--separate</code>, some values might be irrelevant), and the
				1145	hash value is used to look up the list of currently open sample files.
				1146	Of course, the sample file might not be found, in which case we need
				1147	to create and open it.
				1148	</p>
				1149	<p>
				1150	OProfile uses a rather complex scheme for naming sample files, in order
				1151	to make selecting relevant sample files easier for the post-profiling
				1152	utilities. The exact details of the scheme are given in
				1153	<code class="filename">oprofile-tests/pp_interface</code>, but for now it will
				1154	suffice to remember that the filename will include only relevant
				1155	information for the current settings, taken from the transient data. A
				1156	fully-specified filename looks something like:
				1157	</p>
				1158	<code class="computeroutput">
				1159	/var/lib/oprofile/samples/current/{root}/usr/bin/xmms/{dep}/{root}/lib/tls/libc-2.3.2.so/CPU_CLK_UNHALTED.100000.0.28082.28089.0
				1160	</code>
				1161	<p>
				1162	It should be clear that this identifies such information as the
				1163	application binary, the dependent (library) binary, the hardware event,
				1164	and the process and thread ID. Typically, not all this information is
				1165	needed, in which case some values may be replaced with the token
				1166	<code class="filename">all</code>.
				1167	</p>
				1168	<p>
				1169	The code that generates this filename and opens the file is found in
				1170	<code class="filename">daemon/opd_mangling.c</code>. You may have realised that
				1171	at this point, we do not have the binary image file names, only the
				1172	dcookie values. In order to determine a file name, a dcookie value is
				1173	looked up in the dcookie cache. This is to be found in
				1174	<code class="filename">daemon/opd_cookie.c</code>. Since dcookies are both
				1175	persistent and unique during a sampling session, we can cache the
				1176	values. If the value is not found in the cache, then we ask the kernel
				1177	to do the lookup from value to file name for us by calling
				1178	<code class="function">lookup_dcookie()</code>. This looks up the value in a
				1179	kernel-side cache (see <code class="filename">fs/dcookies.c</code>) and returns
				1180	the fully-qualified file name to userspace.
				1181	</p>
				1182	</div>
				1183	<div class="sect1" lang="en" xml:lang="en">
				1184	<div class="titlepage">
				1185	<div>
				1186	<div>
				1187	<h2 class="title" style="clear: both"><a id="sample-file-writing"></a>3. Writing data to a sample file</h2>
				1188	</div>
				1189	</div>
				1190	</div>
				1191	<p>
				1192	Each specific sample file is a hashed collection, where the key is
				1193	the PC offset from the transient data, and the value is the number of
				1194	samples recorded against that offset. The files are
				1195	<code class="function">mmap()</code>ed into the daemon's memory space. The code
				1196	to actually log the write against the sample file can be found in
				1197	<code class="filename">libdb/</code>.
				1198	</p>
				1199	<p>
				1200	For recording stack traces, we have a more complicated sample filename
				1201	mangling scheme that allows us to identify cross-binary calls. We use
				1202	the same sample file format, where the key is a 64-bit value composed
				1203	from the from,to pair of offsets.
				1204	</p>
				1205	</div>
				1206	</div>
				1207	<div class="chapter" lang="en" xml:lang="en">
				1208	<div class="titlepage">
				1209	<div>
				1210	<div>
				1211	<h2 class="title"><a id="output"></a>Chapter 5. Generating useful output</h2>
				1212	</div>
				1213	</div>
				1214	</div>
				1215	<div class="toc">
				1216	<p>
				1217	<b>Table of Contents</b>
				1218	</p>
				1219	<dl>
				1220	<dt>
				1221	<span class="sect1">
				1222	<a href="#profile-specification">1. Handling the profile specification</a>
				1223	</span>
				1224	</dt>
				1225	<dt>
				1226	<span class="sect1">
				1227	<a href="#sample-file-collating">2. Collating the candidate sample files</a>
				1228	</span>
				1229	</dt>
				1230	<dd>
				1231	<dl>
				1232	<dt>
				1233	<span class="sect2">
				1234	<a href="#sample-file-classifying">2.1. Classifying sample files</a>
				1235	</span>
				1236	</dt>
				1237	<dt>
				1238	<span class="sect2">
				1239	<a href="#sample-file-inverting">2.2. Creating inverted profile lists</a>
				1240	</span>
				1241	</dt>
				1242	</dl>
				1243	</dd>
				1244	<dt>
				1245	<span class="sect1">
				1246	<a href="#generating-profile-data">3. Generating profile data</a>
				1247	</span>
				1248	</dt>
				1249	<dd>
				1250	<dl>
				1251	<dt>
				1252	<span class="sect2">
				1253	<a href="#bfd">3.1. Processing the binary image</a>
				1254	</span>
				1255	</dt>
				1256	<dt>
				1257	<span class="sect2">
				1258	<a href="#processing-sample-files">3.2. Processing the sample files</a>
				1259	</span>
				1260	</dt>
				1261	</dl>
				1262	</dd>
				1263	<dt>
				1264	<span class="sect1">
				1265	<a href="#generating-output">4. Generating output</a>
				1266	</span>
				1267	</dt>
				1268	</dl>
				1269	</div>
				1270	<p>
				1271	All of the tools used to generate human-readable output have to take
				1272	roughly the same steps to collect the data for processing. First, the
				1273	profile specification given by the user has to be parsed. Next, a list
				1274	of sample files matching the specification has to be obtained. Using this
				1275	list, we need to locate the binary file for each sample file, and then
				1276	use them to extract meaningful data, before a final collation and
				1277	presentation to the user.
				1278	</p>
				1279	<div class="sect1" lang="en" xml:lang="en">
				1280	<div class="titlepage">
				1281	<div>
				1282	<div>
				1283	<h2 class="title" style="clear: both"><a id="profile-specification"></a>1. Handling the profile specification</h2>
				1284	</div>
				1285	</div>
				1286	</div>
				1287	<p>
				1288	The profile specification presented by the user is parsed in
				1289	the function <code class="function">profile_spec::create()</code>. This
				1290	creates an object representing the specification. Then we
				1291	use <code class="function">profile_spec::generate_file_list()</code>
				1292	to search for all sample files and match them against the
				1293	<code class="varname">profile_spec</code>.
				1294	</p>
				1295	<p>
				1296	To enable this matching process to work, the attributes of
				1297	each sample file are encoded in its filename. This is a low-tech
				1298	approach to matching specifications against candidate sample
				1299	files, but it works reasonably well. Typical sample file names
				1300	might look like these:
				1301	</p>
				1302	<table xmlns="" border="0" style="background: #E0E0E0;" width="90%">
				1303	<tr>
				1304	<td>
				1305	<pre class="screen">
				1306	/var/lib/oprofile/samples/current/{root}/bin/ls/{dep}/{root}/bin/ls/{cg}/{root}/bin/ls/CPU_CLK_UNHALTED.100000.0.all.all.all
				1307	/var/lib/oprofile/samples/current/{root}/bin/ls/{dep}/{root}/bin/ls/CPU_CLK_UNHALTED.100000.0.all.all.all
				1308	/var/lib/oprofile/samples/current/{root}/bin/ls/{dep}/{root}/bin/ls/CPU_CLK_UNHALTED.100000.0.7423.7424.0
				1309	/var/lib/oprofile/samples/current/{kern}/r128/{dep}/{kern}/r128/CPU_CLK_UNHALTED.100000.0.all.all.all
				1310	</pre>
				1311	</td>
				1312	</tr>
				1313	</table>
				1314	<p>
				1315	This looks unnecessarily complex, but it's actually fairly simple. First
				1316	we have the session of the sample, here
				1317	<code class="filename">/var/lib/oprofile/samples/current</code>. This could
				1318	equally well be inside an archive from <span><strong class="command">oparchive</strong></span>.
				1319	Next we have one of the tokens <code class="filename">{root}</code> or
				1320	<code class="filename">{kern}</code>. <code class="filename">{root}</code> indicates
				1321	that the binary is found on a file system, and we will encode its path
				1322	in the next section (e.g. <code class="filename">/bin/ls</code>).
				1323	<code class="filename">{kern}</code> indicates a kernel module - on 2.6 kernels
				1324	the path information is not available from the kernel, so we have to
				1325	special-case kernel modules like this; we encode merely the name of the
				1326	module as loaded.
				1327	</p>
				1328	<p>
				1329	Next there is a <code class="filename">{dep}</code> token, indicating another
				1330	token/path which identifies the dependent binary image. This is used even for
				1331	the "primary" binary (i.e. the one that was
				1332	<code class="function">execve()</code>d), as it simplifies processing. Finally,
				1333	if this sample file is a normal flat profile, the actual file is next in
				1334	the path. If it's a call-graph sample file, we need one further
				1335	specification, to allow us to identify cross-binary arcs in the call
				1336	graph.
				1337	</p>
				1338	<p>
				1339	The actual sample file name is dot-separated, where the fields are, in
				1340	order: event name, event count, unit mask, task group ID, task ID, and
				1341	CPU number.
				1342	</p>
				1343	<p>
				1344	This sample file can be reliably parsed (with
				1345	<code class="function">parse_filename()</code>) into a
				1346	<code class="varname">filename_spec</code>. Finally, we can check whether to
				1347	include the sample file in the final results by comparing this
				1348	<code class="varname">filename_spec</code> against the
				1349	<code class="varname">profile_spec</code> the user specified (for the interested,
				1350	see <code class="function">valid_candidate()</code> and
				1351	<code class="function">profile_spec::match</code>). Then comes the really
				1352	complicated bit...
				1353	</p>
				1354	</div>
				1355	<div class="sect1" lang="en" xml:lang="en">
				1356	<div class="titlepage">
				1357	<div>
				1358	<div>
				1359	<h2 class="title" style="clear: both"><a id="sample-file-collating"></a>2. Collating the candidate sample files</h2>
				1360	</div>
				1361	</div>
				1362	</div>
				1363	<p>
				1364	At this point we have a duplicate-free list of sample files we need
				1365	to process. But first we need to do some further arrangement: we
				1366	need to classify each sample file, and we may also need to "invert"
				1367	the profiles.
				1368	</p>
				1369	<div class="sect2" lang="en" xml:lang="en">
				1370	<div class="titlepage">
				1371	<div>
				1372	<div>
				1373	<h3 class="title"><a id="sample-file-classifying"></a>2.1. Classifying sample files</h3>
				1374	</div>
				1375	</div>
				1376	</div>
				1377	<p>
				1378	It's possible for utilities like <span><strong class="command">opreport</strong></span> to show
				1379	data in columnar format: for example, we might want to show the results
				1380	of two threads within a process side-by-side. To do this, we need
				1381	to classify each sample file into classes - the classes correspond
				1382	with each <span><strong class="command">opreport</strong></span> column. The function that handles
				1383	this is <code class="function">arrange_profiles()</code>. Each sample file
				1384	is added to a particular class. If the sample file is the first in
				1385	its class, a template is generated from the sample file. Each template
				1386	describes a particular class (thus, in our example above, each template
				1387	will have a different thread ID, and this uniquely identifies each
				1388	class).
				1389	</p>
				1390	<p>
				1391	Each class has a list of "profile sets" matching that class's template.
				1392	A profile set is either a profile of the primary binary image, or any of
				1393	its dependent images. After all sample files have been listed in one of
				1394	the profile sets belonging to the classes, we have to name each class and
				1395	perform error-checking. This is done by
				1396	<code class="function">identify_classes()</code>; each class is checked to ensure
				1397	that its "axis" is the same as all the others. This is needed because
				1398	<span><strong class="command">opreport</strong></span> can't produce results in 3D format: we can
				1399	only differ in one aspect, such as thread ID or event name.
				1400	</p>
				1401	</div>
				1402	<div class="sect2" lang="en" xml:lang="en">
				1403	<div class="titlepage">
				1404	<div>
				1405	<div>
				1406	<h3 class="title"><a id="sample-file-inverting"></a>2.2. Creating inverted profile lists</h3>
				1407	</div>
				1408	</div>
				1409	</div>
				1410	<p>
				1411	Remember that if we're using certain profile separation options, such as
				1412	"--separate=lib", a single binary could be a dependent image to many
				1413	different binaries. For example, the C library image would be a
				1414	dependent image for most programs that have been profiled. As it
				1415	happens, this can cause severe performance problems: without some
				1416	re-arrangement, these dependent binary images would be opened each
				1417	time we need to process sample files for each program.
				1418	</p>
				1419	<p>
				1420	The solution is to "invert" the profiles via
				1421	<code class="function">invert_profiles()</code>. We create a new data structure
				1422	where the dependent binary is first, and the primary binary images using
				1423	that dependent binary are listed as sub-images. This helps our
				1424	performance problem, as now we only need to open each dependent image
				1425	once, when we process the list of inverted profiles.
				1426	</p>
				1427	</div>
				1428	</div>
				1429	<div class="sect1" lang="en" xml:lang="en">
				1430	<div class="titlepage">
				1431	<div>
				1432	<div>
				1433	<h2 class="title" style="clear: both"><a id="generating-profile-data"></a>3. Generating profile data</h2>
				1434	</div>
				1435	</div>
				1436	</div>
				1437	<p>
				1438	Things don't get any simpler at this point, unfortunately. At this point
				1439	we've collected and classified the sample files into the set of inverted
				1440	profiles, as described in the previous section. Now we need to process
				1441	each inverted profile and make something of the data. The entry point
				1442	for this is <code class="function">populate_for_image()</code>.
				1443	</p>
				1444	<div class="sect2" lang="en" xml:lang="en">
				1445	<div class="titlepage">
				1446	<div>
				1447	<div>
				1448	<h3 class="title"><a id="bfd"></a>3.1. Processing the binary image</h3>
				1449	</div>
				1450	</div>
				1451	</div>
				1452	<p>
				1453	The first thing we do with an inverted profile is attempt to open the
				1454	binary image (remember each inverted profile set is only for one binary
				1455	image, but may have many sample files to process). The
				1456	<code class="varname">op_bfd</code> class provides an abstracted interface to
				1457	this; internally it uses <code class="filename">libbfd</code>. The main purpose
				1458	of this class is to process the symbols for the binary image; this is
				1459	also where symbol filtering happens. This is actually quite tricky, but
				1460	should be clear from the source.
				1461	</p>
				1462	</div>
				1463	<div class="sect2" lang="en" xml:lang="en">
				1464	<div class="titlepage">
				1465	<div>
				1466	<div>
				1467	<h3 class="title"><a id="processing-sample-files"></a>3.2. Processing the sample files</h3>
				1468	</div>
				1469	</div>
				1470	</div>
				1471	<p>
				1472	The class <code class="varname">profile_container</code> is a hold-all that
				1473	contains all the processed results. It is a container of
				1474	<code class="varname">profile_t</code> objects. The
				1475	<code class="function">add_sample_files()</code> method uses
				1476	<code class="filename">libdb</code> to open the given sample file and add the
				1477	key/value types to the <code class="varname">profile_t</code>. Once this has been
				1478	done, <code class="function">profile_container::add()</code> is passed the
				1479	<code class="varname">profile_t</code> plus the <code class="varname">op_bfd</code> for
				1480	processing.
				1481	</p>
				1482	<p>
				1483	<code class="function">profile_container::add()</code> walks through the symbols
				1484	collected in the <code class="varname">op_bfd</code>.
				1485	<code class="function">op_bfd::get_symbol_range()</code> gives us the start and
				1486	end of the symbol as offsets from the start of the binary image;
				1487	we then interrogate the <code class="varname">profile_t</code> for the relevant samples
				1488	for that offset range. We create a <code class="varname">symbol_entry</code>
				1489	object for this symbol and fill it in. If needed, here we also collect
				1490	debug information from the <code class="varname">op_bfd</code>, and possibly
				1491	record the detailed sample information (as used by <span><strong class="command">opreport
				1492	-d</strong></span> and <span><strong class="command">opannotate</strong></span>).
				1493	Finally the <code class="varname">symbol_entry</code> is added to
				1494	a private container of <code class="varname">profile_container</code> - this
				1495	<code class="varname">symbol_container</code> holds all such processed symbols.
				1496	</p>
				1497	</div>
				1498	</div>
				1499	<div class="sect1" lang="en" xml:lang="en">
				1500	<div class="titlepage">
				1501	<div>
				1502	<div>
				1503	<h2 class="title" style="clear: both"><a id="generating-output"></a>4. Generating output</h2>
				1504	</div>
				1505	</div>
				1506	</div>
				1507	<p>
				1508	After the processing described in the previous section, we've now got
				1509	full details of what we need to output stored in the
				1510	<code class="varname">profile_container</code> on a symbol-by-symbol basis. To
				1511	produce output, we need to replay that data and format it suitably.
				1512	</p>
				1513	<p>
				1514	<span><strong class="command">opreport</strong></span> first asks the
				1515	<code class="varname">profile_container</code> for a
				1516	<code class="varname">symbol_collection</code> (this is also where thresholding
				1517	happens).
				1518	This is sorted, then an
				1519	<code class="varname">opreport_formatter</code> is initialised.
				1520	This object initialises a set of field formatters as requested. Then
				1521	<code class="function">opreport_formatter::output()</code> is called. This
				1522	iterates through the (sorted) <code class="varname">symbol_collection</code>;
				1523	for each entry, the selected fields (as set by the
				1524	<code class="varname">format_flags</code> options) are output by calling the
				1525	field formatters, with the <code class="varname">symbol_entry</code> passed in.
				1526	</p>
				1527	</div>
				1528	</div>
				1529	<div class="glossary">
				1530	<div class="titlepage">
				1531	<div>
				1532	<div>
				1533	<h2 class="title"><a id="glossary"></a>Glossary of OProfile source concepts and types</h2>
				1534	</div>
				1535	</div>
				1536	</div>
				1537	<dl>
				1538	<dt>application image</dt>
				1539	<dd>
				1540	<p>
				1541	The primary binary image used by an application. This is derived
				1542	from the kernel and corresponds to the binary started upon running
				1543	an application: for example, <code class="filename">/bin/bash</code>.
				1544	</p>
				1545	</dd>
				1546	<dt>binary image</dt>
				1547	<dd>
				1548	<p>
				1549	An ELF file containing executable code: this includes kernel modules,
				1550	the kernel itself (a.k.a. <code class="filename">vmlinux</code>), shared libraries,
				1551	and application binaries.
				1552	</p>
				1553	</dd>
				1554	<dt>dcookie</dt>
				1555	<dd>
				1556	<p>
				1557	Short for "dentry cookie". A unique ID that can be looked up to provide
				1558	the full path name of a binary image.
				1559	</p>
				1560	</dd>
				1561	<dt>dependent image</dt>
				1562	<dd>
				1563	<p>
				1564	A binary image that is dependent upon an application, used with
				1565	per-application separation. Most commonly, shared libraries. For example,
				1566	if <code class="filename">/bin/bash</code> is running and we take
				1567	some samples inside the C library itself due to <span><strong class="command">bash</strong></span>
				1568	calling library code, then the image <code class="filename">/lib/libc.so</code>
				1569	would be dependent upon <code class="filename">/bin/bash</code>.
				1570	</p>
				1571	</dd>
				1572	<dt>merging</dt>
				1573	<dd>
				1574	<p>
				1575	This refers to the ability to merge several distinct sample files
				1576	into one set of data at runtime, in the post-profiling tools. For example,
				1577	per-thread sample files can be merged into one set of data, because
				1578	they are compatible (i.e. the aggregation of the data is meaningful),
				1579	but it's not possible to merge sample files for two different events,
				1580	because there would be no useful meaning to the results.
				1581	</p>
				1582	</dd>
				1583	<dt>profile class</dt>
				1584	<dd>
				1585	<p>
				1586	A collection of profile data that has been collected under the same
				1587	class template. For example, if we're using <span><strong class="command">opreport</strong></span>
				1588	to show results after profiling with two performance counters enabled,
				1589	profiling <code class="constant">DATA_MEM_REFS</code> and <code class="constant">CPU_CLK_UNHALTED</code>,
				1590	there would be two profile classes, one for each event. Or if we're on
				1591	an SMP system and doing per-CPU profiling, and we request
				1592	<span><strong class="command">opreport</strong></span> to show results for each CPU side-by-side,
				1593	there would be a profile class for each CPU.
				1594	</p>
				1595	</dd>
				1596	<dt>profile specification</dt>
				1597	<dd>
				1598	<p>
				1599	The parameters the user passes to the post-profiling tools that limit
				1600	which sample files are used. This specification is matched against
				1601	the available sample files to generate a selection of profile data.
				1602	</p>
				1603	</dd>
				1604	<dt>profile template</dt>
				1605	<dd>
				1606	<p>
				1607	The parameters that define what goes in a particular profile class.
				1608	This includes a symbolic name (e.g. "cpu:1") and the code-usable
				1609	equivalent.
				1610	</p>
				1611	</dd>
				1612	</dl>
				1613	</div>
				1614	</div>
				1615	</body>
				1616	</html>