1/*
2 * block queue tracing application
3 *
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
6 *
7 * Rewrite to have a single thread per CPU (managing all devices on that CPU)
8 * Alan D. Brunelle <alan.brunelle@hp.com> - January 2009
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 */
25
26#include <errno.h>
27#include <stdarg.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <fcntl.h>
32#include <getopt.h>
33#include <sched.h>
34#include <unistd.h>
35#include <poll.h>
36#include <signal.h>
37#include <pthread.h>
38#include <locale.h>
39#include <sys/ioctl.h>
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/vfs.h>
43#include <sys/mman.h>
44#include <sys/param.h>
45#include <sys/time.h>
46#include <sys/resource.h>
47#include <sys/socket.h>
48#include <netinet/in.h>
49#include <arpa/inet.h>
50#include <netdb.h>
51#include <sys/sendfile.h>
52
53#include "btt/list.h"
54#include "blktrace.h"
55
56/*
57 * You may want to increase this even more, if you are logging at a high
58 * rate and see skipped/missed events
59 */
60#define BUF_SIZE (512 * 1024)
61#define BUF_NR (4)
62
63#define FILE_VBUF_SIZE (128 * 1024)
64
65#define DEBUGFS_TYPE (0x64626720)
66#define TRACE_NET_PORT (8462)
67
68enum {
69 Net_none = 0,
70 Net_server,
71 Net_client,
72};
73
74enum thread_status {
75 Th_running,
76 Th_leaving,
77 Th_error
78};
79
80/*
81 * Generic stats collected: nevents can be _roughly_ estimated by data_read
82 * (discounting pdu...)
83 *
84 * These fields are updated w/ pdc_dr_update & pdc_nev_update below.
85 */
86struct pdc_stats {
87 unsigned long long data_read;
88 unsigned long long nevents;
89};
90
91struct devpath {
92 struct list_head head;
93 char *path; /* path to device special file */
94 char *buts_name; /* name returned from bt kernel code */
95 struct pdc_stats *stats;
96 int fd, ncpus;
97 unsigned long long drops;
98
99 /*
100 * For piped output only:
101 *
102 * Each tracer will have a tracer_devpath_head that it will add new
103 * data onto. Its list is protected below (tracer_devpath_head.mutex)
104 * and it will signal the processing thread using the dp_cond,
105 * dp_mutex & dp_entries variables below.
106 */
107 struct tracer_devpath_head *heads;
108
109 /*
110 * For network server mode only:
111 */
112 struct cl_host *ch;
113 u32 cl_id;
114 time_t cl_connect_time;
115 struct io_info *ios;
116};
117
118/*
119 * For piped output to stdout we will have each tracer thread (one per CPU)
120 * tack buffers read from the relay queues on a per-device list.
121 *
122 * The main thread will then collect trace buffers from each of these lists in turn.
123 *
124 * We will use a mutex to guard each of the trace_buf lists. The tracers
125 * can then signal the main thread using <dp_cond,dp_mutex> and
126 * dp_entries. (When a tracer adds an entry and dp_entries was 0, it will
127 * signal. When dp_entries is 0, the main thread will wait for that condition
128 * to be signalled.)
129 *
130 * adb: It may be better just to have a large buffer per tracer per dev,
131 * and then use it as a ring-buffer. This would certainly cut down a lot
132 * of malloc/free thrashing, at the cost of more memory movements (potentially).
133 */
134struct trace_buf {
135 struct list_head head;
136 struct devpath *dpp;
137 void *buf;
138 int cpu, len;
139};
140
141struct tracer_devpath_head {
142 pthread_mutex_t mutex;
143 struct list_head head;
144 struct trace_buf *prev;
145};
146
147/*
148 * Used to handle the mmap() interfaces for output file (containing traces)
149 */
150struct mmap_info {
151 void *fs_buf;
152 unsigned long long fs_size, fs_max_size, fs_off, fs_buf_len;
153 unsigned long buf_size, buf_nr;
154 int pagesize;
155};
156
157/*
158 * Each thread doing work on a (client) side of blktrace will have one
159 * of these. The ios array contains input/output information, pfds holds
160 * poll() data. The volatile fields provide flags to/from the main executing
161 * thread.
162 */
163struct tracer {
164 struct list_head head;
165 struct io_info *ios;
166 struct pollfd *pfds;
167 pthread_t thread;
168 int cpu, nios;
169 volatile int status, is_done;
170};
171
172/*
173 * Networking stuff follows. We include a magic number so we know whether
174 * to convert endianness or not.
175 *
176 * The len field is overloaded:
177 * 0 - Indicates an "open" - allowing the server to set up for a dev/cpu
178 * 1 - Indicates a "close" - shut down the connection in an orderly manner
179 *
180 * The cpu field is overloaded on close: it will contain the number of drops.
181 */
182struct blktrace_net_hdr {
183 u32 magic; /* same as trace magic */
184 char buts_name[32]; /* trace name */
185 u32 cpu; /* for which cpu */
186 u32 max_cpus;
187 u32 len; /* length of following trace data */
188 u32 cl_id; /* id for set of client per-cpu connections */
189 u32 buf_size; /* client buf_size for this trace */
190 u32 buf_nr; /* client buf_nr for this trace */
191 u32 page_size; /* client page_size for this trace */
192};
193
194/*
195 * Each host encountered has one of these. The head is used to link this
196 * on to the network server's ch_list. Connections associated with this
197 * host are linked on conn_list, and any devices traced on that host
198 * are connected on the devpaths list.
199 */
200struct cl_host {
201 struct list_head head;
202 struct list_head conn_list;
203 struct list_head devpaths;
204 struct net_server_s *ns;
205 char *hostname;
206 struct in_addr cl_in_addr;
207 int connects, ndevs, cl_opens;
208};
209
210/*
211 * Each connection (client to server socket ('fd')) has one of these. A
212 * back reference to the host ('ch'), and lists headers (for the host
213 * list, and the network server conn_list) are also included.
214 */
215struct cl_conn {
216 struct list_head ch_head, ns_head;
217 struct cl_host *ch;
218 int fd, ncpus;
219 time_t connect_time;
220};
221
222/*
223 * The network server requires some poll structures to be maintained -
224 * one per connection currently on conn_list. The nchs/ch_list values
225 * are for each host connected to this server. The addr field is used
226 * for scratch as new connections are established.
227 */
228struct net_server_s {
229 struct list_head conn_list;
230 struct list_head ch_list;
231 struct pollfd *pfds;
232 int listen_fd, connects, nchs;
233 struct sockaddr_in addr;
234};
235
236/*
237 * This structure is (generically) used to provide information
238 * for a read-to-write set of values.
239 *
240 * ifn & ifd represent input information
241 *
242 * ofn, ofd, ofp, obuf & mmap_info are used for output file (optionally).
243 */
244struct io_info {
245 struct devpath *dpp;
246 FILE *ofp;
247 char *obuf;
248 struct cl_conn *nc; /* Server network connection */
249
250 /*
251 * mmap controlled output files
252 */
253 struct mmap_info mmap_info;
254
255 /*
256 * Client network fields
257 */
258 unsigned int ready;
259 unsigned long long data_queued;
260
261 /*
262 * Input/output file descriptors & names
263 */
264 int ifd, ofd;
265 char ifn[MAXPATHLEN + 64];
266 char ofn[MAXPATHLEN + 64];
267};
268
269static char blktrace_version[] = "2.0.0";
270
271/*
272 * Linkage to blktrace helper routines (trace conversions)
273 */
274int data_is_native = -1;
275
276static int ndevs;
277static int max_cpus;
278static int ncpus;
279static cpu_set_t *online_cpus;
280static int pagesize;
281static int act_mask = ~0U;
282static int kill_running_trace;
283static int stop_watch;
284static int piped_output;
285
286static char *debugfs_path = "/sys/kernel/debug";
287static char *output_name;
288static char *output_dir;
289
290static unsigned long buf_size = BUF_SIZE;
291static unsigned long buf_nr = BUF_NR;
292
293static FILE *pfp;
294
295static LIST_HEAD(devpaths);
296static LIST_HEAD(tracers);
297
298static volatile int done;
299
300/*
301 * tracer threads add entries, the main thread takes them off and processes
302 * them. These protect the dp_entries variable.
303 */
304static pthread_cond_t dp_cond = PTHREAD_COND_INITIALIZER;
305static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
306static volatile int dp_entries;
307
308/*
309 * These synchronize master / thread interactions.
310 */
311static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
312static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
313static volatile int nthreads_running;
314static volatile int nthreads_leaving;
315static volatile int nthreads_error;
316static volatile int tracers_run;
317
318/*
319 * network cmd line params
320 */
321static struct sockaddr_in hostname_addr;
322static char hostname[MAXHOSTNAMELEN];
323static int net_port = TRACE_NET_PORT;
324static int net_use_sendfile = 1;
325static int net_mode;
326static int *cl_fds;
327
328static int (*handle_pfds)(struct tracer *, int, int);
329static int (*handle_list)(struct tracer_devpath_head *, struct list_head *);
330
331#define S_OPTS "d:a:A:r:o:kw:vVb:n:D:lh:p:sI:"
332static struct option l_opts[] = {
333 {
334 .name = "dev",
335 .has_arg = required_argument,
336 .flag = NULL,
337 .val = 'd'
338 },
339 {
340 .name = "input-devs",
341 .has_arg = required_argument,
342 .flag = NULL,
343 .val = 'I'
344 },
345 {
346 .name = "act-mask",
347 .has_arg = required_argument,
348 .flag = NULL,
349 .val = 'a'
350 },
351 {
352 .name = "set-mask",
353 .has_arg = required_argument,
354 .flag = NULL,
355 .val = 'A'
356 },
357 {
358 .name = "relay",
359 .has_arg = required_argument,
360 .flag = NULL,
361 .val = 'r'
362 },
363 {
364 .name = "output",
365 .has_arg = required_argument,
366 .flag = NULL,
367 .val = 'o'
368 },
369 {
370 .name = "kill",
371 .has_arg = no_argument,
372 .flag = NULL,
373 .val = 'k'
374 },
375 {
376 .name = "stopwatch",
377 .has_arg = required_argument,
378 .flag = NULL,
379 .val = 'w'
380 },
381 {
382 .name = "version",
383 .has_arg = no_argument,
384 .flag = NULL,
385 .val = 'v'
386 },
387 {
388 .name = "version",
389 .has_arg = no_argument,
390 .flag = NULL,
391 .val = 'V'
392 },
393 {
394 .name = "buffer-size",
395 .has_arg = required_argument,
396 .flag = NULL,
397 .val = 'b'
398 },
399 {
400 .name = "num-sub-buffers",
401 .has_arg = required_argument,
402 .flag = NULL,
403 .val = 'n'
404 },
405 {
406 .name = "output-dir",
407 .has_arg = required_argument,
408 .flag = NULL,
409 .val = 'D'
410 },
411 {
412 .name = "listen",
413 .has_arg = no_argument,
414 .flag = NULL,
415 .val = 'l'
416 },
417 {
418 .name = "host",
419 .has_arg = required_argument,
420 .flag = NULL,
421 .val = 'h'
422 },
423 {
424 .name = "port",
425 .has_arg = required_argument,
426 .flag = NULL,
427 .val = 'p'
428 },
429 {
430 .name = "no-sendfile",
431 .has_arg = no_argument,
432 .flag = NULL,
433 .val = 's'
434 },
435 {
436 .name = NULL,
437 }
438};
439
440static char usage_str[] = "\n\n" \
441 "-d <dev> | --dev=<dev>\n" \
442 "[ -r <debugfs path> | --relay=<debugfs path> ]\n" \
443 "[ -o <file> | --output=<file>]\n" \
444 "[ -D <dir> | --output-dir=<dir>\n" \
445 "[ -w <time> | --stopwatch=<time>]\n" \
446 "[ -a <action field> | --act-mask=<action field>]\n" \
447 "[ -A <action mask> | --set-mask=<action mask>]\n" \
448 "[ -b <size> | --buffer-size]\n" \
449 "[ -n <number> | --num-sub-buffers=<number>]\n" \
450 "[ -l | --listen]\n" \
451 "[ -h <hostname> | --host=<hostname>]\n" \
452 "[ -p <port number> | --port=<port number>]\n" \
453 "[ -s | --no-sendfile]\n" \
454 "[ -I <devs file> | --input-devs=<devs file>]\n" \
455 "[ -v <version> | --version]\n" \
456 "[ -V <version> | --version]\n" \
457
458 "\t-d Use specified device. May also be given last after options\n" \
459 "\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
460 "\t-o File(s) to send output to\n" \
461 "\t-D Directory to prepend to output file names\n" \
462 "\t-w Stop after defined time, in seconds\n" \
463 "\t-a Only trace specified actions. See documentation\n" \
464 "\t-A Give trace mask as a single value. See documentation\n" \
465 "\t-b Sub buffer size in KiB (default 512)\n" \
466 "\t-n Number of sub buffers (default 4)\n" \
467 "\t-l Run in network listen mode (blktrace server)\n" \
468 "\t-h Run in network client mode, connecting to the given host\n" \
469 "\t-p Network port to use (default 8462)\n" \
470 "\t-s Make the network client NOT use sendfile() to transfer data\n" \
471 "\t-I Add devices found in <devs file>\n" \
472 "\t-v Print program version info\n" \
473 "\t-V Print program version info\n\n";
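/*
 * Example invocations (the device name is illustrative):
 *
 *   blktrace -d /dev/sda -o sda          trace to ./sda.blktrace.<cpu>
 *   blktrace -d /dev/sda -w 30           stop automatically after 30 seconds
 *   blktrace -l                          run as a network server
 *   blktrace -d /dev/sda -h <server>     send traces to a remote blktrace -l
 */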
474
475static void clear_events(struct pollfd *pfd)
476{
477 pfd->events = 0;
478 pfd->revents = 0;
479}
480
481static inline int net_client_use_sendfile(void)
482{
483 return net_mode == Net_client && net_use_sendfile;
484}
485
486static inline int net_client_use_send(void)
487{
488 return net_mode == Net_client && !net_use_sendfile;
489}
490
491static inline int use_tracer_devpaths(void)
492{
493 return piped_output || net_client_use_send();
494}
495
496static inline int in_addr_eq(struct in_addr a, struct in_addr b)
497{
498 return a.s_addr == b.s_addr;
499}
500
501static inline void pdc_dr_update(struct devpath *dpp, int cpu, int data_read)
502{
503 dpp->stats[cpu].data_read += data_read;
504}
505
506static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
507{
508 dpp->stats[cpu].nevents += nevents;
509}
510
511static void show_usage(char *prog)
512{
513 fprintf(stderr, "Usage: %s %s", prog, usage_str);
514}
515
516/*
517 * Create a timespec 'msec' milliseconds into the future
518 */
519static inline void make_timespec(struct timespec *tsp, long delta_msec)
520{
521 struct timeval now;
522
523 gettimeofday(&now, NULL);
524 tsp->tv_sec = now.tv_sec;
525 tsp->tv_nsec = 1000L * now.tv_usec;
526
527 tsp->tv_nsec += (delta_msec * 1000000L);
528 if (tsp->tv_nsec > 1000000000L) {
529 long secs = tsp->tv_nsec / 1000000000L;
530
531 tsp->tv_sec += secs;
532 tsp->tv_nsec -= (secs * 1000000000L);
533 }
534}
535
536/*
537 * Add a timer to ensure wait ends
538 */
539static void t_pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
540{
541 struct timespec ts;
542
543 make_timespec(&ts, 50);
544 pthread_cond_timedwait(cond, mutex, &ts);
545}
546
547static void unblock_tracers(void)
548{
549 pthread_mutex_lock(&mt_mutex);
550 tracers_run = 1;
551 pthread_cond_broadcast(&mt_cond);
552 pthread_mutex_unlock(&mt_mutex);
553}
554
555static void tracer_wait_unblock(struct tracer *tp)
556{
557 pthread_mutex_lock(&mt_mutex);
558 while (!tp->is_done && !tracers_run)
559 pthread_cond_wait(&mt_cond, &mt_mutex);
560 pthread_mutex_unlock(&mt_mutex);
561}
562
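/*
 * Report a tracer's state change to the main thread: bump the matching
 * nthreads_* counter under mt_mutex and signal mt_cond so that
 * wait_tracers_ready()/wait_tracers_leaving() can make progress.
 */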
563static void tracer_signal_ready(struct tracer *tp,
564 enum thread_status th_status,
565 int status)
566{
567 pthread_mutex_lock(&mt_mutex);
568 tp->status = status;
569
570 if (th_status == Th_running)
571 nthreads_running++;
572 else if (th_status == Th_error)
573 nthreads_error++;
574 else
575 nthreads_leaving++;
576
577 pthread_cond_signal(&mt_cond);
578 pthread_mutex_unlock(&mt_mutex);
579}
580
581static void wait_tracers_ready(int ncpus_started)
582{
583 pthread_mutex_lock(&mt_mutex);
584 while ((nthreads_running + nthreads_error) < ncpus_started)
585 t_pthread_cond_wait(&mt_cond, &mt_mutex);
586 pthread_mutex_unlock(&mt_mutex);
587}
588
589static void wait_tracers_leaving(void)
590{
591 pthread_mutex_lock(&mt_mutex);
592 while (nthreads_leaving < nthreads_running)
593 t_pthread_cond_wait(&mt_cond, &mt_mutex);
594 pthread_mutex_unlock(&mt_mutex);
595}
596
597static void init_mmap_info(struct mmap_info *mip)
598{
599 mip->buf_size = buf_size;
600 mip->buf_nr = buf_nr;
601 mip->pagesize = pagesize;
602}
603
604static void net_close_connection(int *fd)
605{
606 shutdown(*fd, SHUT_RDWR);
607 close(*fd);
608 *fd = -1;
609}
610
611static void dpp_free(struct devpath *dpp)
612{
613 if (dpp->stats)
614 free(dpp->stats);
615 if (dpp->ios)
616 free(dpp->ios);
617 if (dpp->path)
618 free(dpp->path);
619 if (dpp->buts_name)
620 free(dpp->buts_name);
621 free(dpp);
622}
623
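/*
 * Pin the calling thread to the given CPU. A dynamically sized cpu_set_t
 * is used so that systems with more CPUs than CPU_SETSIZE still work.
 */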
624static int lock_on_cpu(int cpu)
625{
626 cpu_set_t *cpu_mask;
627 size_t size;
628
629 cpu_mask = CPU_ALLOC(max_cpus);
630 size = CPU_ALLOC_SIZE(max_cpus);
631
632 CPU_ZERO_S(size, cpu_mask);
633 CPU_SET_S(cpu, size, cpu_mask);
634 if (sched_setaffinity(0, size, cpu_mask) < 0) {
635 CPU_FREE(cpu_mask);
636 return errno;
637 }
638
639 CPU_FREE(cpu_mask);
640 return 0;
641}
642
643static int increase_limit(int resource, rlim_t increase)
644{
645 struct rlimit rlim;
646 int save_errno = errno;
647
648 if (!getrlimit(resource, &rlim)) {
649 rlim.rlim_cur += increase;
650 if (rlim.rlim_cur >= rlim.rlim_max)
651 rlim.rlim_max = rlim.rlim_cur + increase;
652
653 if (!setrlimit(resource, &rlim))
654 return 1;
655 }
656
657 errno = save_errno;
658 return 0;
659}
660
661static int handle_open_failure(void)
662{
663 if (errno == ENFILE || errno == EMFILE)
664 return increase_limit(RLIMIT_NOFILE, 16);
665 return 0;
666}
667
668static int handle_mem_failure(size_t length)
669{
670 if (errno == ENFILE)
671 return handle_open_failure();
672 else if (errno == ENOMEM)
673 return increase_limit(RLIMIT_MEMLOCK, 2 * length);
674 return 0;
675}
676
677static FILE *my_fopen(const char *path, const char *mode)
678{
679 FILE *fp;
680
681 do {
682 fp = fopen(path, mode);
683 } while (fp == NULL && handle_open_failure());
684
685 return fp;
686}
687
688static int my_open(const char *path, int flags)
689{
690 int fd;
691
692 do {
693 fd = open(path, flags);
694 } while (fd < 0 && handle_open_failure());
695
696 return fd;
697}
698
699static int my_socket(int domain, int type, int protocol)
700{
701 int fd;
702
703 do {
704 fd = socket(domain, type, protocol);
705 } while (fd < 0 && handle_open_failure());
706
707 return fd;
708}
709
710static int my_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
711{
712 int fd;
713
714 do {
715 fd = accept(sockfd, addr, addrlen);
716 } while (fd < 0 && handle_open_failure());
717
718 return fd;
719}
720
721static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
722 off_t offset)
723{
724 void *new;
725
726 do {
727 new = mmap(addr, length, prot, flags, fd, offset);
728 } while (new == MAP_FAILED && handle_mem_failure(length));
729
730 return new;
731}
732
733static int my_mlock(struct tracer *tp,
734 const void *addr, size_t len)
735{
736 int ret, retry = 0;
737
738 do {
739 ret = mlock(addr, len);
740 if ((retry >= 10) && tp && tp->is_done)
741 break;
742 retry++;
743 } while (ret < 0 && handle_mem_failure(len));
744
745 return ret;
746}
747
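/*
 * Make sure the mmap'ed output window has room for at least 'maxlen' more
 * bytes: when the current window is exhausted, grow the file with
 * ftruncate(), map a fresh region and lock it into memory.
 */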
748static int setup_mmap(int fd, unsigned int maxlen,
749 struct mmap_info *mip,
750 struct tracer *tp)
751{
752 if (mip->fs_off + maxlen > mip->fs_buf_len) {
753 unsigned long nr = max(16, mip->buf_nr);
754
755 if (mip->fs_buf) {
756 munlock(mip->fs_buf, mip->fs_buf_len);
757 munmap(mip->fs_buf, mip->fs_buf_len);
758 mip->fs_buf = NULL;
759 }
760
761 mip->fs_off = mip->fs_size & (mip->pagesize - 1);
762 mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
763 mip->fs_max_size += mip->fs_buf_len;
764
765 if (ftruncate(fd, mip->fs_max_size) < 0) {
766 perror("setup_mmap: ftruncate");
767 return 1;
768 }
769
770 mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
771 MAP_SHARED, fd,
772 mip->fs_size - mip->fs_off);
773 if (mip->fs_buf == MAP_FAILED) {
774 perror("setup_mmap: mmap");
775 return 1;
776 }
777 if (my_mlock(tp, mip->fs_buf, mip->fs_buf_len) < 0) {
778 perror("setup_mlock: mlock");
779 return 1;
780 }
781 }
782
783 return 0;
784}
785
786static int __stop_trace(int fd)
787{
788 /*
789 * Should be stopped, don't complain if it isn't
790 */
791 ioctl(fd, BLKTRACESTOP);
792 return ioctl(fd, BLKTRACETEARDOWN);
793}
794
795static int write_data(char *buf, int len)
796{
797 int ret;
798
799rewrite:
800 ret = fwrite(buf, len, 1, pfp);
801 if (ferror(pfp) || ret != 1) {
802 if (errno == EINTR) {
803 clearerr(pfp);
804 goto rewrite;
805 }
806
807 if (!piped_output || (errno != EPIPE && errno != EBADF)) {
808 fprintf(stderr, "write(%d) failed: %d/%s\n",
809 len, errno, strerror(errno));
810 }
811 goto err;
812 }
813
814 fflush(pfp);
815 return 0;
816
817err:
818 clearerr(pfp);
819 return 1;
820}
821
822/*
823 * Returns the number of bytes read (successfully)
824 */
825static int __net_recv_data(int fd, void *buf, unsigned int len)
826{
827 unsigned int bytes_left = len;
828
829 while (bytes_left && !done) {
830 int ret = recv(fd, buf, bytes_left, MSG_WAITALL);
831
832 if (ret == 0)
833 break;
834 else if (ret < 0) {
835 if (errno == EAGAIN) {
836 usleep(50);
837 continue;
838 }
839 perror("server: net_recv_data: recv failed");
840 break;
841 } else {
842 buf += ret;
843 bytes_left -= ret;
844 }
845 }
846
847 return len - bytes_left;
848}
849
850static int net_recv_data(int fd, void *buf, unsigned int len)
851{
852 return __net_recv_data(fd, buf, len);
853}
854
855/*
856 * Returns number of bytes written
857 */
858static int net_send_data(int fd, void *buf, unsigned int buf_len)
859{
860 int ret;
861 unsigned int bytes_left = buf_len;
862
863 while (bytes_left) {
864 ret = send(fd, buf, bytes_left, 0);
865 if (ret < 0) {
866 perror("send");
867 break;
868 }
869
870 buf += ret;
871 bytes_left -= ret;
872 }
873
874 return buf_len - bytes_left;
875}
876
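/*
 * Send the per-(device, CPU) network header. 'len' is normally the number
 * of trace data bytes that follow, with the small values 0, 1 and 2
 * overloaded as open, close and ack messages (see blktrace_net_hdr above).
 */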
877static int net_send_header(int fd, int cpu, char *buts_name, int len)
878{
879 struct blktrace_net_hdr hdr;
880
881 memset(&hdr, 0, sizeof(hdr));
882
883 hdr.magic = BLK_IO_TRACE_MAGIC;
884 memset(hdr.buts_name, 0, sizeof(hdr.buts_name));
885 strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
886 hdr.buts_name[sizeof(hdr.buts_name) - 1] = '\0';
887 hdr.cpu = cpu;
888 hdr.max_cpus = max_cpus;
889 hdr.len = len;
890 hdr.cl_id = getpid();
891 hdr.buf_size = buf_size;
892 hdr.buf_nr = buf_nr;
893 hdr.page_size = pagesize;
894
895 return net_send_data(fd, &hdr, sizeof(hdr)) != sizeof(hdr);
896}
897
898static void net_send_open_close(int fd, int cpu, char *buts_name, int len)
899{
900 struct blktrace_net_hdr ret_hdr;
901
902 net_send_header(fd, cpu, buts_name, len);
903 net_recv_data(fd, &ret_hdr, sizeof(ret_hdr));
904}
905
906static void net_send_open(int fd, int cpu, char *buts_name)
907{
908 net_send_open_close(fd, cpu, buts_name, 0);
909}
910
911static void net_send_close(int fd, char *buts_name, int drops)
912{
913 /*
914 * Overload CPU w/ number of drops
915 *
916 * XXX: Need to clear/set done around call - done=1 (which
917 * is true here) stops reads from happening... :-(
918 */
919 done = 0;
920 net_send_open_close(fd, drops, buts_name, 1);
921 done = 1;
922}
923
924static void ack_open_close(int fd, char *buts_name)
925{
926 net_send_header(fd, 0, buts_name, 2);
927}
928
929static void net_send_drops(int fd)
930{
931 struct list_head *p;
932
933 __list_for_each(p, &devpaths) {
934 struct devpath *dpp = list_entry(p, struct devpath, head);
935
936 net_send_close(fd, dpp->buts_name, dpp->drops);
937 }
938}
939
940/*
941 * Returns:
942 * 0: "EOF"
943 * 1: OK
944 * -1: Error
945 */
946static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
947{
948 int bytes_read;
949 int fl = fcntl(nc->fd, F_GETFL);
950
951 fcntl(nc->fd, F_SETFL, fl | O_NONBLOCK);
952 bytes_read = __net_recv_data(nc->fd, bnh, sizeof(*bnh));
953 fcntl(nc->fd, F_SETFL, fl & ~O_NONBLOCK);
954
955 if (bytes_read == sizeof(*bnh))
956 return 1;
957 else if (bytes_read == 0)
958 return 0;
959 else
960 return -1;
961}
962
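/*
 * Resolve the -h argument into hostname_addr: accept a dotted-quad
 * address directly, otherwise retry gethostbyname() until it resolves or
 * fails permanently.
 */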
963static int net_setup_addr(void)
964{
965 struct sockaddr_in *addr = &hostname_addr;
966
967 memset(addr, 0, sizeof(*addr));
968 addr->sin_family = AF_INET;
969 addr->sin_port = htons(net_port);
970
971 if (inet_aton(hostname, &addr->sin_addr) != 1) {
972 struct hostent *hent;
973retry:
974 hent = gethostbyname(hostname);
975 if (!hent) {
976 if (h_errno == TRY_AGAIN) {
977 usleep(100);
978 goto retry;
979 } else if (h_errno == NO_RECOVERY) {
980 fprintf(stderr, "gethostbyname(%s)"
981 "non-recoverable error encountered\n",
982 hostname);
983 } else {
984 /*
985 * HOST_NOT_FOUND, NO_ADDRESS or NO_DATA
986 */
987 fprintf(stderr, "Host %s not found\n",
988 hostname);
989 }
990 return 1;
991 }
992
993 memcpy(&addr->sin_addr, hent->h_addr, 4);
994 memset(hostname, 0, sizeof(hostname));
995 strncpy(hostname, hent->h_name, sizeof(hostname));
996 hostname[sizeof(hostname) - 1] = '\0';
997 }
998
999 return 0;
1000}
1001
1002static int net_setup_client(void)
1003{
1004 int fd;
1005 struct sockaddr_in *addr = &hostname_addr;
1006
1007 fd = my_socket(AF_INET, SOCK_STREAM, 0);
1008 if (fd < 0) {
1009 perror("client: socket");
1010 return -1;
1011 }
1012
1013 if (connect(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0) {
1014 if (errno == ECONNREFUSED)
1015 fprintf(stderr,
1016 "\nclient: Connection to %s refused, "
1017 "perhaps the server is not started?\n\n",
1018 hostname);
1019 else
1020 perror("client: connect");
1021
1022 close(fd);
1023 return -1;
1024 }
1025
1026 return fd;
1027}
1028
1029static int open_client_connections(void)
1030{
1031 int cpu;
1032 size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
1033
1034 cl_fds = calloc(ncpus, sizeof(*cl_fds));
1035 for (cpu = 0; cpu < max_cpus; cpu++) {
1036 if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
1037 continue;
1038 cl_fds[cpu] = net_setup_client();
1039 if (cl_fds[cpu] < 0)
1040 goto err;
1041 }
1042 return 0;
1043
1044err:
1045 while (cpu > 0)
1046 close(cl_fds[cpu--]);
1047 free(cl_fds);
1048 return 1;
1049}
1050
1051static void close_client_connections(void)
1052{
1053 if (cl_fds) {
1054 int cpu, *fdp;
1055 size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
1056
1057 for (cpu = 0, fdp = cl_fds; cpu < max_cpus; cpu++, fdp++) {
1058 if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
1059 continue;
1060 if (*fdp >= 0) {
1061 net_send_drops(*fdp);
1062 net_close_connection(fdp);
1063 }
1064 }
1065 free(cl_fds);
1066 }
1067}
1068
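/*
 * Issue BLKTRACESETUP for every registered device and allocate its
 * per-CPU statistics; returns the number of devices that failed.
 */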
1069static int setup_buts(void)
1070{
1071 struct list_head *p;
1072 int ret = 0;
1073
1074 __list_for_each(p, &devpaths) {
1075 struct blk_user_trace_setup buts;
1076 struct devpath *dpp = list_entry(p, struct devpath, head);
1077
1078 memset(&buts, 0, sizeof(buts));
1079 buts.buf_size = buf_size;
1080 buts.buf_nr = buf_nr;
1081 buts.act_mask = act_mask;
1082
1083 if (ioctl(dpp->fd, BLKTRACESETUP, &buts) >= 0) {
1084 dpp->ncpus = max_cpus;
1085 dpp->buts_name = strdup(buts.name);
1086 if (dpp->stats)
1087 free(dpp->stats);
1088 dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
1089 memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
1090 } else {
1091 fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
1092 dpp->path, errno, strerror(errno));
1093 ret++;
1094 }
1095 }
1096
1097 return ret;
1098}
1099
1100static void start_buts(void)
1101{
1102 struct list_head *p;
1103
1104 __list_for_each(p, &devpaths) {
1105 struct devpath *dpp = list_entry(p, struct devpath, head);
1106
1107 if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
1108 fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
1109 dpp->path, errno, strerror(errno));
1110 }
1111 }
1112}
1113
1114static int get_drops(struct devpath *dpp)
1115{
1116 int fd, drops = 0;
1117 char fn[MAXPATHLEN + 64], tmp[256];
1118
1119 snprintf(fn, sizeof(fn), "%s/block/%s/dropped", debugfs_path,
1120 dpp->buts_name);
1121
1122 fd = my_open(fn, O_RDONLY);
1123 if (fd < 0) {
1124 /*
1125 * This may be ok: the kernel may not support
1126 * dropped counts.
1127 */
1128 if (errno != ENOENT)
1129 fprintf(stderr, "Could not open %s: %d/%s\n",
1130 fn, errno, strerror(errno));
1131 return 0;
1132 } else if (read(fd, tmp, sizeof(tmp)) < 0) {
1133 fprintf(stderr, "Could not read %s: %d/%s\n",
1134 fn, errno, strerror(errno));
1135 } else
1136 drops = atoi(tmp);
1137 close(fd);
1138
1139 return drops;
1140}
1141
1142static void get_all_drops(void)
1143{
1144 struct list_head *p;
1145
1146 __list_for_each(p, &devpaths) {
1147 struct devpath *dpp = list_entry(p, struct devpath, head);
1148
1149 dpp->drops = get_drops(dpp);
1150 }
1151}
1152
1153static inline struct trace_buf *alloc_trace_buf(int cpu, int bufsize)
1154{
1155 struct trace_buf *tbp;
1156
1157 tbp = malloc(sizeof(*tbp) + bufsize);
1158 INIT_LIST_HEAD(&tbp->head);
1159 tbp->len = 0;
1160 tbp->buf = (void *)(tbp + 1);
1161 tbp->cpu = cpu;
1162 tbp->dpp = NULL; /* Will be set when tbp is added */
1163
1164 return tbp;
1165}
1166
1167static void free_tracer_heads(struct devpath *dpp)
1168{
1169 int cpu;
1170 struct tracer_devpath_head *hd;
1171
1172 for (cpu = 0, hd = dpp->heads; cpu < max_cpus; cpu++, hd++) {
1173 if (hd->prev)
1174 free(hd->prev);
1175
1176 pthread_mutex_destroy(&hd->mutex);
1177 }
1178 free(dpp->heads);
1179}
1180
1181static int setup_tracer_devpaths(void)
1182{
1183 struct list_head *p;
1184
1185 if (net_client_use_send())
1186 if (open_client_connections())
1187 return 1;
1188
1189 __list_for_each(p, &devpaths) {
1190 int cpu;
1191 struct tracer_devpath_head *hd;
1192 struct devpath *dpp = list_entry(p, struct devpath, head);
1193
1194 dpp->heads = calloc(max_cpus, sizeof(struct tracer_devpath_head));
1195 for (cpu = 0, hd = dpp->heads; cpu < max_cpus; cpu++, hd++) {
1196 INIT_LIST_HEAD(&hd->head);
1197 pthread_mutex_init(&hd->mutex, NULL);
1198 hd->prev = NULL;
1199 }
1200 }
1201
1202 return 0;
1203}
1204
1205static inline void add_trace_buf(struct devpath *dpp, int cpu,
1206 struct trace_buf **tbpp)
1207{
1208 struct trace_buf *tbp = *tbpp;
1209 struct tracer_devpath_head *hd = &dpp->heads[cpu];
1210
1211 tbp->dpp = dpp;
1212
1213 pthread_mutex_lock(&hd->mutex);
1214 list_add_tail(&tbp->head, &hd->head);
1215 pthread_mutex_unlock(&hd->mutex);
1216
1217 *tbpp = alloc_trace_buf(cpu, buf_size);
1218}
1219
1220static inline void incr_entries(int entries_handled)
1221{
1222 pthread_mutex_lock(&dp_mutex);
1223 if (dp_entries == 0)
1224 pthread_cond_signal(&dp_cond);
1225 dp_entries += entries_handled;
1226 pthread_mutex_unlock(&dp_mutex);
1227}
1228
1229static void decr_entries(int handled)
1230{
1231 pthread_mutex_lock(&dp_mutex);
1232 dp_entries -= handled;
1233 pthread_mutex_unlock(&dp_mutex);
1234}
1235
1236static int wait_empty_entries(void)
1237{
1238 pthread_mutex_lock(&dp_mutex);
1239 while (!done && dp_entries == 0)
1240 t_pthread_cond_wait(&dp_cond, &dp_mutex);
1241 pthread_mutex_unlock(&dp_mutex);
1242
1243 return !done;
1244}
1245
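/*
 * Register a device for tracing: ignore duplicates, open the device to
 * validate the path, and queue a new devpath entry on the global list.
 */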
1246static int add_devpath(char *path)
1247{
1248 int fd;
1249 struct devpath *dpp;
1250 struct list_head *p;
1251
1252 /*
1253 * Verify device is not duplicated
1254 */
1255 __list_for_each(p, &devpaths) {
1256 struct devpath *tmp = list_entry(p, struct devpath, head);
1257 if (!strcmp(tmp->path, path))
1258 return 0;
1259 }
1260 /*
1261 * Verify device is valid before going too far
1262 */
1263 fd = my_open(path, O_RDONLY | O_NONBLOCK);
1264 if (fd < 0) {
1265 fprintf(stderr, "Invalid path %s specified: %d/%s\n",
1266 path, errno, strerror(errno));
1267 return 1;
1268 }
1269
1270 dpp = malloc(sizeof(*dpp));
1271 memset(dpp, 0, sizeof(*dpp));
1272 dpp->path = strdup(path);
1273 dpp->fd = fd;
1274 ndevs++;
1275 list_add_tail(&dpp->head, &devpaths);
1276
1277 return 0;
1278}
1279
1280static void rel_devpaths(void)
1281{
1282 struct list_head *p, *q;
1283
1284 list_for_each_safe(p, q, &devpaths) {
1285 struct devpath *dpp = list_entry(p, struct devpath, head);
1286
1287 list_del(&dpp->head);
1288 __stop_trace(dpp->fd);
1289 close(dpp->fd);
1290
1291 if (dpp->heads)
1292 free_tracer_heads(dpp);
1293
1294 dpp_free(dpp);
1295 ndevs--;
1296 }
1297}
1298
1299static int flush_subbuf_net(struct trace_buf *tbp)
1300{
1301 int fd = cl_fds[tbp->cpu];
1302 struct devpath *dpp = tbp->dpp;
1303
1304 if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
1305 return 1;
1306 else if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
1307 return 1;
1308
1309 return 0;
1310}
1311
1312static int
1313handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
1314 struct list_head *list)
1315{
1316 struct trace_buf *tbp;
1317 struct list_head *p, *q;
1318 int entries_handled = 0;
1319
1320 list_for_each_safe(p, q, list) {
1321 tbp = list_entry(p, struct trace_buf, head);
1322
1323 list_del(&tbp->head);
1324 entries_handled++;
1325
1326 if (cl_fds[tbp->cpu] >= 0) {
1327 if (flush_subbuf_net(tbp)) {
1328 close(cl_fds[tbp->cpu]);
1329 cl_fds[tbp->cpu] = -1;
1330 }
1331 }
1332
1333 free(tbp);
1334 }
1335
1336 return entries_handled;
1337}
1338
1339/*
1340 * Tack 'tbp's buf onto the tail of 'prev's buf
1341 */
1342static struct trace_buf *tb_combine(struct trace_buf *prev,
1343 struct trace_buf *tbp)
1344{
1345 unsigned long tot_len;
1346
1347 tot_len = prev->len + tbp->len;
1348 if (tot_len > buf_size) {
1349 /*
1350 * tbp->head isn't connected (it was 'prev'
1351 * so it had been taken off of the list
1352 * before). Therefore, we can realloc
1353 * the whole structures, as the other fields
1354 * are "static".
1355 */
1356 prev = realloc(prev, sizeof(*prev) + tot_len);
1357 prev->buf = (void *)(prev + 1);
1358 }
1359
1360 memcpy(prev->buf + prev->len, tbp->buf, tbp->len);
1361 prev->len = tot_len;
1362
1363 free(tbp);
1364 return prev;
1365}
1366
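/*
 * File/piped output path: for each queued trace_buf, append it to any
 * leftover partial data, write out every complete blk_io_trace record and
 * carry the remaining partial record over to the next pass via hd->prev.
 */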
1367static int handle_list_file(struct tracer_devpath_head *hd,
1368 struct list_head *list)
1369{
1370 int off, t_len, nevents;
1371 struct blk_io_trace *t;
1372 struct list_head *p, *q;
1373 int entries_handled = 0;
1374 struct trace_buf *tbp, *prev;
1375
1376 prev = hd->prev;
1377 list_for_each_safe(p, q, list) {
1378 tbp = list_entry(p, struct trace_buf, head);
1379 list_del(&tbp->head);
1380 entries_handled++;
1381
1382 /*
1383 * If there was some leftover before, tack this new
1384 * entry onto the tail of the previous one.
1385 */
1386 if (prev)
1387 tbp = tb_combine(prev, tbp);
1388
1389 /*
1390 * See how many whole traces there are - send them
1391 * all out in one go.
1392 */
1393 off = 0;
1394 nevents = 0;
1395 while (off + (int)sizeof(*t) <= tbp->len) {
1396 t = (struct blk_io_trace *)(tbp->buf + off);
1397 t_len = sizeof(*t) + t->pdu_len;
1398 if (off + t_len > tbp->len)
1399 break;
1400
1401 off += t_len;
1402 nevents++;
1403 }
1404 if (nevents)
1405 pdc_nev_update(tbp->dpp, tbp->cpu, nevents);
1406
1407 /*
1408 * Write any full set of traces, any remaining data is kept
1409 * for the next pass.
1410 */
1411 if (off) {
1412 if (write_data(tbp->buf, off) || off == tbp->len) {
1413 free(tbp);
1414 prev = NULL;
1415 }
1416 else {
1417 /*
1418 * Move valid data to beginning of buffer
1419 */
1420 tbp->len -= off;
1421 memmove(tbp->buf, tbp->buf + off, tbp->len);
1422 prev = tbp;
1423 }
1424 } else
1425 prev = tbp;
1426 }
1427 hd->prev = prev;
1428
1429 return entries_handled;
1430}
1431
1432static void __process_trace_bufs(void)
1433{
1434 int cpu;
1435 struct list_head *p;
1436 struct list_head list;
1437 int handled = 0;
1438
1439 __list_for_each(p, &devpaths) {
1440 struct devpath *dpp = list_entry(p, struct devpath, head);
1441 struct tracer_devpath_head *hd = dpp->heads;
1442
1443 for (cpu = 0; cpu < max_cpus; cpu++, hd++) {
1444 pthread_mutex_lock(&hd->mutex);
1445 if (list_empty(&hd->head)) {
1446 pthread_mutex_unlock(&hd->mutex);
1447 continue;
1448 }
1449
1450 list_replace_init(&hd->head, &list);
1451 pthread_mutex_unlock(&hd->mutex);
1452
1453 handled += handle_list(hd, &list);
1454 }
1455 }
1456
1457 if (handled)
1458 decr_entries(handled);
1459}
1460
1461static void process_trace_bufs(void)
1462{
1463 while (wait_empty_entries())
1464 __process_trace_bufs();
1465}
1466
1467static void clean_trace_bufs(void)
1468{
1469 /*
1470 * No mutex needed here: we're only reading from the lists,
1471 * tracers are done
1472 */
1473 while (dp_entries)
1474 __process_trace_bufs();
1475}
1476
1477static inline void read_err(int cpu, char *ifn)
1478{
1479 if (errno != EAGAIN)
1480 fprintf(stderr, "Thread %d failed read of %s: %d/%s\n",
1481 cpu, ifn, errno, strerror(errno));
1482}
1483
1484static int net_sendfile(struct io_info *iop)
1485{
1486 int ret;
1487
1488 ret = sendfile(iop->ofd, iop->ifd, NULL, iop->ready);
1489 if (ret < 0) {
1490 perror("sendfile");
1491 return 1;
1492 } else if (ret < (int)iop->ready) {
1493 fprintf(stderr, "short sendfile send (%d of %d)\n",
1494 ret, iop->ready);
1495 return 1;
1496 }
1497
1498 return 0;
1499}
1500
1501static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
1502{
1503 struct devpath *dpp = iop->dpp;
1504
1505 if (net_send_header(iop->ofd, tp->cpu, dpp->buts_name, iop->ready))
1506 return 1;
1507 return net_sendfile(iop);
1508}
1509
1510static int fill_ofname(char *dst, int dstlen, char *subdir, char *buts_name,
1511 int cpu)
1512{
1513 int len;
1514 struct stat sb;
1515
1516 if (output_dir)
1517 len = snprintf(dst, dstlen, "%s/", output_dir);
1518 else
1519 len = snprintf(dst, dstlen, "./");
1520
1521 if (subdir)
1522 len += snprintf(dst + len, dstlen - len, "%s", subdir);
1523
1524 if (stat(dst, &sb) < 0) {
1525 if (errno != ENOENT) {
1526 fprintf(stderr,
1527 "Destination dir %s stat failed: %d/%s\n",
1528 dst, errno, strerror(errno));
1529 return 1;
1530 }
1531 /*
1532 * There is no synchronization between multiple threads
1533 * trying to create the directory at once. It's harmless
1534 * to let them try, so just detect the problem and move on.
1535 */
1536 if (mkdir(dst, 0755) < 0 && errno != EEXIST) {
1537 fprintf(stderr,
1538 "Destination dir %s can't be made: %d/%s\n",
1539 dst, errno, strerror(errno));
1540 return 1;
1541 }
1542 }
1543
1544 if (output_name)
1545 snprintf(dst + len, dstlen - len, "%s.blktrace.%d",
1546 output_name, cpu);
1547 else
1548 snprintf(dst + len, dstlen - len, "%s.blktrace.%d",
1549 buts_name, cpu);
1550
1551 return 0;
1552}
1553
1554static int set_vbuf(struct io_info *iop, int mode, size_t size)
1555{
1556 iop->obuf = malloc(size);
1557 if (setvbuf(iop->ofp, iop->obuf, mode, size) < 0) {
1558 fprintf(stderr, "setvbuf(%s, %d) failed: %d/%s\n",
1559 iop->dpp->path, (int)size, errno,
1560 strerror(errno));
1561 free(iop->obuf);
1562 return 1;
1563 }
1564
1565 return 0;
1566}
1567
1568static int iop_open(struct io_info *iop, int cpu)
1569{
1570 char hostdir[MAXPATHLEN + 64];
1571
1572 iop->ofd = -1;
1573 if (net_mode == Net_server) {
1574 struct cl_conn *nc = iop->nc;
1575 int len;
1576
1577 len = snprintf(hostdir, sizeof(hostdir), "%s-",
1578 nc->ch->hostname);
1579 len += strftime(hostdir + len, sizeof(hostdir) - len, "%F-%T/",
1580 gmtime(&iop->dpp->cl_connect_time));
1581 } else {
1582 hostdir[0] = 0;
1583 }
1584
1585 if (fill_ofname(iop->ofn, sizeof(iop->ofn), hostdir,
1586 iop->dpp->buts_name, cpu))
1587 return 1;
1588
1589 iop->ofp = my_fopen(iop->ofn, "w+");
1590 if (iop->ofp == NULL) {
1591 fprintf(stderr, "Open output file %s failed: %d/%s\n",
1592 iop->ofn, errno, strerror(errno));
1593 return 1;
1594 }
1595
1596 if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
1597 fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
1598 iop->ofn, errno, strerror(errno));
1599 fclose(iop->ofp);
1600 return 1;
1601 }
1602
1603 iop->ofd = fileno(iop->ofp);
1604 return 0;
1605}
1606
1607static void close_iop(struct io_info *iop)
1608{
1609 struct mmap_info *mip = &iop->mmap_info;
1610
1611 if (mip->fs_buf)
1612 munmap(mip->fs_buf, mip->fs_buf_len);
1613
1614 if (!piped_output) {
1615 if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
1616 fprintf(stderr,
1617 "Ignoring err: ftruncate(%s): %d/%s\n",
1618 iop->ofn, errno, strerror(errno));
1619 }
1620 }
1621
1622 if (iop->ofp)
1623 fclose(iop->ofp);
1624 if (iop->obuf)
1625 free(iop->obuf);
1626}
1627
1628static void close_ios(struct tracer *tp)
1629{
1630 while (tp->nios > 0) {
1631 struct io_info *iop = &tp->ios[--tp->nios];
1632
1633 iop->dpp->drops = get_drops(iop->dpp);
1634 if (iop->ifd >= 0)
1635 close(iop->ifd);
1636
1637 if (iop->ofp)
1638 close_iop(iop);
1639 else if (iop->ofd >= 0) {
1640 struct devpath *dpp = iop->dpp;
1641
1642 net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
1643 net_close_connection(&iop->ofd);
1644 }
1645 }
1646
1647 free(tp->ios);
1648 free(tp->pfds);
1649}
1650
1651static int open_ios(struct tracer *tp)
1652{
1653 struct pollfd *pfd;
1654 struct io_info *iop;
1655 struct list_head *p;
1656
1657 tp->ios = calloc(ndevs, sizeof(struct io_info));
1658 memset(tp->ios, 0, ndevs * sizeof(struct io_info));
1659
1660 tp->pfds = calloc(ndevs, sizeof(struct pollfd));
1661 memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));
1662
1663 tp->nios = 0;
1664 iop = tp->ios;
1665 pfd = tp->pfds;
1666 __list_for_each(p, &devpaths) {
1667 struct devpath *dpp = list_entry(p, struct devpath, head);
1668
1669 iop->dpp = dpp;
1670 iop->ofd = -1;
1671 snprintf(iop->ifn, sizeof(iop->ifn), "%s/block/%s/trace%d",
1672 debugfs_path, dpp->buts_name, tp->cpu);
1673
1674 iop->ifd = my_open(iop->ifn, O_RDONLY | O_NONBLOCK);
1675 if (iop->ifd < 0) {
1676 fprintf(stderr, "Thread %d failed open %s: %d/%s\n",
1677 tp->cpu, iop->ifn, errno, strerror(errno));
1678 return 1;
1679 }
1680
1681 init_mmap_info(&iop->mmap_info);
1682
1683 pfd->fd = iop->ifd;
1684 pfd->events = POLLIN;
1685
1686 if (piped_output)
1687 ;
1688 else if (net_client_use_sendfile()) {
1689 iop->ofd = net_setup_client();
1690 if (iop->ofd < 0)
1691 goto err;
1692 net_send_open(iop->ofd, tp->cpu, dpp->buts_name);
1693 } else if (net_mode == Net_none) {
1694 if (iop_open(iop, tp->cpu))
1695 goto err;
1696 } else {
1697 /*
1698 * This ensures that the server knows about all
1699 * connections & devices before _any_ closes
1700 */
1701 net_send_open(cl_fds[tp->cpu], tp->cpu, dpp->buts_name);
1702 }
1703
1704 pfd++;
1705 iop++;
1706 tp->nios++;
1707 }
1708
1709 return 0;
1710
1711err:
1712 close(iop->ifd); /* tp->nios _not_ bumped */
1713 close_ios(tp);
1714 return 1;
1715}
1716
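/*
 * Poll handler for local file output: read relay data for each ready
 * device directly into the mmap'ed output file window.
 */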
1717static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
1718{
1719 struct mmap_info *mip;
1720 int i, ret, nentries = 0;
1721 struct pollfd *pfd = tp->pfds;
1722 struct io_info *iop = tp->ios;
1723
1724 for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++) {
1725 if (pfd->revents & POLLIN || force_read) {
1726 mip = &iop->mmap_info;
1727
1728 ret = setup_mmap(iop->ofd, buf_size, mip, tp);
1729 if (ret < 0) {
1730 pfd->events = 0;
1731 break;
1732 }
1733
1734 ret = read(iop->ifd, mip->fs_buf + mip->fs_off,
1735 buf_size);
1736 if (ret > 0) {
1737 pdc_dr_update(iop->dpp, tp->cpu, ret);
1738 mip->fs_size += ret;
1739 mip->fs_off += ret;
1740 nentries++;
1741 } else if (ret == 0) {
1742 /*
1743 * Short reads after we're done stop us
1744 * from trying reads.
1745 */
1746 if (tp->is_done)
1747 clear_events(pfd);
1748 } else {
1749 read_err(tp->cpu, iop->ifn);
1750 if (errno != EAGAIN || tp->is_done)
1751 clear_events(pfd);
1752 }
1753 nevs--;
1754 }
1755 }
1756
1757 return nentries;
1758}
1759
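/*
 * Poll handler for the sendfile() network client: use fstat() to find out
 * how much new relay data is available and ship exactly that much to the
 * server with sendfile().
 */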
1760static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
1761{
1762 struct stat sb;
1763 int i, nentries = 0;
1764 struct pollfd *pfd = tp->pfds;
1765 struct io_info *iop = tp->ios;
1766
1767 for (i = 0; i < ndevs; i++, pfd++, iop++) {
1768 if (pfd->revents & POLLIN || force_read) {
1769 if (fstat(iop->ifd, &sb) < 0) {
1770 perror(iop->ifn);
1771 pfd->events = 0;
1772 } else if (sb.st_size > (off_t)iop->data_queued) {
1773 iop->ready = sb.st_size - iop->data_queued;
1774 iop->data_queued = sb.st_size;
1775
1776 if (!net_sendfile_data(tp, iop)) {
1777 pdc_dr_update(iop->dpp, tp->cpu,
1778 iop->ready);
1779 nentries++;
1780 } else
1781 clear_events(pfd);
1782 }
1783 if (--nevs == 0)
1784 break;
1785 }
1786 }
1787
1788 if (nentries)
1789 incr_entries(nentries);
1790
1791 return nentries;
1792}
1793
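/*
 * Poll handler for piped output and the send() network client: read relay
 * data into freshly allocated trace_bufs and queue them for the main
 * processing thread.
 */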
1794static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
1795{
1796 int i, nentries = 0;
1797 struct trace_buf *tbp;
1798 struct pollfd *pfd = tp->pfds;
1799 struct io_info *iop = tp->ios;
1800
1801 tbp = alloc_trace_buf(tp->cpu, buf_size);
1802 for (i = 0; i < ndevs; i++, pfd++, iop++) {
1803 if (pfd->revents & POLLIN || force_read) {
1804 tbp->len = read(iop->ifd, tbp->buf, buf_size);
1805 if (tbp->len > 0) {
1806 pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
1807 add_trace_buf(iop->dpp, tp->cpu, &tbp);
1808 nentries++;
1809 } else if (tbp->len == 0) {
1810 /*
1811 * Short reads after we're done stop us
1812 * from trying reads.
1813 */
1814 if (tp->is_done)
1815 clear_events(pfd);
1816 } else {
1817 read_err(tp->cpu, iop->ifn);
1818 if (errno != EAGAIN || tp->is_done)
1819 clear_events(pfd);
1820 }
1821 if (!piped_output && --nevs == 0)
1822 break;
1823 }
1824 }
1825 free(tbp);
1826
1827 if (nentries)
1828 incr_entries(nentries);
1829
1830 return nentries;
1831}
1832
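/*
 * Per-CPU tracer thread: pin itself to its CPU, open the per-device relay
 * files, then poll and drain them until told to stop; a final forced pass
 * pulls in whatever data is left.
 */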
1833static void *thread_main(void *arg)
1834{
1835 int ret, ndone, to_val;
1836 struct tracer *tp = arg;
1837
1838 ret = lock_on_cpu(tp->cpu);
1839 if (ret)
1840 goto err;
1841
1842 ret = open_ios(tp);
1843 if (ret)
1844 goto err;
1845
1846 if (piped_output)
1847 to_val = 50; /* Frequent partial handles */
1848 else
1849 to_val = 500; /* 1/2 second intervals */
1850
1851
1852 tracer_signal_ready(tp, Th_running, 0);
1853 tracer_wait_unblock(tp);
1854
1855 while (!tp->is_done) {
1856 ndone = poll(tp->pfds, ndevs, to_val);
1857 if (ndone || piped_output)
1858 (void)handle_pfds(tp, ndone, piped_output);
1859 else if (ndone < 0 && errno != EINTR)
1860 fprintf(stderr, "Thread %d poll failed: %d/%s\n",
1861 tp->cpu, errno, strerror(errno));
1862 }
1863
1864 /*
1865 * Trace is stopped, pull data until we get a short read
1866 */
1867 while (handle_pfds(tp, ndevs, 1) > 0)
1868 ;
1869
1870 close_ios(tp);
1871 tracer_signal_ready(tp, Th_leaving, 0);
1872 return NULL;
1873
1874err:
1875 tracer_signal_ready(tp, Th_error, ret);
1876 return NULL;
1877}
1878
1879static int start_tracer(int cpu)
1880{
1881 struct tracer *tp;
1882
1883 tp = malloc(sizeof(*tp));
1884 memset(tp, 0, sizeof(*tp));
1885
1886 INIT_LIST_HEAD(&tp->head);
1887 tp->status = 0;
1888 tp->cpu = cpu;
1889
1890 if (pthread_create(&tp->thread, NULL, thread_main, tp)) {
1891 fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
1892 cpu, errno, strerror(errno));
1893 free(tp);
1894 return 1;
1895 }
1896
1897 list_add_tail(&tp->head, &tracers);
1898 return 0;
1899}
1900
1901static int create_output_files(int cpu)
1902{
1903 char fname[MAXPATHLEN + 64];
1904 struct list_head *p;
1905 FILE *f;
1906
1907 __list_for_each(p, &devpaths) {
1908 struct devpath *dpp = list_entry(p, struct devpath, head);
1909
1910 if (fill_ofname(fname, sizeof(fname), NULL, dpp->buts_name,
1911 cpu))
1912 return 1;
1913 f = my_fopen(fname, "w+");
1914 if (!f)
1915 return 1;
1916 fclose(f);
1917 }
1918 return 0;
1919}
1920
1921static void start_tracers(void)
1922{
1923 int cpu, started = 0;
1924 struct list_head *p;
1925 size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
1926
1927 for (cpu = 0; cpu < max_cpus; cpu++) {
1928 if (!CPU_ISSET_S(cpu, alloc_size, online_cpus)) {
1929 /*
1930 * Create fake empty output files so that other tools
1931 * like blkparse don't have to bother with sparse CPU
1932 * number space.
1933 */
1934 if (create_output_files(cpu))
1935 break;
1936 continue;
1937 }
1938 if (start_tracer(cpu))
1939 break;
1940 started++;
1941 }
1942
1943 wait_tracers_ready(started);
1944
1945 __list_for_each(p, &tracers) {
1946 struct tracer *tp = list_entry(p, struct tracer, head);
1947 if (tp->status)
1948 fprintf(stderr,
1949 "FAILED to start thread on CPU %d: %d/%s\n",
1950 tp->cpu, tp->status, strerror(tp->status));
1951 }
1952}
1953
1954static void stop_tracers(void)
1955{
1956 struct list_head *p;
1957
1958 /*
1959 * Stop the tracing - makes the tracer threads clean up quicker.
1960 */
1961 __list_for_each(p, &devpaths) {
1962 struct devpath *dpp = list_entry(p, struct devpath, head);
1963 (void)ioctl(dpp->fd, BLKTRACESTOP);
1964 }
1965
1966 /*
1967 * Tell each tracer to quit
1968 */
1969 __list_for_each(p, &tracers) {
1970 struct tracer *tp = list_entry(p, struct tracer, head);
1971 tp->is_done = 1;
1972 }
1973 pthread_cond_broadcast(&mt_cond);
1974}
1975
1976static void del_tracers(void)
1977{
1978 struct list_head *p, *q;
1979
1980 list_for_each_safe(p, q, &tracers) {
1981 struct tracer *tp = list_entry(p, struct tracer, head);
1982
1983 list_del(&tp->head);
1984 free(tp);
1985 }
1986}
1987
1988static void wait_tracers(void)
1989{
1990 struct list_head *p;
1991
1992 if (use_tracer_devpaths())
1993 process_trace_bufs();
1994
1995 wait_tracers_leaving();
1996
1997 __list_for_each(p, &tracers) {
1998 int ret;
1999 struct tracer *tp = list_entry(p, struct tracer, head);
2000
2001 ret = pthread_join(tp->thread, NULL);
2002 if (ret)
2003 fprintf(stderr, "Thread join %d failed %d\n",
2004 tp->cpu, ret);
2005 }
2006
2007 if (use_tracer_devpaths())
2008 clean_trace_bufs();
2009
2010 get_all_drops();
2011}
2012
2013static void exit_tracing(void)
2014{
2015 signal(SIGINT, SIG_IGN);
2016 signal(SIGHUP, SIG_IGN);
2017 signal(SIGTERM, SIG_IGN);
2018 signal(SIGALRM, SIG_IGN);
2019
2020 stop_tracers();
2021 wait_tracers();
2022 del_tracers();
2023 rel_devpaths();
2024}
2025
2026static void handle_sigint(__attribute__((__unused__)) int sig)
2027{
2028 done = 1;
2029 stop_tracers();
2030}
2031
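/*
 * Print per-CPU and per-device event/data totals gathered in pdc_stats,
 * plus a warning when events were dropped.
 */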
2032static void show_stats(struct list_head *devpaths)
2033{
2034 FILE *ofp;
2035 struct list_head *p;
2036 unsigned long long nevents, data_read;
2037 unsigned long long total_drops = 0;
2038 unsigned long long total_events = 0;
2039
2040 if (piped_output)
2041 ofp = my_fopen("/dev/null", "w");
2042 else
2043 ofp = stdout;
2044
2045 __list_for_each(p, devpaths) {
2046 int cpu;
2047 struct pdc_stats *sp;
2048 struct devpath *dpp = list_entry(p, struct devpath, head);
2049
2050 if (net_mode == Net_server)
2051 printf("server: end of run for %s:%s\n",
2052 dpp->ch->hostname, dpp->buts_name);
2053
2054 data_read = 0;
2055 nevents = 0;
2056
2057 fprintf(ofp, "=== %s ===\n", dpp->buts_name);
2058 for (cpu = 0, sp = dpp->stats; cpu < dpp->ncpus; cpu++, sp++) {
2059 /*
2060 * Estimate events if not known...
2061 */
2062 if (sp->nevents == 0) {
2063 sp->nevents = sp->data_read /
2064 sizeof(struct blk_io_trace);
2065 }
2066
2067 fprintf(ofp,
2068 " CPU%3d: %20llu events, %8llu KiB data\n",
2069 cpu, sp->nevents, (sp->data_read + 1023) >> 10);
2070
2071 data_read += sp->data_read;
2072 nevents += sp->nevents;
2073 }
2074
2075 fprintf(ofp, " Total: %20llu events (dropped %llu),"
2076 " %8llu KiB data\n", nevents,
2077 dpp->drops, (data_read + 1024) >> 10);
2078
2079 total_drops += dpp->drops;
2080 total_events += (nevents + dpp->drops);
2081 }
2082
2083 fflush(ofp);
2084 if (piped_output)
2085 fclose(ofp);
2086
2087 if (total_drops) {
2088 double drops_ratio = 1.0;
2089
2090 if (total_events)
2091 drops_ratio = (double)total_drops/(double)total_events;
2092
2093 fprintf(stderr, "\nYou have %llu (%5.1lf%%) dropped events\n"
2094 "Consider using a larger buffer size (-b) "
2095 "and/or more buffers (-n)\n",
2096 total_drops, 100.0 * drops_ratio);
2097 }
2098}
2099
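/*
 * Parse the command line, validate the debugfs mount and pick the poll
 * handler that matches the selected output mode.
 */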
static int handle_args(int argc, char *argv[])
{
	int c, i;
	struct statfs st;
	int act_mask_tmp = 0;

	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
		switch (c) {
		case 'a':
			i = find_mask_map(optarg);
			if (i < 0) {
				fprintf(stderr, "Invalid action mask %s\n",
					optarg);
				return 1;
			}
			act_mask_tmp |= i;
			break;

		case 'A':
			if ((sscanf(optarg, "%x", &i) != 1) ||
							!valid_act_opt(i)) {
				fprintf(stderr,
					"Invalid set action mask %s/0x%x\n",
					optarg, i);
				return 1;
			}
			act_mask_tmp = i;
			break;

		case 'd':
			if (add_devpath(optarg) != 0)
				return 1;
			break;

		case 'I': {
			char dev_line[256];
			FILE *ifp = my_fopen(optarg, "r");

			if (!ifp) {
				fprintf(stderr,
					"Invalid file for devices %s\n",
					optarg);
				return 1;
			}

			/* Bound the conversion to the buffer size */
			while (fscanf(ifp, "%255s\n", dev_line) == 1) {
				if (add_devpath(dev_line) != 0) {
					fclose(ifp);
					return 1;
				}
			}
			fclose(ifp);
			break;
		}

		case 'r':
			debugfs_path = optarg;
			break;

		case 'o':
			output_name = optarg;
			break;
		case 'k':
			kill_running_trace = 1;
			break;
		case 'w':
			stop_watch = atoi(optarg);
			if (stop_watch <= 0) {
				fprintf(stderr,
					"Invalid stopwatch value (%d secs)\n",
					stop_watch);
				return 1;
			}
			break;
		case 'V':
		case 'v':
			printf("%s version %s\n", argv[0], blktrace_version);
			exit(0);
			/*NOTREACHED*/
		case 'b':
			buf_size = strtoul(optarg, NULL, 10);
			if (buf_size <= 0 || buf_size > 16*1024) {
				fprintf(stderr, "Invalid buffer size (%lu)\n",
					buf_size);
				return 1;
			}
			buf_size <<= 10;
			break;
		case 'n':
			buf_nr = strtoul(optarg, NULL, 10);
			if (buf_nr <= 0) {
				fprintf(stderr,
					"Invalid buffer nr (%lu)\n", buf_nr);
				return 1;
			}
			break;
		case 'D':
			output_dir = optarg;
			break;
		case 'h':
			net_mode = Net_client;
			memset(hostname, 0, sizeof(hostname));
			strncpy(hostname, optarg, sizeof(hostname));
			hostname[sizeof(hostname) - 1] = '\0';
			break;
		case 'l':
			net_mode = Net_server;
			break;
		case 'p':
			net_port = atoi(optarg);
			break;
		case 's':
			net_use_sendfile = 0;
			break;
		default:
			show_usage(argv[0]);
			exit(1);
			/*NOTREACHED*/
		}
	}

	while (optind < argc)
		if (add_devpath(argv[optind++]) != 0)
			return 1;

	if (net_mode != Net_server && ndevs == 0) {
		show_usage(argv[0]);
		return 1;
	}

	if (statfs(debugfs_path, &st) < 0) {
		fprintf(stderr, "Invalid debug path %s: %d/%s\n",
			debugfs_path, errno, strerror(errno));
		return 1;
	}

	if (st.f_type != (long)DEBUGFS_TYPE) {
		fprintf(stderr, "Debugfs is not mounted at %s\n", debugfs_path);
		return 1;
	}

	if (act_mask_tmp != 0)
		act_mask = act_mask_tmp;

	if (net_mode == Net_client && net_setup_addr())
		return 1;

	/*
	 * Set up for appropriate PFD handler based upon output name.
	 */
	if (net_client_use_sendfile())
		handle_pfds = handle_pfds_netclient;
	else if (net_client_use_send())
		handle_pfds = handle_pfds_entries;
	else if (output_name && (strcmp(output_name, "-") == 0)) {
		piped_output = 1;
		handle_pfds = handle_pfds_entries;
		pfp = stdout;
		if (setvbuf(pfp, NULL, _IONBF, 0)) {
			perror("setvbuf stdout");
			return 1;
		}
	} else
		handle_pfds = handle_pfds_file;
	return 0;
}

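/*
 * Server-side connection bookkeeping: each client host (cl_host) keeps a
 * list of its open connections (cl_conn), and every connection is also on
 * the server-wide list used for polling. The pollfd array is resized on
 * every add/remove; slot 0 is reserved for the listening socket.
 */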
static void ch_add_connection(struct net_server_s *ns, struct cl_host *ch,
			      int fd)
{
	struct cl_conn *nc;

	nc = malloc(sizeof(*nc));
	memset(nc, 0, sizeof(*nc));

	time(&nc->connect_time);
	nc->ch = ch;
	nc->fd = fd;
	nc->ncpus = -1;

	list_add_tail(&nc->ch_head, &ch->conn_list);
	ch->connects++;

	list_add_tail(&nc->ns_head, &ns->conn_list);
	ns->connects++;
	ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
}

static void ch_rem_connection(struct net_server_s *ns, struct cl_host *ch,
			      struct cl_conn *nc)
{
	net_close_connection(&nc->fd);

	list_del(&nc->ch_head);
	ch->connects--;

	list_del(&nc->ns_head);
	ns->connects--;
	ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));

	free(nc);
}

static struct cl_host *net_find_client_host(struct net_server_s *ns,
					    struct in_addr cl_in_addr)
{
	struct list_head *p;

	__list_for_each(p, &ns->ch_list) {
		struct cl_host *ch = list_entry(p, struct cl_host, head);

		if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
			return ch;
	}

	return NULL;
}

static struct cl_host *net_add_client_host(struct net_server_s *ns,
					   struct sockaddr_in *addr)
{
	struct cl_host *ch;

	ch = malloc(sizeof(*ch));
	memset(ch, 0, sizeof(*ch));

	ch->ns = ns;
	ch->cl_in_addr = addr->sin_addr;
	list_add_tail(&ch->head, &ns->ch_list);
	ns->nchs++;

	ch->hostname = strdup(inet_ntoa(addr->sin_addr));
	printf("server: connection from %s\n", ch->hostname);

	INIT_LIST_HEAD(&ch->conn_list);
	INIT_LIST_HEAD(&ch->devpaths);

	return ch;
}

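/*
 * Teardown helpers for the server: device_done() closes the per-CPU output
 * files for one remote device, net_ch_remove() tears down every device and
 * connection belonging to a client host once its last run has completed.
 */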
static void device_done(struct devpath *dpp, int ncpus)
{
	int cpu;
	struct io_info *iop;

	for (cpu = 0, iop = dpp->ios; cpu < ncpus; cpu++, iop++)
		close_iop(iop);

	list_del(&dpp->head);
	dpp_free(dpp);
}

static void net_ch_remove(struct cl_host *ch, int ncpus)
{
	struct list_head *p, *q;
	struct net_server_s *ns = ch->ns;

	list_for_each_safe(p, q, &ch->devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);
		device_done(dpp, ncpus);
	}

	list_for_each_safe(p, q, &ch->conn_list) {
		struct cl_conn *nc = list_entry(p, struct cl_conn, ch_head);

		ch_rem_connection(ns, ch, nc);
	}

	list_del(&ch->head);
	ns->nchs--;

	if (ch->hostname)
		free(ch->hostname);
	free(ch);
}

static void net_add_connection(struct net_server_s *ns)
{
	int fd;
	struct cl_host *ch;
	socklen_t socklen = sizeof(ns->addr);

	fd = my_accept(ns->listen_fd, (struct sockaddr *)&ns->addr, &socklen);
	if (fd < 0) {
		/*
		 * This is OK: we just won't accept this connection,
		 * nothing fatal.
		 */
		perror("accept");
	} else {
		ch = net_find_client_host(ns, ns->addr.sin_addr);
		if (!ch)
			ch = net_add_client_host(ns, &ns->addr);

		ch_add_connection(ns, ch, fd);
	}
}

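/*
 * The first time a given buts_name is seen from a client we create a
 * devpath for it: duplicate the name, record the client id and connect
 * time, and open one io_info (output file plus mmap state) per remote CPU.
 * On any failure the already-opened entries are unwound before returning
 * NULL.
 */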
static struct devpath *nc_add_dpp(struct cl_conn *nc,
				  struct blktrace_net_hdr *bnh,
				  time_t connect_time)
{
	int cpu;
	struct io_info *iop;
	struct devpath *dpp;

	dpp = malloc(sizeof(*dpp));
	memset(dpp, 0, sizeof(*dpp));

	dpp->buts_name = strdup(bnh->buts_name);
	dpp->path = strdup(bnh->buts_name);
	dpp->fd = -1;
	dpp->ch = nc->ch;
	dpp->cl_id = bnh->cl_id;
	dpp->cl_connect_time = connect_time;
	dpp->ncpus = nc->ncpus;
	dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
	memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));

	list_add_tail(&dpp->head, &nc->ch->devpaths);
	nc->ch->ndevs++;

	dpp->ios = calloc(nc->ncpus, sizeof(*iop));
	memset(dpp->ios, 0, nc->ncpus * sizeof(*iop));

	for (cpu = 0, iop = dpp->ios; cpu < nc->ncpus; cpu++, iop++) {
		iop->dpp = dpp;
		iop->nc = nc;
		init_mmap_info(&iop->mmap_info);

		if (iop_open(iop, cpu))
			goto err;
	}

	return dpp;

err:
	/*
	 * Need to unravel what's been done...
	 */
	while (cpu >= 0)
		close_iop(&dpp->ios[cpu--]);
	dpp_free(dpp);

	return NULL;
}

static struct devpath *nc_find_dpp(struct cl_conn *nc,
				   struct blktrace_net_hdr *bnh)
{
	struct list_head *p;
	time_t connect_time = nc->connect_time;

	__list_for_each(p, &nc->ch->devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);

		if (!strcmp(dpp->buts_name, bnh->buts_name))
			return dpp;

		if (dpp->cl_id == bnh->cl_id)
			connect_time = dpp->cl_connect_time;
	}

	return nc_add_dpp(nc, bnh, connect_time);
}

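/*
 * Receive one payload from a client: extend the mmap'ed window of the
 * per-CPU output file so it can hold bnh->len more bytes, read the data
 * straight into the mapping, and bump the per-CPU statistics.
 */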
static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
				 struct blktrace_net_hdr *bnh)
{
	int ret;
	struct io_info *iop = &dpp->ios[bnh->cpu];
	struct mmap_info *mip = &iop->mmap_info;

	if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info, NULL)) {
		fprintf(stderr, "ncd(%s:%d): mmap failed\n",
			nc->ch->hostname, nc->fd);
		exit(1);
	}

	ret = net_recv_data(nc->fd, mip->fs_buf + mip->fs_off, bnh->len);
	if (ret > 0) {
		pdc_dr_update(dpp, bnh->cpu, ret);
		mip->fs_size += ret;
		mip->fs_off += ret;
	} else if (ret < 0)
		exit(1);
}

/*
 * Returns 1 if we closed a host - invalidates other polling information
 * that may be present.
 */
static int net_client_data(struct cl_conn *nc)
{
	int ret;
	struct devpath *dpp;
	struct blktrace_net_hdr bnh;

	ret = net_get_header(nc, &bnh);
	if (ret == 0)
		return 0;

	if (ret < 0) {
		fprintf(stderr, "ncd(%d): header read failed\n", nc->fd);
		exit(1);
	}

	if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
		fprintf(stderr, "ncd(%d): received data is bad\n", nc->fd);
		exit(1);
	}

	if (!data_is_native) {
		bnh.magic = be32_to_cpu(bnh.magic);
		bnh.cpu = be32_to_cpu(bnh.cpu);
		bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
		bnh.len = be32_to_cpu(bnh.len);
		bnh.cl_id = be32_to_cpu(bnh.cl_id);
		bnh.buf_size = be32_to_cpu(bnh.buf_size);
		bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
		bnh.page_size = be32_to_cpu(bnh.page_size);
	}

	if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
		fprintf(stderr, "ncd(%s:%d): bad data magic\n",
			nc->ch->hostname, nc->fd);
		exit(1);
	}

	if (nc->ncpus == -1)
		nc->ncpus = bnh.max_cpus;

	/*
	 * len == 0 means the other end is sending us a new connection/dpp
	 * len == 1 means that the other end signalled end-of-run
	 */
	dpp = nc_find_dpp(nc, &bnh);
	if (bnh.len == 0) {
		/*
		 * Just adding in the dpp above is enough
		 */
		ack_open_close(nc->fd, dpp->buts_name);
		nc->ch->cl_opens++;
	} else if (bnh.len == 1) {
		/*
		 * overload cpu count with dropped events
		 */
		dpp->drops = bnh.cpu;

		ack_open_close(nc->fd, dpp->buts_name);
		if (--nc->ch->cl_opens == 0) {
			show_stats(&nc->ch->devpaths);
			net_ch_remove(nc->ch, nc->ncpus);
			return 1;
		}
	} else
		net_client_read_data(nc, dpp, &bnh);

	return 0;
}

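/*
 * Poll-loop helpers: net_setup_pfds() rebuilds the pollfd array before each
 * poll() call (slot 0 = listening socket, the rest follow the connection
 * list in order), and handle_client_data() walks that same list, matching
 * revents slot for slot. If a host was closed the array is stale, so the
 * walk stops immediately.
 */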
static void handle_client_data(struct net_server_s *ns, int events)
{
	struct cl_conn *nc;
	struct pollfd *pfd;
	struct list_head *p, *q;

	pfd = &ns->pfds[1];
	list_for_each_safe(p, q, &ns->conn_list) {
		if (pfd->revents & POLLIN) {
			nc = list_entry(p, struct cl_conn, ns_head);

			if (net_client_data(nc) || --events == 0)
				break;
		}
		pfd++;
	}
}

static void net_setup_pfds(struct net_server_s *ns)
{
	struct pollfd *pfd;
	struct list_head *p;

	ns->pfds[0].fd = ns->listen_fd;
	ns->pfds[0].events = POLLIN;

	pfd = &ns->pfds[1];
	__list_for_each(p, &ns->conn_list) {
		struct cl_conn *nc = list_entry(p, struct cl_conn, ns_head);

		pfd->fd = nc->fd;
		pfd->events = POLLIN;
		pfd++;
	}
}

static int net_server_handle_connections(struct net_server_s *ns)
{
	int events;

	printf("server: waiting for connections...\n");

	while (!done) {
		net_setup_pfds(ns);
		events = poll(ns->pfds, ns->connects + 1, -1);
		if (events < 0) {
			if (errno != EINTR) {
				perror("FATAL: poll error");
				return 1;
			}
		} else if (events > 0) {
			if (ns->pfds[0].revents & POLLIN) {
				net_add_connection(ns);
				events--;
			}

			if (events)
				handle_client_data(ns, events);
		}
	}

	return 0;
}

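/*
 * Server mode entry point: create a TCP socket, set SO_REUSEADDR, bind to
 * INADDR_ANY on the configured port, listen, then hand the listening fd to
 * the poll loop above until the run is stopped.
 */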
static int net_server(void)
{
	int fd, opt;
	int ret = 1;
	struct net_server_s net_server;
	struct net_server_s *ns = &net_server;

	memset(ns, 0, sizeof(*ns));
	INIT_LIST_HEAD(&ns->ch_list);
	INIT_LIST_HEAD(&ns->conn_list);
	ns->pfds = malloc(sizeof(struct pollfd));

	fd = my_socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("server: socket");
		goto out;
	}

	opt = 1;
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
		perror("setsockopt");
		goto out;
	}

	memset(&ns->addr, 0, sizeof(ns->addr));
	ns->addr.sin_family = AF_INET;
	ns->addr.sin_addr.s_addr = htonl(INADDR_ANY);
	ns->addr.sin_port = htons(net_port);

	if (bind(fd, (struct sockaddr *) &ns->addr, sizeof(ns->addr)) < 0) {
		perror("bind");
		goto out;
	}

	if (listen(fd, 1) < 0) {
		perror("listen");
		goto out;
	}

	/*
	 * The actual server looping is done here:
	 */
	ns->listen_fd = fd;
	ret = net_server_handle_connections(ns);

	/*
	 * Clean up and return...
	 */
out:
	free(ns->pfds);
	return ret;
}

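/*
 * Normal (local) tracing flow: register atexit() cleanup, set up the
 * kernel-side trace buffers, spawn one tracer thread per CPU, and only
 * start the kernel side once every thread has come up. When tracing ends
 * (stop watch, signal, or error) the tracers are reaped and the summary
 * printed.
 */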
static int run_tracers(void)
{
	atexit(exit_tracing);
	if (net_mode == Net_client)
		printf("blktrace: connecting to %s\n", hostname);

	if (setup_buts())
		return 1;

	if (use_tracer_devpaths()) {
		if (setup_tracer_devpaths())
			return 1;

		if (piped_output)
			handle_list = handle_list_file;
		else
			handle_list = handle_list_net;
	}

	start_tracers();
	if (nthreads_running == ncpus) {
		unblock_tracers();
		start_buts();
		if (net_mode == Net_client)
			printf("blktrace: connected!\n");
		if (stop_watch)
			alarm(stop_watch);
	} else
		stop_tracers();

	wait_tracers();
	if (nthreads_running == ncpus)
		show_stats(&devpaths);
	if (net_client_use_send())
		close_client_connections();
	del_tracers();

	return 0;
}

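/*
 * Build a cpu_set_t of the CPUs that are currently online by parsing
 * /sys/devices/system/cpu/online. The file holds comma-separated ranges,
 * for example:
 *
 *	0-3,5,7-11
 *
 * Because the highest CPU number is not known up front, the list is parsed
 * into a temporary array first and the set is allocated afterwards.
 */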
static cpu_set_t *get_online_cpus(void)
{
	FILE *cpus;
	cpu_set_t *set;
	size_t alloc_size;
	int cpuid, prevcpuid = -1;
	char nextch;
	int n, ncpu, curcpu = 0;
	int *cpu_nums;

	ncpu = sysconf(_SC_NPROCESSORS_CONF);
	if (ncpu < 0)
		return NULL;

	cpu_nums = malloc(sizeof(int)*ncpu);
	if (!cpu_nums) {
		errno = ENOMEM;
		return NULL;
	}

	/*
	 * There is no easy way to get the maximum CPU number up front, so
	 * parse the file once to find it and then allocate an appropriately
	 * sized cpuset.
	 */
	cpus = my_fopen("/sys/devices/system/cpu/online", "r");
	if (!cpus) {
		free(cpu_nums);
		return NULL;
	}
	for (;;) {
		n = fscanf(cpus, "%d%c", &cpuid, &nextch);
		if (n <= 0)
			break;
		if (n == 2 && nextch == '-') {
			prevcpuid = cpuid;
			continue;
		}
		if (prevcpuid == -1)
			prevcpuid = cpuid;
		while (prevcpuid <= cpuid) {
			/* More CPUs listed than configured? */
			if (curcpu >= ncpu) {
				fclose(cpus);
				free(cpu_nums);
				errno = EINVAL;
				return NULL;
			}
			cpu_nums[curcpu++] = prevcpuid++;
		}
		prevcpuid = -1;
	}
	fclose(cpus);

	if (curcpu == 0) {
		/* Nothing parsed: don't index cpu_nums[-1] below */
		free(cpu_nums);
		return NULL;
	}

	ncpu = curcpu;
	max_cpus = cpu_nums[ncpu - 1] + 1;

	/* Now that we have maximum cpu number, create a cpuset */
	set = CPU_ALLOC(max_cpus);
	if (!set) {
		free(cpu_nums);
		errno = ENOMEM;
		return NULL;
	}
	alloc_size = CPU_ALLOC_SIZE(max_cpus);
	CPU_ZERO_S(alloc_size, set);

	for (curcpu = 0; curcpu < ncpu; curcpu++)
		CPU_SET_S(cpu_nums[curcpu], alloc_size, set);

	free(cpu_nums);

	return set;
}

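/*
 * main(): discover the online CPUs, parse the arguments, install signal
 * handlers, then either tear down an already-running trace (-k), run as a
 * network server (-l), or start tracing the requested devices.
 */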
int main(int argc, char *argv[])
{
	int ret = 0;

	setlocale(LC_NUMERIC, "en_US");
	pagesize = getpagesize();
	online_cpus = get_online_cpus();
	if (!online_cpus) {
		fprintf(stderr, "cannot get online cpus %d/%s\n",
			errno, strerror(errno));
		ret = 1;
		goto out;
	} else if (handle_args(argc, argv)) {
		ret = 1;
		goto out;
	}

	ncpus = CPU_COUNT_S(CPU_ALLOC_SIZE(max_cpus), online_cpus);
	if (ndevs > 1 && output_name && strcmp(output_name, "-") != 0) {
		fprintf(stderr, "-o not supported with multiple devices\n");
		ret = 1;
		goto out;
	}

	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);
	signal(SIGALRM, handle_sigint);
	signal(SIGPIPE, SIG_IGN);

	if (kill_running_trace) {
		struct devpath *dpp;
		struct list_head *p;

		__list_for_each(p, &devpaths) {
			dpp = list_entry(p, struct devpath, head);
			if (__stop_trace(dpp->fd)) {
				fprintf(stderr,
					"BLKTRACETEARDOWN %s failed: %d/%s\n",
					dpp->path, errno, strerror(errno));
			}
		}
	} else if (net_mode == Net_server) {
		if (output_name) {
			fprintf(stderr, "-o ignored in server mode\n");
			output_name = NULL;
		}
		ret = net_server();
	} else
		ret = run_tracers();

out:
	if (pfp)
		fclose(pfp);
	rel_devpaths();
	return ret;
}