blob: e5127880fbebc9d2d272e281d21e668aac48e7b0 [file] [log] [blame]
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/traced/probes/ftrace/cpu_reader.h"
#include <signal.h>
#include <dirent.h>
#include <map>
#include <queue>
#include <string>
#include <utility>
#include "perfetto/base/build_config.h"
#include "perfetto/base/logging.h"
#include "perfetto/ext/base/metatrace.h"
#include "perfetto/ext/base/optional.h"
#include "perfetto/ext/base/utils.h"
#include "src/traced/probes/ftrace/ftrace_controller.h"
#include "src/traced/probes/ftrace/ftrace_data_source.h"
#include "src/traced/probes/ftrace/proto_translation_table.h"
#include "protos/perfetto/trace/ftrace/ftrace_event.pbzero.h"
#include "protos/perfetto/trace/ftrace/ftrace_event_bundle.pbzero.h"
#include "protos/perfetto/trace/ftrace/generic.pbzero.h"
#include "protos/perfetto/trace/trace_packet.pbzero.h"
namespace perfetto {
namespace {
// For further documentation of these constants see the kernel source:
// linux/include/linux/ring_buffer.h
// Some information about the values of these constants are exposed to user
// space at: /sys/kernel/debug/tracing/events/header_event
constexpr uint32_t kTypeDataTypeLengthMax = 28;
constexpr uint32_t kTypePadding = 29;
constexpr uint32_t kTypeTimeExtend = 30;
constexpr uint32_t kTypeTimeStamp = 31;
struct EventHeader {
uint32_t type_or_length : 5;
uint32_t time_delta : 27;
};
struct TimeStamp {
uint64_t tv_nsec;
uint64_t tv_sec;
};
bool ReadIntoString(const uint8_t* start,
const uint8_t* end,
uint32_t field_id,
protozero::Message* out) {
for (const uint8_t* c = start; c < end; c++) {
if (*c != '\0')
continue;
out->AppendBytes(field_id, reinterpret_cast<const char*>(start),
static_cast<uintptr_t>(c - start));
return true;
}
return false;
}
bool ReadDataLoc(const uint8_t* start,
const uint8_t* field_start,
const uint8_t* end,
const Field& field,
protozero::Message* message) {
PERFETTO_DCHECK(field.ftrace_size == 4);
// See
// https://github.com/torvalds/linux/blob/master/include/trace/trace_events.h
uint32_t data = 0;
const uint8_t* ptr = field_start;
if (!CpuReader::ReadAndAdvance(&ptr, end, &data)) {
PERFETTO_DFATAL("Buffer overflowed.");
return false;
}
const uint16_t offset = data & 0xffff;
const uint16_t len = (data >> 16) & 0xffff;
const uint8_t* const string_start = start + offset;
const uint8_t* const string_end = string_start + len;
if (string_start <= start || string_end > end) {
PERFETTO_DFATAL("Buffer overflowed.");
return false;
}
ReadIntoString(string_start, string_end, field.proto_field_id, message);
return true;
}
bool SetBlocking(int fd, bool is_blocking) {
int flags = fcntl(fd, F_GETFL, 0);
flags = (is_blocking) ? (flags & ~O_NONBLOCK) : (flags | O_NONBLOCK);
return fcntl(fd, F_SETFL, flags) == 0;
}
} // namespace
using protos::pbzero::GenericFtraceEvent;
CpuReader::CpuReader(const ProtoTranslationTable* table,
size_t cpu,
base::ScopedFile trace_fd)
: table_(table), cpu_(cpu), trace_fd_(std::move(trace_fd)) {
PERFETTO_CHECK(trace_fd_);
PERFETTO_CHECK(SetBlocking(*trace_fd_, false));
}
CpuReader::~CpuReader() = default;
size_t CpuReader::ReadCycle(
uint8_t* parsing_buf,
size_t parsing_buf_size_pages,
size_t max_pages,
const std::set<FtraceDataSource*>& started_data_sources) {
PERFETTO_DCHECK(max_pages > 0 && parsing_buf_size_pages > 0);
metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
metatrace::FTRACE_CPU_READ_CYCLE);
// Work in batches to keep cache locality, and limit memory usage.
size_t batch_pages = std::min(parsing_buf_size_pages, max_pages);
size_t total_pages_read = 0;
for (bool is_first_batch = true;; is_first_batch = false) {
size_t pages_read = ReadAndProcessBatch(
parsing_buf, batch_pages, is_first_batch, started_data_sources);
PERFETTO_DCHECK(pages_read <= batch_pages);
total_pages_read += pages_read;
// Check whether we've caught up to the writer, or possibly giving up on
// this attempt due to some error.
if (pages_read != batch_pages)
break;
// Check if we've hit the limit of work for this cycle.
if (total_pages_read >= max_pages)
break;
}
PERFETTO_METATRACE_COUNTER(TAG_FTRACE, FTRACE_PAGES_DRAINED,
total_pages_read);
return total_pages_read;
}
// metatrace note: mark the reading phase as FTRACE_CPU_READ_BATCH, but let the
// parsing time be implied (by the difference between the caller's span, and
// this reading span). Makes it easier to estimate the read/parse ratio when
// looking at the trace in the UI.
size_t CpuReader::ReadAndProcessBatch(
uint8_t* parsing_buf,
size_t max_pages,
bool first_batch_in_cycle,
const std::set<FtraceDataSource*>& started_data_sources) {
size_t pages_read = 0;
{
metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
metatrace::FTRACE_CPU_READ_BATCH);
for (; pages_read < max_pages;) {
uint8_t* curr_page = parsing_buf + (pages_read * base::kPageSize);
ssize_t res =
PERFETTO_EINTR(read(*trace_fd_, curr_page, base::kPageSize));
if (res < 0) {
// Expected errors:
// EAGAIN: no data (since we're in non-blocking mode).
// ENONMEM, EBUSY: temporary ftrace failures (they happen).
if (errno != EAGAIN && errno != ENOMEM && errno != EBUSY)
PERFETTO_PLOG("Unexpected error on raw ftrace read");
break; // stop reading regardless of errno
}
// As long as all of our reads are for a single page, the kernel should
// return exactly a well-formed raw ftrace page (if not in the steady
// state of reading out fully-written pages, the kernel will construct
// pages as necessary, copying over events and zero-filling at the end).
// A sub-page read() is therefore not expected in practice (unless
// there's a concurrent reader requesting less than a page?). Crash if
// encountering this situation. Kernel source pointer: see usage of
// |info->read| within |tracing_buffers_read|.
// TODO(rsavitski): don't crash, throw away the partial read & pipe
// through an error signal.
if (res == 0) {
// Very rare, but possible. Stop for now, should recover.
PERFETTO_DLOG("[cpu%zu]: 0-sized read from ftrace pipe.", cpu_);
break;
}
PERFETTO_CHECK(res == static_cast<ssize_t>(base::kPageSize));
pages_read += 1;
// Compare the amount of ftrace data read against an empirical threshold
// to make an educated guess on whether we should read more. To figure
// out the amount of ftrace data, we need to parse the page header (since
// the read always returns a page, zero-filled at the end). If we read
// fewer bytes than the threshold, it means that we caught up with the
// write pointer and we started consuming ftrace events in real-time.
// This cannot be just 4096 because it needs to account for
// fragmentation, i.e. for the fact that the last trace event didn't fit
// in the current page and hence the current page was terminated
// prematurely.
static constexpr size_t kRoughlyAPage = base::kPageSize - 512;
const uint8_t* scratch_ptr = curr_page;
base::Optional<PageHeader> hdr =
ParsePageHeader(&scratch_ptr, table_->page_header_size_len());
PERFETTO_DCHECK(hdr && hdr->size > 0 && hdr->size <= base::kPageSize);
if (!hdr.has_value()) {
PERFETTO_ELOG("[cpu%zu]: can't parse page header", cpu_);
break;
}
// Note that the first read after starting the read cycle being small is
// normal. It means that we're given the remainder of events from a
// page that we've partially consumed during the last read of the previous
// cycle (having caught up to the writer).
if (hdr->size < kRoughlyAPage &&
!(first_batch_in_cycle && pages_read == 1)) {
break;
}
}
} // end of metatrace::FTRACE_CPU_READ_BATCH
// Parse the pages and write to the trace for of all relevant data
// sources.
for (size_t i = 0; i < pages_read; i++) {
uint8_t* curr_page = parsing_buf + (i * base::kPageSize);
for (FtraceDataSource* data_source : started_data_sources) {
auto packet = data_source->trace_writer()->NewTracePacket();
auto* bundle = packet->set_ftrace_events();
auto* metadata = data_source->mutable_metadata();
auto* filter = data_source->event_filter();
// Note: The fastpath in proto_trace_parser.cc speculates on the fact
// that the cpu field is the first field of the proto message. If this
// changes, change proto_trace_parser.cc accordingly.
bundle->set_cpu(static_cast<uint32_t>(cpu_));
size_t evt_size = ParsePage(curr_page, filter, bundle, table_, metadata);
PERFETTO_DCHECK(evt_size);
bundle->set_lost_events(metadata->lost_events);
}
}
return pages_read;
}
// A page header consists of:
// * timestamp: 8 bytes
// * commit: 8 bytes on 64 bit, 4 bytes on 32 bit kernels
//
// The kernel reports this at /sys/kernel/debug/tracing/events/header_page.
//
// |commit|'s bottom bits represent the length of the payload following this
// header. The top bits have been repurposed as a bitset of flags pertaining to
// data loss. We look only at the "there has been some data lost" flag
// (RB_MISSED_EVENTS), and ignore the relatively tricky "appended the precise
// lost events count past the end of the valid data, as there was room to do so"
// flag (RB_MISSED_STORED).
//
// static
base::Optional<CpuReader::PageHeader> CpuReader::ParsePageHeader(
const uint8_t** ptr,
uint16_t page_header_size_len) {
// Mask for the data length portion of the |commit| field. Note that the
// kernel implementation never explicitly defines the boundary (beyond using
// bits 30 and 31 as flags), but 27 bits are mentioned as sufficient in the
// original commit message, and is the constant used by trace-cmd.
constexpr static uint64_t kDataSizeMask = (1ull << 27) - 1;
// If set, indicates that the relevant cpu has lost events since the last read
// (clearing the bit internally).
constexpr static uint64_t kMissedEventsFlag = (1ull << 31);
const uint8_t* end_of_page = *ptr + base::kPageSize;
PageHeader page_header;
if (!CpuReader::ReadAndAdvance<uint64_t>(ptr, end_of_page,
&page_header.timestamp))
return base::nullopt;
uint32_t size_and_flags;
// On little endian, we can just read a uint32_t and reject the rest of the
// number later.
if (!CpuReader::ReadAndAdvance<uint32_t>(
ptr, end_of_page, base::AssumeLittleEndian(&size_and_flags)))
return base::nullopt;
page_header.size = size_and_flags & kDataSizeMask;
page_header.lost_events = bool(size_and_flags & kMissedEventsFlag);
PERFETTO_DCHECK(page_header.size <= base::kPageSize);
// Reject rest of the number, if applicable. On 32-bit, size_bytes - 4 will
// evaluate to 0 and this will be a no-op. On 64-bit, this will advance by 4
// bytes.
PERFETTO_DCHECK(page_header_size_len >= 4);
*ptr += page_header_size_len - 4;
return base::make_optional(page_header);
}
// A raw ftrace buffer page consists of a header followed by a sequence of
// binary ftrace events. See |ParsePageHeader| for the format of the earlier.
//
// This method is deliberately static so it can be tested independently.
size_t CpuReader::ParsePage(const uint8_t* ptr,
const EventFilter* filter,
FtraceEventBundle* bundle,
const ProtoTranslationTable* table,
FtraceMetadata* metadata) {
const uint8_t* const start_of_page = ptr;
const uint8_t* const end_of_page = ptr + base::kPageSize;
auto page_header = ParsePageHeader(&ptr, table->page_header_size_len());
if (!page_header.has_value())
return 0;
// ParsePageHeader advances |ptr| to point past the end of the header.
metadata->lost_events = static_cast<uint32_t>(page_header->lost_events);
const uint8_t* const end = ptr + page_header->size;
if (end > end_of_page)
return 0;
uint64_t timestamp = page_header->timestamp;
while (ptr < end) {
EventHeader event_header;
if (!ReadAndAdvance(&ptr, end, &event_header))
return 0;
timestamp += event_header.time_delta;
switch (event_header.type_or_length) {
case kTypePadding: {
// Left over page padding or discarded event.
if (event_header.time_delta == 0) {
// Not clear what the correct behaviour is in this case.
PERFETTO_DFATAL("Empty padding event.");
return 0;
}
uint32_t length;
if (!ReadAndAdvance<uint32_t>(&ptr, end, &length))
return 0;
ptr += length;
break;
}
case kTypeTimeExtend: {
// Extend the time delta.
uint32_t time_delta_ext;
if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
return 0;
// See https://goo.gl/CFBu5x
timestamp += (static_cast<uint64_t>(time_delta_ext)) << 27;
break;
}
case kTypeTimeStamp: {
// Sync time stamp with external clock.
TimeStamp time_stamp;
if (!ReadAndAdvance<TimeStamp>(&ptr, end, &time_stamp))
return 0;
// Not implemented in the kernel, nothing should generate this.
PERFETTO_DFATAL("Unimplemented in kernel. Should be unreachable.");
break;
}
// Data record:
default: {
PERFETTO_CHECK(event_header.type_or_length <= kTypeDataTypeLengthMax);
// type_or_length is <=28 so it represents the length of a data
// record. if == 0, this is an extended record and the size of the
// record is stored in the first uint32_t word in the payload. See
// Kernel's include/linux/ring_buffer.h
uint32_t event_size;
if (event_header.type_or_length == 0) {
if (!ReadAndAdvance<uint32_t>(&ptr, end, &event_size))
return 0;
// Size includes the size field itself.
if (event_size < 4)
return 0;
event_size -= 4;
} else {
event_size = 4 * event_header.type_or_length;
}
const uint8_t* start = ptr;
const uint8_t* next = ptr + event_size;
if (next > end)
return 0;
uint16_t ftrace_event_id;
if (!ReadAndAdvance<uint16_t>(&ptr, end, &ftrace_event_id))
return 0;
if (filter->IsEventEnabled(ftrace_event_id)) {
protos::pbzero::FtraceEvent* event = bundle->add_event();
event->set_timestamp(timestamp);
if (!ParseEvent(ftrace_event_id, start, next, table, event, metadata))
return 0;
}
// Jump to next event.
ptr = next;
}
}
}
return static_cast<size_t>(ptr - start_of_page);
}
// |start| is the start of the current event.
// |end| is the end of the buffer.
bool CpuReader::ParseEvent(uint16_t ftrace_event_id,
const uint8_t* start,
const uint8_t* end,
const ProtoTranslationTable* table,
protozero::Message* message,
FtraceMetadata* metadata) {
PERFETTO_DCHECK(start < end);
const size_t length = static_cast<size_t>(end - start);
// TODO(hjd): Rework to work even if the event is unknown.
const Event& info = *table->GetEventById(ftrace_event_id);
// TODO(hjd): Test truncated events.
// If the end of the buffer is before the end of the event give up.
if (info.size > length) {
PERFETTO_DFATAL("Buffer overflowed.");
return false;
}
bool success = true;
for (const Field& field : table->common_fields())
success &= ParseField(field, start, end, message, metadata);
protozero::Message* nested =
message->BeginNestedMessage<protozero::Message>(info.proto_field_id);
// Parse generic event.
if (info.proto_field_id == protos::pbzero::FtraceEvent::kGenericFieldNumber) {
nested->AppendString(GenericFtraceEvent::kEventNameFieldNumber, info.name);
for (const Field& field : info.fields) {
auto generic_field = nested->BeginNestedMessage<protozero::Message>(
GenericFtraceEvent::kFieldFieldNumber);
// TODO(taylori): Avoid outputting field names every time.
generic_field->AppendString(GenericFtraceEvent::Field::kNameFieldNumber,
field.ftrace_name);
success &= ParseField(field, start, end, generic_field, metadata);
}
} else { // Parse all other events.
for (const Field& field : info.fields) {
success &= ParseField(field, start, end, nested, metadata);
}
}
if (PERFETTO_UNLIKELY(info.proto_field_id ==
protos::pbzero::FtraceEvent::kTaskRenameFieldNumber)) {
// For task renames, we want to store that the pid was renamed. We use the
// common pid to reduce code complexity as in all the cases we care about,
// the common pid is the same as the renamed pid (the pid inside the event).
PERFETTO_DCHECK(metadata->last_seen_common_pid);
metadata->AddRenamePid(metadata->last_seen_common_pid);
}
// This finalizes |nested| and |proto_field| automatically.
message->Finalize();
metadata->FinishEvent();
return success;
}
// Caller must guarantee that the field fits in the range,
// explicitly: start + field.ftrace_offset + field.ftrace_size <= end
// The only exception is fields with strategy = kCStringToString
// where the total size isn't known up front. In this case ParseField
// will check the string terminates in the bounds and won't read past |end|.
bool CpuReader::ParseField(const Field& field,
const uint8_t* start,
const uint8_t* end,
protozero::Message* message,
FtraceMetadata* metadata) {
PERFETTO_DCHECK(start + field.ftrace_offset + field.ftrace_size <= end);
const uint8_t* field_start = start + field.ftrace_offset;
uint32_t field_id = field.proto_field_id;
switch (field.strategy) {
case kUint8ToUint32:
case kUint8ToUint64:
ReadIntoVarInt<uint8_t>(field_start, field_id, message);
return true;
case kUint16ToUint32:
case kUint16ToUint64:
ReadIntoVarInt<uint16_t>(field_start, field_id, message);
return true;
case kUint32ToUint32:
case kUint32ToUint64:
ReadIntoVarInt<uint32_t>(field_start, field_id, message);
return true;
case kUint64ToUint64:
ReadIntoVarInt<uint64_t>(field_start, field_id, message);
return true;
case kInt8ToInt32:
case kInt8ToInt64:
ReadIntoVarInt<int8_t>(field_start, field_id, message);
return true;
case kInt16ToInt32:
case kInt16ToInt64:
ReadIntoVarInt<int16_t>(field_start, field_id, message);
return true;
case kInt32ToInt32:
case kInt32ToInt64:
ReadIntoVarInt<int32_t>(field_start, field_id, message);
return true;
case kInt64ToInt64:
ReadIntoVarInt<int64_t>(field_start, field_id, message);
return true;
case kFixedCStringToString:
// TODO(hjd): Add AppendMaxLength string to protozero.
return ReadIntoString(field_start, field_start + field.ftrace_size,
field_id, message);
case kCStringToString:
// TODO(hjd): Kernel-dive to check this how size:0 char fields work.
return ReadIntoString(field_start, end, field.proto_field_id, message);
case kStringPtrToString:
// TODO(hjd): Figure out how to read these.
return true;
case kDataLocToString:
return ReadDataLoc(start, field_start, end, field, message);
case kBoolToUint32:
case kBoolToUint64:
ReadIntoVarInt<uint8_t>(field_start, field_id, message);
return true;
case kInode32ToUint64:
ReadInode<uint32_t>(field_start, field_id, message, metadata);
return true;
case kInode64ToUint64:
ReadInode<uint64_t>(field_start, field_id, message, metadata);
return true;
case kPid32ToInt32:
case kPid32ToInt64:
ReadPid(field_start, field_id, message, metadata);
return true;
case kCommonPid32ToInt32:
case kCommonPid32ToInt64:
ReadCommonPid(field_start, field_id, message, metadata);
return true;
case kDevId32ToUint64:
ReadDevId<uint32_t>(field_start, field_id, message, metadata);
return true;
case kDevId64ToUint64:
ReadDevId<uint64_t>(field_start, field_id, message, metadata);
return true;
}
PERFETTO_FATAL("Not reached"); // For gcc
}
} // namespace perfetto