From 2a8de11c0de7d7847f165200cad7bcbbd302559c Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Fri, 29 Dec 2017 00:41:48 +0100 Subject: [PATCH] Support off-CPU profiling When we encounter a perf data file that was recorded with perf record --switch-events -e sched:sched_switch --call-graph dwarf then we now take the off-CPU time from the switch events and the call stack from the sched_switch events to support off-CPU profiling. I.e. we will show both the number of scheduler switches in the sched:sched_switch column, as well as the off-CPU time in a new column. In the timeline, the off-CPU time is always highlighted in red and can be hovered to inspect the sched switch delay. Fixes #8 --- src/models/data.h | 1 + src/models/timelinedelegate.cpp | 81 ++++++++++++++++++----- src/parsers/perf/perfparser.cpp | 44 +++++++++++- src/resultspage.cpp | 2 +- tests/integrationtests/tst_perfparser.cpp | 69 ++++++++++++++++--- 5 files changed, 170 insertions(+), 27 deletions(-) diff --git a/src/models/data.h b/src/models/data.h index 8cd275a2..3e1e86cb 100644 --- a/src/models/data.h +++ b/src/models/data.h @@ -516,6 +516,7 @@ struct EventResults QVector threads; QVector> stacks; QVector totalCosts; + qint32 offCpuTimeCostId = -1; ThreadEvents* findThread(qint32 pid, qint32 tid); }; diff --git a/src/models/timelinedelegate.cpp b/src/models/timelinedelegate.cpp index eaff28d3..03b3b210 100644 --- a/src/models/timelinedelegate.cpp +++ b/src/models/timelinedelegate.cpp @@ -153,6 +153,7 @@ void TimeLineDelegate::paint(QPainter* painter, const QStyleOptionViewItem& opti const QModelIndex& index) const { const auto data = dataFromIndex(index, option.rect, m_zoomStack); + const auto offCpuCostId = index.data(EventModel::EventResultsRole).value().offCpuTimeCostId; const bool is_alternate = option.features & QStyleOptionViewItem::Alternate; const auto& palette = option.palette; @@ -192,9 +193,21 @@ void TimeLineDelegate::paint(QPainter* painter, const QStyleOptionViewItem& opti QPen pen(scheme.foreground(KColorScheme::NeutralText), 1); painter->setPen(pen); painter->setBrush({}); + auto offCpuColor = scheme.background(KColorScheme::NegativeBackground).color(); - int last_x = -1; + if (offCpuCostId != -1) { + for (const auto& event : data.events) { + if (event.type != offCpuCostId) { + continue; + } + const auto x = data.mapTimeToX(event.time); + const auto x2 = data.mapTimeToX(event.time + event.cost); + painter->fillRect(x, 0, x2 - x, data.h, offCpuColor); + } + } + + int last_x = -1; // TODO: accumulate cost for events that fall to the same pixel somehow // but how to then sync the y scale across different delegates? // somehow deduce threshold via min time delta and max cost? @@ -249,28 +262,66 @@ bool TimeLineDelegate::helpEvent(QHelpEvent* event, QAbstractItemView* view, const auto localX = event->pos().x(); const auto mappedX = localX - option.rect.x() - data.padding; const auto time = data.mapXToTime(mappedX); - auto it = findEvent(data.events.constBegin(), data.events.constEnd(), time); + const auto start = findEvent(data.events.constBegin(), data.events.constEnd(), time); // find the maximum sample cost in the range spanned by one pixel - uint numSamples = 0; - quint64 maxCost = 0; - quint64 totalCost = 0; - while (it != data.events.constEnd() && data.mapTimeToX(it->time) == mappedX) { - if (it->type != m_eventType) { + struct FoundSamples { + uint numSamples = 0; + quint64 maxCost = 0; + quint64 totalCost = 0; + int type = -1; + }; + auto findSamples = [&](int costType, bool contains) -> FoundSamples { + FoundSamples ret; + ret.type = costType; + auto it = start; + if (contains) { + // for a contains check, we must only include events for the correct type + // otherwise we might skip the sched switch e.g. + while (it->type != costType && it != data.events.constBegin()) { + --it; + } + } + while (it != data.events.constEnd()) { + if (it->type != costType) { + ++it; + continue; + } + const auto timeX = data.mapTimeToX(it->time); + if (timeX > mappedX) { + // event lies to the right of the selected time + break; + } else if (contains && mappedX > data.mapTimeToX(it->time + it->cost)) { + // event lies to the left of the selected time + ++it; + continue; + } + Q_ASSERT(contains || mappedX == timeX); + ++ret.numSamples; + ret.maxCost = std::max(ret.maxCost, it->cost); + ret.totalCost += it->cost; ++it; - continue; } - ++numSamples; - maxCost = std::max(maxCost, it->cost); - totalCost += it->cost; - ++it; + return ret; + }; + auto found = findSamples(m_eventType, false); + const auto offCpuCostId = index.data(EventModel::EventResultsRole).value().offCpuTimeCostId; + if (offCpuCostId != -1 && !found.numSamples) { + // check whether we are hovering an off-CPU area + found = findSamples(offCpuCostId, true); } const auto formattedTime = Util::formatTimeString(time - data.minTime); const auto totalCosts = index.data(EventModel::TotalCostsRole).value>(); - if (numSamples > 0) { + if (found.numSamples > 0 && found.type == offCpuCostId) { + QToolTip::showText(event->globalPos(), tr("time: %1\nsched switches: %2\ntotal off-CPU time: %3\nlongest sched switch: %4") + .arg(formattedTime, QString::number(found.numSamples), + Util::formatTimeString(found.totalCost), + Util::formatTimeString(found.maxCost))); + } else if (found.numSamples > 0) { QToolTip::showText(event->globalPos(), tr("time: %1\n%5 samples: %2\ntotal sample cost: %3\nmax sample cost: %4") - .arg(formattedTime).arg(numSamples) - .arg(totalCost).arg(maxCost).arg(totalCosts.value(m_eventType).label)); + .arg(formattedTime, QString::number(found.numSamples), + Util::formatCost(found.totalCost), Util::formatCost(found.maxCost), + totalCosts.value(found.type).label)); } else { QToolTip::showText(event->globalPos(), tr("time: %1 (no %2 samples)") .arg(formattedTime, totalCosts.value(m_eventType).label)); diff --git a/src/parsers/perf/perfparser.cpp b/src/parsers/perf/perfparser.cpp index c5d43911..326f4a94 100644 --- a/src/parsers/perf/perfparser.cpp +++ b/src/parsers/perf/perfparser.cpp @@ -1054,8 +1054,49 @@ struct PerfParserPrivate } if (!contextSwitch.switchOut && thread->state == Data::ThreadEvents::OffCpu) { - thread->offCpuTime += contextSwitch.time - thread->lastSwitchTime; + const auto switchTime = contextSwitch.time - thread->lastSwitchTime; + thread->offCpuTime += switchTime; + + if (eventResult.offCpuTimeCostId == -1) { + eventResult.offCpuTimeCostId = addCostType(PerfParser::tr("off-CPU Time")); + } + + auto& totalCost = summaryResult.costs[eventResult.offCpuTimeCostId]; + totalCost.sampleCount++; + totalCost.totalPeriod += switchTime; + + qint32 stackId = -1; + if (!thread->events.isEmpty() && m_schedSwitchCostId != -1) { + auto it = std::find_if(thread->events.rbegin(), thread->events.rend(), + [this](const Data::Event& event) { + return event.type == m_schedSwitchCostId; + }); + if (it != thread->events.rend()) { + stackId = it->stackId; + } + } + if (stackId != -1) { + const auto& frames = eventResult.stacks[stackId]; + QSet recursionGuard; + auto frameCallback = [this, &recursionGuard, switchTime] + (const Data::Symbol& symbol, const Data::Location& location) + { + addCallerCalleeEvent(symbol, location, + eventResult.offCpuTimeCostId, switchTime, + &recursionGuard, &callerCalleeResult, + bottomUpResult.costs.numTypes()); + }; + bottomUpResult.addEvent(eventResult.offCpuTimeCostId, switchTime, frames, frameCallback); + } + + Data::Event event; + event.time = thread->lastSwitchTime; + event.cost = switchTime; + event.type = eventResult.offCpuTimeCostId; + event.stackId = stackId; + thread->events.push_back(event); } + thread->lastSwitchTime = contextSwitch.time; thread->state = contextSwitch.switchOut ? Data::ThreadEvents::OffCpu @@ -1151,6 +1192,7 @@ struct PerfParserPrivate QHash attributeIdsToCostIds; QHash attributeNameToCostIds; qint32 m_nextCostId = 0; + qint32 m_schedSwitchCostId = -1; }; PerfParser::PerfParser(QObject* parent) diff --git a/src/resultspage.cpp b/src/resultspage.cpp index 9457c2ef..29b3125a 100644 --- a/src/resultspage.cpp +++ b/src/resultspage.cpp @@ -90,7 +90,7 @@ ResultsPage::ResultsPage(PerfParser *parser, QWidget *parent) int typeId = -1; for (const auto& type : data.totalCosts) { ++typeId; - if (!type.sampleCount) { + if (!type.sampleCount || typeId == data.offCpuTimeCostId) { continue; } ui->timeLineEventSource->addItem(type.label, typeId); diff --git a/tests/integrationtests/tst_perfparser.cpp b/tests/integrationtests/tst_perfparser.cpp index 7bb1b1af..46a1ab4a 100644 --- a/tests/integrationtests/tst_perfparser.cpp +++ b/tests/integrationtests/tst_perfparser.cpp @@ -64,13 +64,15 @@ do {\ namespace { template -bool searchForChildSymbol(const T& root, const QString& searchString) +bool searchForChildSymbol(const T& root, const QString& searchString, bool exact = true) { - if (root.symbol.symbol == searchString) { + if (exact && root.symbol.symbol == searchString) { + return true; + } else if (!exact && root.symbol.symbol.contains(searchString)) { return true; } else { for (auto entry : root.children) { - if (searchForChildSymbol(entry, searchString)) { + if (searchForChildSymbol(entry, searchString, exact)) { return true; } } @@ -78,23 +80,25 @@ bool searchForChildSymbol(const T& root, const QString& searchString) return false; } -bool compareCosts(const Data::TopDown &lhs, const Data::TopDown &rhs, const Data::TopDownResults &results) +bool compareCosts(const Data::TopDown &lhs, const Data::TopDown &rhs, const Data::TopDownResults &results, + int costIndex) { - return results.inclusiveCosts.cost(0, lhs.id) < results.inclusiveCosts.cost(0, rhs.id); + return results.inclusiveCosts.cost(costIndex, lhs.id) < results.inclusiveCosts.cost(costIndex, rhs.id); } -bool compareCosts(const Data::BottomUp &lhs, const Data::BottomUp &rhs, const Data::BottomUpResults &results) +bool compareCosts(const Data::BottomUp &lhs, const Data::BottomUp &rhs, const Data::BottomUpResults &results, + int costIndex) { - return results.costs.cost(0, lhs.id) < results.costs.cost(0, rhs.id); + return results.costs.cost(costIndex, lhs.id) < results.costs.cost(costIndex, rhs.id); } template -int maxElementTopIndex(const Results &collection) +int maxElementTopIndex(const Results &collection, int costIndex = 0) { using DataType = decltype(*collection.root.children.begin()); auto topResult = std::max_element(collection.root.children.constBegin(), collection.root.children.constEnd(), - [collection](const DataType &lhs, const DataType &rhs) { - return compareCosts(lhs, rhs, collection); + [collection, costIndex](const DataType &lhs, const DataType &rhs) { + return compareCosts(lhs, rhs, collection, costIndex); }); return std::distance(collection.root.children.begin(), topResult); } @@ -369,7 +373,52 @@ private slots: QVERIFY((thread.timeEnd - thread.timeStart) > thread.offCpuTime); } } + + void testOffCpu() + { + if (!canTrace("events/sched/sched_switch")) { + QSKIP("cannot access sched_switch trace points. execute the following to run this test:\n" + " sudo mount -o remount,mode=755 /sys/kernel/debug{,/tracing} with mode=755"); + } + + const QStringList perfOptions = {"--call-graph", "dwarf", "--switch-events", "-e", "cycles", "-e", "sched:sched_switch"}; + + const QString exePath = qApp->applicationDirPath() + "/../tests/test-clients/cpp-sleep/cpp-sleep"; + + QTemporaryFile tempFile; + tempFile.open(); + + perfRecord(perfOptions, exePath, {}, tempFile.fileName()); + testPerfData(Data::Symbol{"hypot", "libm"}, Data::Symbol{"start", "cpp-sleep"}, tempFile.fileName(), false); + + QCOMPARE(m_bottomUpData.costs.numTypes(), 3); + QCOMPARE(m_bottomUpData.costs.typeName(0), "cycles"); + QCOMPARE(m_bottomUpData.costs.typeName(1), "sched:sched_switch"); + QCOMPARE(m_bottomUpData.costs.typeName(2), "off-CPU Time"); + + // find sched switch hotspot + int bottomUpTopIndex = maxElementTopIndex(m_bottomUpData, 1); + QVERIFY(bottomUpTopIndex != -1); + + // should be the same as off-cpu hotspot + QCOMPARE(bottomUpTopIndex, maxElementTopIndex(m_bottomUpData, 2)); + + const auto topBottomUp = m_bottomUpData.root.children[bottomUpTopIndex]; + QVERIFY(topBottomUp.symbol.symbol.contains("schedule")); + QVERIFY(topBottomUp.symbol.binary.contains("kernel")); + QVERIFY(searchForChildSymbol(topBottomUp, "std::this_thread::sleep_for", false)); + + QVERIFY(m_bottomUpData.costs.cost(1, topBottomUp.id) >= 10); // at least 10 sched switches + QVERIFY(m_bottomUpData.costs.cost(2, topBottomUp.id) >= 1E9); // at least 1s sleep time + } + private: + static bool canTrace(const QString& path) + { + QFileInfo info("/sys/kernel/debug/tracing/" + path); + return info.isDir() && info.isReadable(); + } + Data::Summary m_summaryData; Data::BottomUpResults m_bottomUpData; Data::TopDownResults m_topDownData;