Skip to content

Commit

Permalink
Log allocation failures (#988)
Browse files Browse the repository at this point in the history
Right now when we run out of memory, the last allocation that causes the OOM is not being logged. Adding a new `allocate failure` line to the log to help with debugging. For now these lines are ignored by the replay utility.

@abellina

Authors:
  - Rong Ou (https://github.com/rongou)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #988
  • Loading branch information
rongou authored Mar 15, 2022
1 parent 4997097 commit 6856d86
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 12 deletions.
3 changes: 2 additions & 1 deletion benchmarks/replay/replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,10 +217,11 @@ struct replay_benchmark {
cv.wait(lock, [&]() { return event_index == event.index; });
}

// rmm::detail::action::ALLOCATE_FAILURE is ignored.
if (rmm::detail::action::ALLOCATE == event.act) {
auto ptr = mr_->allocate(event.size);
set_allocation(event.pointer, allocation{ptr, event.size});
} else {
} else if (rmm::detail::action::FREE == event.act) {
auto alloc = remove_allocation(event.pointer);
mr_->deallocate(alloc.ptr, event.size);
}
Expand Down
21 changes: 16 additions & 5 deletions benchmarks/utilities/log_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

namespace rmm::detail {

enum class action : bool { ALLOCATE, FREE };
enum class action { ALLOCATE, FREE, ALLOCATE_FAILURE };

/**
* @brief Represents an allocation event
Expand Down Expand Up @@ -85,8 +85,13 @@ struct event {

inline std::ostream& operator<<(std::ostream& os, event const& evt)
{
const auto* act_string = (evt.act == action::ALLOCATE) ? "allocate" : "free";

const auto* act_string{[&evt] {
switch (evt.act) {
case action::ALLOCATE: return "allocate";
case action::FREE: return "free";
default: return "allocate failure";
}
}()};
const auto format_width{9};

os << "Thread: " << evt.thread_id << std::setw(format_width) << act_string
Expand Down Expand Up @@ -164,8 +169,14 @@ inline std::vector<event> parse_csv(std::string const& filename)

for (std::size_t i = 0; i < actions.size(); ++i) {
auto const& action = actions[i];
RMM_EXPECTS((action == "allocate") or (action == "free"), "Invalid action string.");
auto act = (action == "allocate") ? action::ALLOCATE : action::FREE;
RMM_EXPECTS((action == "allocate") or (action == "allocate failure") or (action == "free"),
"Invalid action string.");
auto act{action::ALLOCATE_FAILURE};
if (action == "allocate") {
act = action::ALLOCATE;
} else if (action == "free") {
act = action::FREE;
}
events[i] = event{tids[i], act, sizes[i], pointers[i], streams[i], i};
}
return events;
Expand Down
23 changes: 17 additions & 6 deletions include/rmm/mr/device/logging_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,16 @@ class logging_resource_adaptor final : public device_memory_resource {
* @brief Allocates memory of size at least `bytes` using the upstream
* resource and logs the allocation.
*
* If the upstream allocation is successful logs the
* following CSV formatted line to the file specified at construction:
* If the upstream allocation is successful, logs the following CSV formatted
* line to the file specified at construction:
* ```
* thread_id,*TIMESTAMP*,"allocate",*bytes*,*stream*
* thread_id,*TIMESTAMP*,"allocate",*pointer*,*bytes*,*stream*
* ```
*
* If the upstream allocation failed, logs the following CSV formatted line
* to the file specified at construction:
* ```
* thread_id,*TIMESTAMP*,"allocate failure",0x0,*bytes*,*stream*
* ```
*
* The returned pointer has at least 256B alignment.
Expand All @@ -222,9 +228,14 @@ class logging_resource_adaptor final : public device_memory_resource {
*/
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
auto const ptr = upstream_->allocate(bytes, stream);
logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value()));
return ptr;
try {
auto const ptr = upstream_->allocate(bytes, stream);
logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value()));
return ptr;
} catch (...) {
logger_->info("allocate failure,{},{},{}", nullptr, bytes, fmt::ptr(stream.value()));
throw;
}
}

/**
Expand Down
35 changes: 35 additions & 0 deletions tests/logger_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "./byte_literals.hpp"
#include <benchmarks/utilities/log_parser.hpp>

#include <rmm/mr/device/cuda_memory_resource.hpp>
Expand All @@ -24,6 +25,9 @@

#include <thread>

namespace rmm::test {
namespace {

class raii_restore_env {
public:
raii_restore_env(char const* name) : name_(name)
Expand Down Expand Up @@ -212,6 +216,34 @@ TEST(Adaptor, EnvironmentPath)
expect_log_events(filename, expected_events);
}

TEST(Adaptor, AllocateFailure)
{
std::string filename{"logs/failure.txt"};
rmm::mr::cuda_memory_resource upstream;

auto log_mr = rmm::mr::make_logging_adaptor(&upstream, filename);

auto const size0{99};
auto const size1{1_TiB};

auto* ptr0 = log_mr.allocate(size0);
log_mr.deallocate(ptr0, size0);
try {
log_mr.allocate(size1);
} catch (...) {
}
log_mr.flush();

using rmm::detail::action;
using rmm::detail::event;

std::vector<event> expected_events{{action::ALLOCATE, size0, ptr0},
{action::FREE, size0, ptr0},
{action::ALLOCATE_FAILURE, size1, nullptr}};

expect_log_events(filename, expected_events);
}

TEST(Adaptor, STDOUT)
{
testing::internal::CaptureStdout();
Expand Down Expand Up @@ -247,3 +279,6 @@ TEST(Adaptor, STDERR)
std::string header = output.substr(0, output.find('\n'));
ASSERT_EQ(header, log_mr.header());
}

} // namespace
} // namespace rmm::test

0 comments on commit 6856d86

Please sign in to comment.