Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add patch to work around potential crash in PyTorch 1.13.1 caused by GCC 12 compiler bug #19133

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ patches = [
'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_disable-test-sharding.patch',
'PyTorch-1.13.1_fix-duplicate-kDefaultTimeout-definition.patch',
'PyTorch-1.13.1_fix-flaky-jit-test.patch',
'PyTorch-1.13.1_fix-fsdp-fp16-test.patch',
'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch',
Expand All @@ -42,6 +43,7 @@ patches = [
'PyTorch-1.13.1_skip-failing-grad-test.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-1.13.1_workaround-gcc12-destructor-exception-bug.patch',
]
checksums = [
{'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
Expand All @@ -67,6 +69,8 @@ checksums = [
{'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_disable-test-sharding.patch': 'df2074adeba47998ce2993d99ca64eb6f1c79ab7057f553b436efdec264d3572'},
{'PyTorch-1.13.1_fix-duplicate-kDefaultTimeout-definition.patch':
'882f8cfaf33490a4372928fb6673cbbfa40e5be1b64bf7e0cc2924d73cf872e8'},
{'PyTorch-1.13.1_fix-flaky-jit-test.patch': '71efdeb29b5e5b4982c9f5cb2182733654a34d52f85bb5487bc4d7d99b86101b'},
{'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'},
{'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch':
Expand Down Expand Up @@ -95,6 +99,8 @@ checksums = [
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-1.13.1_workaround-gcc12-destructor-exception-bug.patch':
'a09a2d7ebd428c65988729578bb3fa372565ba176ab9ed7abf11f6fcb15e903e'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]
Expand Down Expand Up @@ -123,8 +129,6 @@ dependencies = [
('expecttest', '0.1.3'),
]

custom_opts = ['CMAKE_CXX_STANDARD=17']

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
From 9a18968253e28ba8d8bdf646731087000c7876b7 Mon Sep 17 00:00:00 2001
From: sclarkson <[email protected]>
Date: Tue, 21 Mar 2023 21:44:49 +0000
Subject: [PATCH] Fix kDefaultTimeout multiple definition build failure
(#97270)

Make the namespace explicit to avoid the constexpr conflict on GCC 11.

Fixes #90448

@ezyang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97270
Approved by: https://github.com/ezyang
---
torch/csrc/distributed/c10d/ProcessGroupGloo.hpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
index a64bc37c4de522..e4d2967c8604ea 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
@@ -125,7 +125,7 @@ class TORCH_API ProcessGroupGloo : public Backend {
}

void wait(const std::vector<std::string>& keys) override {
- store_->wait(keys, Store::kDefaultTimeout);
+ store_->wait(keys, ::c10d::Store::kDefaultTimeout);
}

void wait(
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
GCC 12 introduced a regression that may cause it to call the destructor twice on an object.
See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112301
This is visible in e.g. `test_cpp_extensions_jit.py -k test_warning`
See also https://github.com/pytorch/pytorch/issues/112383

Work around this by trying to avoid the throwing PyWarningHandler destructor.

Author: Alexander Grund (TU Dresden)

diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp
index 5210d6f713d..3c91ed378e1 100644
--- a/torch/csrc/Exceptions.cpp
+++ b/torch/csrc/Exceptions.cpp
@@ -226,6 +226,10 @@ PyWarningHandler::PyWarningHandler() noexcept(true)
/// NOLINTNEXTLINE(bugprone-exception-escape)
PyWarningHandler::~PyWarningHandler() noexcept(false) {
c10::Warning::set_warning_handler(prev_handler_);
+ process_warnings();
+}
+
+void PyWarningHandler::process_warnings() {
auto& warning_buffer = internal_handler_.warning_buffer_;

if (warning_buffer.size() > 0) {
diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h
index 89256c64bba..8514d08c8d0 100644
--- a/torch/csrc/Exceptions.h
+++ b/torch/csrc/Exceptions.h
@@ -110,6 +110,13 @@ static inline void PyErr_SetString(PyObject* type, const std::string& message) {
retstmnt; \
}

+/** To be called inside a HANDLE_TH_ERRORS..END_HANDLE_TH_ERRORS_* block
+ * before returning a value / where no further warnings can occur.
+ * Avoids throwing an error in the destructor which triggers a bug in GCC 12+
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112301
+ */
+#define FLUSH_TH_ERRORS __enforce_warning_buffer.process_warnings();
+
#define END_HANDLE_TH_ERRORS_PYBIND \
} \
catch (...) { \
@@ -381,6 +388,9 @@ struct PyWarningHandler {
in_exception_ = true;
}

+ // Trigger processing of warnings
+ TORCH_API void process_warnings();
+
private:
InternalHandler internal_handler_;
at::WarningHandler* prev_handler_;
@@ -394,12 +404,20 @@ using Arg = typename function_traits<Func>::template arg<i>::type;
template <typename Func, size_t... Is>
auto wrap_pybind_function_impl_(Func&& f, std::index_sequence<Is...>) {
using traits = function_traits<Func>;
+ using result_type = typename traits::result_type;
namespace py = pybind11;

// f=f is needed to handle function references on older compilers
- return [f = f](Arg<Func, Is>... args) -> typename traits::result_type {
+ return [f = f](Arg<Func, Is>... args) -> result_type {
HANDLE_TH_ERRORS
- return f(std::forward<Arg<Func, Is>>(args)...);
+ if constexpr (std::is_void<result_type>::value) {
+ c10::guts::invoke(f, std::forward<Arg<Func, Is>>(args)...);
+ FLUSH_TH_ERRORS
+ } else {
+ auto res = c10::guts::invoke(f, std::forward<Arg<Func, Is>>(args)...);
+ FLUSH_TH_ERRORS
+ return res;
+ }
END_HANDLE_TH_ERRORS_PYBIND
};
}
Loading