From 22fdce61d61a6b68f84c3e6e263de2860429ccea Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 10 Sep 2022 16:46:34 +0100 Subject: [PATCH 1/8] Fix undefined behaviour in bu_ulonglong --- .../2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst | 1 + Modules/_struct.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst diff --git a/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst new file mode 100644 index 00000000000000..a29ada98ac20e9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst @@ -0,0 +1 @@ +Fix undefined behaviour in :func:`struct.unpack`. diff --git a/Modules/_struct.c b/Modules/_struct.c index 9d66568a282662..5064f660b072bb 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -835,16 +835,19 @@ bu_uint(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * bu_longlong(_structmodulestate *state, const char *p, const formatdef *f) { - long long x = 0; - Py_ssize_t i = f->size; + unsigned long long x = 0; + + /* This function is only ever used in the case f->size == 8. */ + assert(f->size == 8); + Py_ssize_t i = 8; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG_LONG > f->size) - x |= -(x & ((long long)1 << ((8 * f->size) - 1))); - return PyLong_FromLongLong(x); + + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + return PyLong_FromLongLong(x & 0x8000000000000000 ? + -1 - (long long)(0xFFFFFFFFFFFFFFFF - x) : (long long)x); } static PyObject * From e12943b091dfb3e9b606f415024dfdd28caf3ba1 Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 10 Sep 2022 16:58:55 +0100 Subject: [PATCH 2/8] Add regression test --- Lib/test/test_struct.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py index ab738770546c0b..5fe686ffffe842 100644 --- a/Lib/test/test_struct.py +++ b/Lib/test/test_struct.py @@ -726,6 +726,13 @@ def test_issue45034_signed(self): with self.assertRaisesRegex(struct.error, error_msg): struct.pack('h', -70000) # too small + def test_long_long_unpack_undefined_behaviour(self): + # Regression test for python/cpython#96735. + self.assertEqual( + struct.unpack('!q', b'\xff\xff\xff\xff\xff\xff\xff\xff'), + (-1,) + ) + class UnpackIteratorTest(unittest.TestCase): """ From 8efefd8b5c91312c574ae1996052f45d7387e86a Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 10 Sep 2022 17:05:05 +0100 Subject: [PATCH 3/8] Also apply the fix to lu_longlong --- Lib/test/test_struct.py | 6 +++++- Modules/_struct.c | 15 +++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py index 5fe686ffffe842..3770961e2ffaf0 100644 --- a/Lib/test/test_struct.py +++ b/Lib/test/test_struct.py @@ -729,7 +729,11 @@ def test_issue45034_signed(self): def test_long_long_unpack_undefined_behaviour(self): # Regression test for python/cpython#96735. self.assertEqual( - struct.unpack('!q', b'\xff\xff\xff\xff\xff\xff\xff\xff'), + struct.unpack('q', b'\xff\xff\xff\xff\xff\xff\xff\xff'), (-1,) ) diff --git a/Modules/_struct.c b/Modules/_struct.c index 5064f660b072bb..71c6d318b5c592 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -1059,16 +1059,19 @@ lu_uint(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * lu_longlong(_structmodulestate *state, const char *p, const formatdef *f) { - long long x = 0; - Py_ssize_t i = f->size; + unsigned long long x = 0; + + /* This function is only ever used in the case f->size == 8. */ + assert(f->size == 8); + Py_ssize_t i = 8; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG_LONG > f->size) - x |= -(x & ((long long)1 << ((8 * f->size) - 1))); - return PyLong_FromLongLong(x); + + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + return PyLong_FromLongLong(x & 0x8000000000000000 ? + -1 - (long long)(0xFFFFFFFFFFFFFFFF - x) : (long long)x); } static PyObject * From 0d49e73cfc5f71fb984f68ac16efe85b4e70d78e Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 10 Sep 2022 17:27:54 +0100 Subject: [PATCH 4/8] Fix similar undefined behaviour issues in bu_int and lu_int --- Modules/_struct.c | 72 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/Modules/_struct.c b/Modules/_struct.c index 71c6d318b5c592..67cfc913269b20 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -805,19 +805,37 @@ static const formatdef native_table[] = { /* Big-endian routines. *****************************************************/ +static PyObject * +bu_short(_structmodulestate *state, const char *p, const formatdef *f) +{ + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 2. */ + assert(f->size == 2); + Py_ssize_t i = 2; + const unsigned char *bytes = (const unsigned char *)p; + do { + x = (x<<8) | *bytes++; + } while (--i > 0); + /* Extend sign. */ + return PyLong_FromLong(x & 0x8000U ? (long)x - 0x10000 : (long)x); +} + static PyObject * bu_int(_structmodulestate *state, const char *p, const formatdef *f) { long x = 0; - Py_ssize_t i = f->size; + + /* This function is only ever used in the case f->size == 4. */ + assert(f->size == 4); + Py_ssize_t i = 4; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG > f->size) - x |= -(x & (1L << ((8 * f->size) - 1))); - return PyLong_FromLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + return PyLong_FromLong(x & 0x80000000U ? + -1 - (long)(0xFFFFFFFFU - x) : (long)x); } static PyObject * @@ -844,10 +862,9 @@ bu_longlong(_structmodulestate *state, const char *p, const formatdef *f) do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend sign, avoiding implementation-defined or undefined behaviour. */ - return PyLong_FromLongLong(x & 0x8000000000000000 ? - -1 - (long long)(0xFFFFFFFFFFFFFFFF - x) : (long long)x); + return PyLong_FromLongLong(x & 0x8000000000000000U ? + -1 - (long long)(0xFFFFFFFFFFFFFFFFU - x) : (long long)x); } static PyObject * @@ -1012,7 +1029,7 @@ static formatdef bigendian_table[] = { {'c', 1, 0, nu_char, np_char}, {'s', 1, 0, NULL}, {'p', 1, 0, NULL}, - {'h', 2, 0, bu_int, bp_int}, + {'h', 2, 0, bu_short, bp_int}, {'H', 2, 0, bu_uint, bp_uint}, {'i', 4, 0, bu_int, bp_int}, {'I', 4, 0, bu_uint, bp_uint}, @@ -1029,19 +1046,37 @@ static formatdef bigendian_table[] = { /* Little-endian routines. *****************************************************/ +static PyObject * +lu_short(_structmodulestate *state, const char *p, const formatdef *f) +{ + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 2. */ + assert(f->size == 2); + Py_ssize_t i = 2; + const unsigned char *bytes = (const unsigned char *)p; + do { + x = (x<<8) | bytes[--i]; + } while (i > 0); + /* Extend sign. */ + return PyLong_FromLong(x & 0x8000U ? (long)x - 0x10000 : (long)x); +} + static PyObject * lu_int(_structmodulestate *state, const char *p, const formatdef *f) { - long x = 0; - Py_ssize_t i = f->size; + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 4. */ + assert(f->size == 4); + Py_ssize_t i = 4; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG > f->size) - x |= -(x & (1L << ((8 * f->size) - 1))); - return PyLong_FromLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + return PyLong_FromLong(x & 0x80000000U ? + -1 - (long)(0xFFFFFFFFU - x) : (long)x); } static PyObject * @@ -1068,10 +1103,9 @@ lu_longlong(_structmodulestate *state, const char *p, const formatdef *f) do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend sign, avoiding implementation-defined or undefined behaviour. */ - return PyLong_FromLongLong(x & 0x8000000000000000 ? - -1 - (long long)(0xFFFFFFFFFFFFFFFF - x) : (long long)x); + return PyLong_FromLongLong(x & 0x8000000000000000U ? + -1 - (long long)(0xFFFFFFFFFFFFFFFFU - x) : (long long)x); } static PyObject * @@ -1219,7 +1253,7 @@ static formatdef lilendian_table[] = { {'c', 1, 0, nu_char, np_char}, {'s', 1, 0, NULL}, {'p', 1, 0, NULL}, - {'h', 2, 0, lu_int, lp_int}, + {'h', 2, 0, lu_short, lp_int}, {'H', 2, 0, lu_uint, lp_uint}, {'i', 4, 0, lu_int, lp_int}, {'I', 4, 0, lu_uint, lp_uint}, From 5aeef09d6e30859681bfaf37604a676603be7b8b Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 10 Sep 2022 17:37:50 +0100 Subject: [PATCH 5/8] Remove redundant test --- Lib/test/test_struct.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py index 3770961e2ffaf0..ab738770546c0b 100644 --- a/Lib/test/test_struct.py +++ b/Lib/test/test_struct.py @@ -726,17 +726,6 @@ def test_issue45034_signed(self): with self.assertRaisesRegex(struct.error, error_msg): struct.pack('h', -70000) # too small - def test_long_long_unpack_undefined_behaviour(self): - # Regression test for python/cpython#96735. - self.assertEqual( - struct.unpack('q', b'\xff\xff\xff\xff\xff\xff\xff\xff'), - (-1,) - ) - class UnpackIteratorTest(unittest.TestCase): """ From 204e924c85e04b8ea8186c1dfa9cf4618063b127 Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 10 Sep 2022 17:42:25 +0100 Subject: [PATCH 6/8] Fix accumulator type --- Modules/_struct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_struct.c b/Modules/_struct.c index 67cfc913269b20..3a7b5fdb989e33 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -824,7 +824,7 @@ bu_short(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * bu_int(_structmodulestate *state, const char *p, const formatdef *f) { - long x = 0; + unsigned long x = 0; /* This function is only ever used in the case f->size == 4. */ assert(f->size == 4); From 994703e10e0af676559a7028ba77afee4e2bcb23 Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Tue, 13 Sep 2022 20:04:24 +0100 Subject: [PATCH 7/8] Branch-free sign-extension --- Modules/_struct.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/Modules/_struct.c b/Modules/_struct.c index 3a7b5fdb989e33..9312cd3971690a 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -817,8 +817,9 @@ bu_short(_structmodulestate *state, const char *p, const formatdef *f) do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend sign. */ - return PyLong_FromLong(x & 0x8000U ? (long)x - 0x10000 : (long)x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000U) - 0x8000U; + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x): (long)x); } static PyObject * @@ -834,8 +835,8 @@ bu_int(_structmodulestate *state, const char *p, const formatdef *f) x = (x<<8) | *bytes++; } while (--i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ - return PyLong_FromLong(x & 0x80000000U ? - -1 - (long)(0xFFFFFFFFU - x) : (long)x); + x = (x ^ 0x80000000U) - 0x80000000U; + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x): (long)x); } static PyObject * @@ -863,8 +864,9 @@ bu_longlong(_structmodulestate *state, const char *p, const formatdef *f) x = (x<<8) | *bytes++; } while (--i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ - return PyLong_FromLongLong(x & 0x8000000000000000U ? - -1 - (long long)(0xFFFFFFFFFFFFFFFFU - x) : (long long)x); + x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; + return PyLong_FromLongLong( + x & 0x8000000000000000U ? -1 - (long long)(~x): (long long)x); } static PyObject * @@ -1058,8 +1060,9 @@ lu_short(_structmodulestate *state, const char *p, const formatdef *f) do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend sign. */ - return PyLong_FromLong(x & 0x8000U ? (long)x - 0x10000 : (long)x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000U) - 0x8000U; + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x): (long)x); } static PyObject * @@ -1075,8 +1078,8 @@ lu_int(_structmodulestate *state, const char *p, const formatdef *f) x = (x<<8) | bytes[--i]; } while (i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ - return PyLong_FromLong(x & 0x80000000U ? - -1 - (long)(0xFFFFFFFFU - x) : (long)x); + x = (x ^ 0x80000000U) - 0x80000000U; + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x): (long)x); } static PyObject * @@ -1104,8 +1107,9 @@ lu_longlong(_structmodulestate *state, const char *p, const formatdef *f) x = (x<<8) | bytes[--i]; } while (i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ - return PyLong_FromLongLong(x & 0x8000000000000000U ? - -1 - (long long)(0xFFFFFFFFFFFFFFFFU - x) : (long long)x); + x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; + return PyLong_FromLongLong( + x & 0x8000000000000000U ? -1 - (long long)(~x): (long long)x); } static PyObject * From 1294440b5c04c807f36911ab7d8721f87045ca90 Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Tue, 13 Sep 2022 20:11:55 +0100 Subject: [PATCH 8/8] Fix unconventional spacing --- Modules/_struct.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Modules/_struct.c b/Modules/_struct.c index 9312cd3971690a..09f52a981485dd 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -819,7 +819,7 @@ bu_short(_structmodulestate *state, const char *p, const formatdef *f) } while (--i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ x = (x ^ 0x8000U) - 0x8000U; - return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x): (long)x); + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -836,7 +836,7 @@ bu_int(_structmodulestate *state, const char *p, const formatdef *f) } while (--i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ x = (x ^ 0x80000000U) - 0x80000000U; - return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x): (long)x); + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -866,7 +866,7 @@ bu_longlong(_structmodulestate *state, const char *p, const formatdef *f) /* Extend sign, avoiding implementation-defined or undefined behaviour. */ x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; return PyLong_FromLongLong( - x & 0x8000000000000000U ? -1 - (long long)(~x): (long long)x); + x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x); } static PyObject * @@ -1062,7 +1062,7 @@ lu_short(_structmodulestate *state, const char *p, const formatdef *f) } while (i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ x = (x ^ 0x8000U) - 0x8000U; - return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x): (long)x); + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -1079,7 +1079,7 @@ lu_int(_structmodulestate *state, const char *p, const formatdef *f) } while (i > 0); /* Extend sign, avoiding implementation-defined or undefined behaviour. */ x = (x ^ 0x80000000U) - 0x80000000U; - return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x): (long)x); + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -1109,7 +1109,7 @@ lu_longlong(_structmodulestate *state, const char *p, const formatdef *f) /* Extend sign, avoiding implementation-defined or undefined behaviour. */ x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; return PyLong_FromLongLong( - x & 0x8000000000000000U ? -1 - (long long)(~x): (long long)x); + x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x); } static PyObject *