From d39945ec55aaa14d62c90fac3f7541034c5597be Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 3 Oct 2023 18:24:05 +0300 Subject: [PATCH 01/16] gh-110289: C API: Add PyUnicode_EqualToString() function --- Doc/c-api/unicode.rst | 10 +++ Doc/data/stable_abi.dat | 1 + Doc/whatsnew/3.13.rst | 7 +++ Include/unicodeobject.h | 8 +++ Lib/test/test_stable_abi_ctypes.py | 1 + ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 1 + Misc/stable_abi.toml | 2 + Objects/unicodeobject.c | 61 +++++++++++++++++++ PC/python3dll.c | 1 + 9 files changed, 92 insertions(+) create mode 100644 Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 2a2cb1b8c458e7..f552380124bb37 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,6 +1396,16 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. +.. c:function:: int PyUnicode_EqualToString(PyObject *unicode, const char *string) + + Compare a Unicode object with a UTF-8 encoded C string and return true + if they are equal and false otherwise. + + This function does not raise exceptions. + + .. versionadded:: 3.13 + + .. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index c189c78238f40f..abfc186cdc460d 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -755,6 +755,7 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,, function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, +function,PyUnicode_EqualToString,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 1ef04fa7ae6adc..7f05a0275f4664 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1001,6 +1001,13 @@ New Features :c:macro:`Py_TPFLAGS_MANAGED_DICT` flag. (Contributed by Victor Stinner in :gh:`107073`.) +* Add :c:func:`PyUnicode_EqualToString` function: compare Unicode object with + a :c:expr:`const char*` UTF-8 encoded bytes string and return true if they + are equal or false otherwise. + This function does not raise exceptions. + (Contributed by Serhiy Storchaka in :gh:`110289`.) + + Porting to Python 3.13 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f00277787122aa..a7fad22e606b28 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -957,6 +957,14 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( const char *right /* ASCII-encoded string */ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +/* Compare a Unicode object with UTF-8 encoded C string and return 1 for equal + and 0 otherwise. + This function does not raise exceptions. */ + +PyAPI_FUNC(int) PyUnicode_EqualToString(PyObject *, const char *); +#endif + /* Rich compare two strings and return one of the following: - NULL in case an exception was raised diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 94f817f8e1d159..f224d67e6416d6 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -770,6 +770,7 @@ def test_windows_feature_macros(self): "PyUnicode_DecodeUnicodeEscape", "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", + "PyUnicode_EqualToString", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst new file mode 100644 index 00000000000000..ada5072071a476 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -0,0 +1 @@ +Add :c:func:`PyUnicode_EqualToString` function. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 8df3f85e61eec6..20f6ea560b4316 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2460,3 +2460,5 @@ added = '3.13' [function.PyMapping_HasKeyStringWithError] added = '3.13' +[function.PyUnicode_EqualToString] + added = '3.13' diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 49981a1f881c21..8c71990a011849 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10673,6 +10673,67 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } } +int +PyUnicode_EqualToString(PyObject *unicode, const char *str) +{ + assert(_PyUnicode_CHECK(unicode)); + assert(str); + if (PyUnicode_IS_ASCII(unicode)) { + size_t len = (size_t)PyUnicode_GET_LENGTH(unicode); + return strlen(str) == len && + memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; + } + if (PyUnicode_UTF8(unicode) != NULL) { + size_t len = (size_t)PyUnicode_UTF8_LENGTH(unicode); + return strlen(str) == len && + memcmp(PyUnicode_UTF8(unicode), str, len) == 0; + } + + Py_UCS4 ch; + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + int kind = PyUnicode_KIND(unicode); + const void *data = PyUnicode_DATA(unicode); + /* Compare Unicode string and UTF-8 string */ + for (Py_ssize_t i = 0; i < len; i++) { + ch = PyUnicode_READ(kind, data, i); + if (ch == 0x80) { + return 0; + } + else if (ch < 0x80) { + if (ch != (unsigned char)*str++) { + return 0; + } + } + else if (ch < 0x800) { + if ((0xc0 | (ch >> 6)) != (unsigned char)*str++ || + (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + { + return 0; + } + } + else if (ch < 0x10000) { + if (Py_UNICODE_IS_SURROGATE(ch) || + (0xe0 | (ch >> 12)) != (unsigned char)*str++ || + (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || + (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + { + return 0; + } + } + else { + assert(ch <= MAX_UNICODE); + if ((0xf0 | (ch >> 18)) != (unsigned char)*str++ || + (0x80 | ((ch >> 12) & 0x3f)) != (unsigned char)*str++ || + (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || + (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + { + return 0; + } + } + } + return *str == 0; +} + int _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 2c1cc8098ce856..5f629ccf99d28a 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -688,6 +688,7 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful) EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) +EXPORT_FUNC(PyUnicode_EqualToString) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) From 4793161fcb730e2d09794a2b7cf91460b2d48a87 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 3 Oct 2023 21:20:38 +0300 Subject: [PATCH 02/16] Add tests and address review comments. --- Doc/c-api/unicode.rst | 4 ++- Doc/data/stable_abi.dat | 2 +- Doc/whatsnew/3.13.rst | 2 +- Include/unicodeobject.h | 2 +- Lib/test/test_capi/test_unicode.py | 31 +++++++++++++++++++ Lib/test/test_stable_abi_ctypes.py | 2 +- ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 2 +- Misc/stable_abi.toml | 2 +- Modules/_testcapi/unicode.c | 19 ++++++++++++ Objects/unicodeobject.c | 4 +-- PC/python3dll.c | 2 +- 11 files changed, 62 insertions(+), 10 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index f552380124bb37..c8bd0d7f81c7e5 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,10 +1396,12 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. -.. c:function:: int PyUnicode_EqualToString(PyObject *unicode, const char *string) +.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) Compare a Unicode object with a UTF-8 encoded C string and return true if they are equal and false otherwise. + If the Unicode object contains null or surrogate characters or + the C string not encoded to UTF-8 return false. This function does not raise exceptions. diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index abfc186cdc460d..1407659d1ae576 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -755,7 +755,7 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,, function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, -function,PyUnicode_EqualToString,3.13,, +function,PyUnicode_EqualToUTF8,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 7f05a0275f4664..ccc29fd915fcf2 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1001,7 +1001,7 @@ New Features :c:macro:`Py_TPFLAGS_MANAGED_DICT` flag. (Contributed by Victor Stinner in :gh:`107073`.) -* Add :c:func:`PyUnicode_EqualToString` function: compare Unicode object with +* Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded bytes string and return true if they are equal or false otherwise. This function does not raise exceptions. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index a7fad22e606b28..e2787497c47bef 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -962,7 +962,7 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( and 0 otherwise. This function does not raise exceptions. */ -PyAPI_FUNC(int) PyUnicode_EqualToString(PyObject *, const char *); +PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); #endif /* Rich compare two strings and return one of the following: diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 622ee8993907fa..e6e3792c639aec 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1297,6 +1297,37 @@ def test_comparewithasciistring(self): # CRASHES comparewithasciistring([], b'abc') # CRASHES comparewithasciistring(NULL, b'abc') + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_equaltoutf8(self): + """Test PyUnicode_EqualToUTF8()""" + from _testcapi import unicode_equaltoutf8 as equaltoutf8 + + strings = [ + 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', + '\U0001f600\U0001f601\U0001f602' + ] + for s in strings: + b = s.encode() + self.assertEqual(equaltoutf8(s, b), 1) + self.assertEqual(equaltoutf8(b.decode(), b), 1) + self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) + self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) + self.assertEqual(equaltoutf8(s, b + b'x'), 0) + self.assertEqual(equaltoutf8(s, b[:-1]), 0) + self.assertEqual(equaltoutf8(s, b[:-1] + b'x'), 0) + + # surrogateescape + self.assertEqual(equaltoutf8('\udcfe', b'\xfe'), 0) + # surrogatepass + self.assertEqual(equaltoutf8('\udcfe', b'\xed\xb3\xbe'), 0) + + # CRASHES equaltoutf8(b'abc', b'abc') + # CRASHES equaltoutf8([], b'abc') + # CRASHES equaltoutf8(NULL, b'abc') + # CRASHES equaltoutf8('abc') # NULL + @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_richcompare(self): diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index f224d67e6416d6..d1ae4e382c9470 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -770,7 +770,7 @@ def test_windows_feature_macros(self): "PyUnicode_DecodeUnicodeEscape", "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", - "PyUnicode_EqualToString", + "PyUnicode_EqualToUTF8", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst index ada5072071a476..b1582bc1591590 100644 --- a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -1 +1 @@ -Add :c:func:`PyUnicode_EqualToString` function. +Add :c:func:`PyUnicode_EqualToUTF8` function. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 20f6ea560b4316..ae39fea3a66a24 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2460,5 +2460,5 @@ added = '3.13' [function.PyMapping_HasKeyStringWithError] added = '3.13' -[function.PyUnicode_EqualToString] +[function.PyUnicode_EqualToUTF8] added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 232b2ad543fca0..83fb8a7cfbcb87 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1429,6 +1429,24 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args) return PyLong_FromLong(result); } +/* Test PyUnicode_EqualToUTF8() */ +static PyObject * +unicode_equaltoutf8(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + int result; + + if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) + return NULL; + + NULLABLE(left); + result = PyUnicode_EqualToUTF8(left, right); + assert(!PyErr_Occurred()); + return PyLong_FromLong(result); +} + /* Test PyUnicode_RichCompare() */ static PyObject * unicode_richcompare(PyObject *self, PyObject *args) @@ -2044,6 +2062,7 @@ static PyMethodDef TestMethods[] = { {"unicode_replace", unicode_replace, METH_VARARGS}, {"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, + {"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8c71990a011849..4993b0c9c52b17 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10674,7 +10674,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } int -PyUnicode_EqualToString(PyObject *unicode, const char *str) +PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) { assert(_PyUnicode_CHECK(unicode)); assert(str); @@ -10696,7 +10696,7 @@ PyUnicode_EqualToString(PyObject *unicode, const char *str) /* Compare Unicode string and UTF-8 string */ for (Py_ssize_t i = 0; i < len; i++) { ch = PyUnicode_READ(kind, data, i); - if (ch == 0x80) { + if (ch == 0) { return 0; } else if (ch < 0x80) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 5f629ccf99d28a..0beb61f28e0ef8 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -688,7 +688,7 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful) EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) -EXPORT_FUNC(PyUnicode_EqualToString) +EXPORT_FUNC(PyUnicode_EqualToUTF8) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) From c55f9ac784a417bc615f2335c813c0e39437e0fd Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 10:53:32 +0300 Subject: [PATCH 03/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c8bd0d7f81c7e5..c9f11d93638333 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1401,7 +1401,7 @@ They all return ``NULL`` or ``-1`` if an exception occurs. Compare a Unicode object with a UTF-8 encoded C string and return true if they are equal and false otherwise. If the Unicode object contains null or surrogate characters or - the C string not encoded to UTF-8 return false. + the C string is not encoded to UTF-8 return 0. This function does not raise exceptions. From bdf2f1e27cdc42ec976a7a23b83f0aade13a56ad Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 11:30:14 +0300 Subject: [PATCH 04/16] Address some of review comments and test the UTF-8 cache. --- Doc/c-api/unicode.rst | 6 +++--- Lib/test/test_capi/test_unicode.py | 28 +++++++++++++++++---------- Objects/unicodeobject.c | 31 +++++++++++++++++------------- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c9f11d93638333..dec451464137fd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1398,10 +1398,10 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) - Compare a Unicode object with a UTF-8 encoded C string and return true - if they are equal and false otherwise. + Compare a Unicode object with a UTF-8 encoded C string and return true (``1``) + if they are equal and false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 return 0. + the C string is not encoded to UTF-8 return false. This function does not raise exceptions. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e6e3792c639aec..c3d7e3bc4c56ab 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1302,26 +1302,34 @@ def test_comparewithasciistring(self): def test_equaltoutf8(self): """Test PyUnicode_EqualToUTF8()""" from _testcapi import unicode_equaltoutf8 as equaltoutf8 + from _testcapi import unicode_asutf8andsize as asutf8andsize strings = [ 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', '\U0001f600\U0001f601\U0001f602' ] for s in strings: + # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + asutf8andsize(s, 0) b = s.encode() - self.assertEqual(equaltoutf8(s, b), 1) - self.assertEqual(equaltoutf8(b.decode(), b), 1) + self.assertEqual(equaltoutf8(s, b), 1) # Use the UTF-8 cache. + s2 = b.decode() # New Unicode object without the UTF-8 cache. + self.assertEqual(equaltoutf8(s2, b), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) - self.assertEqual(equaltoutf8(s, b + b'x'), 0) - self.assertEqual(equaltoutf8(s, b[:-1]), 0) - self.assertEqual(equaltoutf8(s, b[:-1] + b'x'), 0) - - # surrogateescape - self.assertEqual(equaltoutf8('\udcfe', b'\xfe'), 0) - # surrogatepass - self.assertEqual(equaltoutf8('\udcfe', b'\xed\xb3\xbe'), 0) + self.assertEqual(equaltoutf8(s2, b + b'x'), 0) + self.assertEqual(equaltoutf8(s2, b[:-1]), 0) + self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + + # Surrogate characters are always treated as not equal + self.assertEqual(equaltoutf8('\udcfe', + '\udcfe'.encode("utf8", "surrogateescape")), 0) + self.assertEqual(equaltoutf8('\udcfe', + '\udcfe'.encode("utf8", "surrogatepass")), 0) + self.assertEqual(equaltoutf8('\ud801', + '\ud801'.encode("utf8", "surrogatepass")), 0) # CRASHES equaltoutf8(b'abc', b'abc') # CRASHES equaltoutf8([], b'abc') diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4993b0c9c52b17..18e99a500bf3c6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10678,6 +10678,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) { assert(_PyUnicode_CHECK(unicode)); assert(str); + if (PyUnicode_IS_ASCII(unicode)) { size_t len = (size_t)PyUnicode_GET_LENGTH(unicode); return strlen(str) == len && @@ -10689,49 +10690,53 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) memcmp(PyUnicode_UTF8(unicode), str, len) == 0; } - Py_UCS4 ch; + const unsigned char *s = (const unsigned char *)str; Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); /* Compare Unicode string and UTF-8 string */ for (Py_ssize_t i = 0; i < len; i++) { - ch = PyUnicode_READ(kind, data, i); + Py_UCS4 ch = PyUnicode_READ(kind, data, i); if (ch == 0) { return 0; } else if (ch < 0x80) { - if (ch != (unsigned char)*str++) { + if (s[0] != ch) { return 0; } + s += 1; } else if (ch < 0x800) { - if ((0xc0 | (ch >> 6)) != (unsigned char)*str++ || - (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + if (s[0] != (0xc0 | (ch >> 6)) || + s[1] != (0x80 | (ch & 0x3f))) { return 0; } + s += 2; } else if (ch < 0x10000) { if (Py_UNICODE_IS_SURROGATE(ch) || - (0xe0 | (ch >> 12)) != (unsigned char)*str++ || - (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || - (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + s[0] != (0xe0 | (ch >> 12)) || + s[1] != (0x80 | ((ch >> 6) & 0x3f)) || + s[2] != (0x80 | (ch & 0x3f))) { return 0; } + s += 3; } else { assert(ch <= MAX_UNICODE); - if ((0xf0 | (ch >> 18)) != (unsigned char)*str++ || - (0x80 | ((ch >> 12) & 0x3f)) != (unsigned char)*str++ || - (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || - (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + if (s[0] != (0xf0 | (ch >> 18)) || + s[1] != (0x80 | ((ch >> 12) & 0x3f)) || + s[2] != (0x80 | ((ch >> 6) & 0x3f)) || + s[3] != (0x80 | (ch & 0x3f))) { return 0; } + s += 4; } } - return *str == 0; + return *s == 0; } int From 7223c14e3f9629d777fd27b477ad39d516472d80 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 16:15:04 +0300 Subject: [PATCH 05/16] Address review comments. --- Doc/c-api/unicode.rst | 6 +++--- Lib/test/test_capi/test_unicode.py | 9 +++++++++ Modules/_testcapi/unicode.c | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index dec451464137fd..36926b0681f7bc 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1398,10 +1398,10 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) - Compare a Unicode object with a UTF-8 encoded C string and return true (``1``) - if they are equal and false (``0``) otherwise. + Compare a Unicode object with a UTF-8 or ASCII encoded C string + and return true (``1``) if they are equal and false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 return false. + the C string is not encoded to UTF-8 or ASCII, return false. This function does not raise exceptions. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index c3d7e3bc4c56ab..f3fff6aa4dab9d 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1331,6 +1331,15 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8('\ud801', '\ud801'.encode("utf8", "surrogatepass")), 0) + def check_not_equal_encoding(text, encoding): + self.assertEqual(equaltoutf8(text, text.encode(encoding)), 0) + self.assertNotEqual(text.encode(encoding), text.encode("utf8")) + + # Strings encoded to other encodings are not equal to expected UTF8-encoding string + check_not_equal_encoding('Stéphane', 'latin1') + check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters + check_not_equal_encoding('北京市', 'gbk') + # CRASHES equaltoutf8(b'abc', b'abc') # CRASHES equaltoutf8([], b'abc') # CRASHES equaltoutf8(NULL, b'abc') diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 83fb8a7cfbcb87..094cae40049e6d 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1438,8 +1438,9 @@ unicode_equaltoutf8(PyObject *self, PyObject *args) Py_ssize_t right_len; int result; - if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) + if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) { return NULL; + } NULLABLE(left); result = PyUnicode_EqualToUTF8(left, right); From b2713274d26af3460d60f60b7189a3eeef823b9b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 17:53:32 +0300 Subject: [PATCH 06/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Doc/c-api/unicode.rst | 4 ++-- Doc/whatsnew/3.13.rst | 4 ++-- Lib/test/test_capi/test_unicode.py | 8 +++++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 36926b0681f7bc..ee72af3b5c9cb4 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1399,9 +1399,9 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) Compare a Unicode object with a UTF-8 or ASCII encoded C string - and return true (``1``) if they are equal and false (``0``) otherwise. + and return true (``1``) if they are equal, or false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 or ASCII, return false. + the C string is not encoded to UTF-8 or ASCII, return false (``0``) . This function does not raise exceptions. diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 067ccbd9259b9b..5181d346254b97 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1004,8 +1004,8 @@ New Features (Contributed by Victor Stinner in :gh:`107073`.) * Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with - a :c:expr:`const char*` UTF-8 encoded bytes string and return true if they - are equal or false otherwise. + a :c:expr:`const char*` UTF-8 encoded bytes string and return true (``1``) + if they are equal, or false (``0``) otherwise. This function does not raise exceptions. (Contributed by Serhiy Storchaka in :gh:`110289`.) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index f3fff6aa4dab9d..0dd9bebf33ad9d 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1306,7 +1306,8 @@ def test_equaltoutf8(self): strings = [ 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', - '\U0001f600\U0001f601\U0001f602' + '\U0001f600\U0001f601\U0001f602', + '\U0010ffff', ] for s in strings: # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 @@ -1323,6 +1324,11 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8(s2, b[:-1]), 0) self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + # embedded null chars/bytes + self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) + self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) + self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0) + # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8('\udcfe', '\udcfe'.encode("utf8", "surrogateescape")), 0) From 6f26ad6ccf4726073c912a7b2cb8e9bc469dfb38 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 17:57:32 +0300 Subject: [PATCH 07/16] Remove trailing spaces. --- Lib/test/test_capi/test_unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 0dd9bebf33ad9d..3e32b1b7150ab1 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1328,7 +1328,7 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0) - + # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8('\udcfe', '\udcfe'.encode("utf8", "surrogateescape")), 0) From dd124b87e00aa51f54e1da9adccb7b46c0aa16f5 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 18:41:44 +0300 Subject: [PATCH 08/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Include/unicodeobject.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index e2787497c47bef..1bce505e9c4d32 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -958,8 +958,8 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( ); #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 -/* Compare a Unicode object with UTF-8 encoded C string and return 1 for equal - and 0 otherwise. +/* Compare a Unicode object with UTF-8 encoded C string. + Return 1 if they are equal, or 0 otherwise. This function does not raise exceptions. */ PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); From 76b9177c4158e5b6f9afc898cf0cce8167c48ee4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 18:52:36 +0300 Subject: [PATCH 09/16] Use "z#" instead of "|y#". --- Lib/test/test_capi/test_unicode.py | 2 +- Modules/_testcapi/unicode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 3e32b1b7150ab1..98cc69741baa0a 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1349,7 +1349,7 @@ def check_not_equal_encoding(text, encoding): # CRASHES equaltoutf8(b'abc', b'abc') # CRASHES equaltoutf8([], b'abc') # CRASHES equaltoutf8(NULL, b'abc') - # CRASHES equaltoutf8('abc') # NULL + # CRASHES equaltoutf8('abc', NULL) @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 094cae40049e6d..732d7f48ec49e4 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1438,7 +1438,7 @@ unicode_equaltoutf8(PyObject *self, PyObject *args) Py_ssize_t right_len; int result; - if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) { + if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) { return NULL; } From ee5781d223e3bf55dc0a3de59e3eb81a9726b40a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 5 Oct 2023 22:31:17 +0300 Subject: [PATCH 10/16] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- Doc/c-api/unicode.rst | 2 +- Doc/whatsnew/3.13.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index ee72af3b5c9cb4..57e55cdbfe496a 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1401,7 +1401,7 @@ They all return ``NULL`` or ``-1`` if an exception occurs. Compare a Unicode object with a UTF-8 or ASCII encoded C string and return true (``1``) if they are equal, or false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 or ASCII, return false (``0``) . + the C string is not valid UTF-8, false (``0``) is returned. This function does not raise exceptions. diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 5181d346254b97..ff92bd17065922 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1004,7 +1004,7 @@ New Features (Contributed by Victor Stinner in :gh:`107073`.) * Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with - a :c:expr:`const char*` UTF-8 encoded bytes string and return true (``1``) + a :c:expr:`const char*` UTF-8 encoded string and return true (``1``) if they are equal, or false (``0``) otherwise. This function does not raise exceptions. (Contributed by Serhiy Storchaka in :gh:`110289`.) From 1a4eb7bb149cd1b3dcdde7e822119f80ece85cd6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 6 Oct 2023 09:44:53 +0300 Subject: [PATCH 11/16] Add PyUnicode_EqualToUTF8AndSize(). --- Doc/c-api/unicode.rst | 18 ++++-- Doc/data/stable_abi.dat | 1 + Doc/whatsnew/3.13.rst | 8 +-- Include/unicodeobject.h | 1 + Lib/test/test_capi/test_unicode.py | 58 +++++++++++++++++-- Lib/test/test_stable_abi_ctypes.py | 1 + ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 2 +- Misc/stable_abi.toml | 2 + Modules/_testcapi/unicode.c | 24 ++++++++ Objects/unicodeobject.c | 36 +++++++----- PC/python3dll.c | 1 + 11 files changed, 125 insertions(+), 27 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 57e55cdbfe496a..00f4bac65a252a 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,11 +1396,12 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. -.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) +.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size) - Compare a Unicode object with a UTF-8 or ASCII encoded C string - and return true (``1``) if they are equal, or false (``0``) otherwise. - If the Unicode object contains null or surrogate characters or + Compare a Unicode object with a char buffer which is interpreted as + being UTF-8 or ASCII encoded and return true (``1``) if they are equal, + or false (``0``) otherwise. + If the Unicode object contains surrogate characters or the C string is not valid UTF-8, false (``0``) is returned. This function does not raise exceptions. @@ -1408,6 +1409,15 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. versionadded:: 3.13 +.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) + + Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute the string + length using :c:func:`!strlen`. + If the Unicode object contains null characters, false (``0``) is returned. + + .. versionadded:: 3.13 + + .. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 1407659d1ae576..bfb1f97b554fc6 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -756,6 +756,7 @@ function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, function,PyUnicode_EqualToUTF8,3.13,, +function,PyUnicode_EqualToUTF8AndSize,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index ff92bd17065922..2577606373e4ba 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1003,10 +1003,10 @@ New Features functions on Python 3.11 and 3.12. (Contributed by Victor Stinner in :gh:`107073`.) -* Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with - a :c:expr:`const char*` UTF-8 encoded string and return true (``1``) - if they are equal, or false (``0``) otherwise. - This function does not raise exceptions. +* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` + functions: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded + string and return true (``1``) if they are equal, or false (``0``) otherwise. + These functions do not raise exceptions. (Contributed by Serhiy Storchaka in :gh:`110289`.) * Add :c:func:`PyThreadState_GetUnchecked()` function: similar to diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 1bce505e9c4d32..dee00715b3c51d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -963,6 +963,7 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( This function does not raise exceptions. */ PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); +PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t); #endif /* Rich compare two strings and return one of the following: diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 98cc69741baa0a..0bbab9bfc0ec01 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1320,6 +1320,7 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) + self.assertEqual(equaltoutf8(s + '\0', b), 0) self.assertEqual(equaltoutf8(s2, b + b'x'), 0) self.assertEqual(equaltoutf8(s2, b[:-1]), 0) self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) @@ -1337,8 +1338,54 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8('\ud801', '\ud801'.encode("utf8", "surrogatepass")), 0) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_equaltoutf8andsize(self): + """Test PyUnicode_EqualToUTF8AndSize()""" + from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize + from _testcapi import unicode_asutf8andsize as asutf8andsize + + strings = [ + 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', + '\U0001f600\U0001f601\U0001f602', + '\U0010ffff', + ] + for s in strings: + # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + asutf8andsize(s, 0) + b = s.encode() + self.assertEqual(equaltoutf8andsize(s, b), 1) # Use the UTF-8 cache. + s2 = b.decode() # New Unicode object without the UTF-8 cache. + self.assertEqual(equaltoutf8andsize(s2, b), 1) + self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1) + self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1) + self.assertEqual(equaltoutf8andsize(s + '\0', b), 0) + self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0) + self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0) + self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0) + # Not null-terminated, + self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1) + self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1) + self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1) + self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) + + # embedded null chars/bytes + self.assertEqual(equaltoutf8andsize('abc', b'abc\0def\0'), 0) + self.assertEqual(equaltoutf8andsize('a\0bc', b'abc'), 0) + self.assertEqual(equaltoutf8andsize('abc', b'a\0bc'), 0) + + # Surrogate characters are always treated as not equal + self.assertEqual(equaltoutf8andsize('\udcfe', + '\udcfe'.encode("utf8", "surrogateescape")), 0) + self.assertEqual(equaltoutf8andsize('\udcfe', + '\udcfe'.encode("utf8", "surrogatepass")), 0) + self.assertEqual(equaltoutf8andsize('\ud801', + '\ud801'.encode("utf8", "surrogatepass")), 0) + def check_not_equal_encoding(text, encoding): - self.assertEqual(equaltoutf8(text, text.encode(encoding)), 0) + self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0) self.assertNotEqual(text.encode(encoding), text.encode("utf8")) # Strings encoded to other encodings are not equal to expected UTF8-encoding string @@ -1346,10 +1393,11 @@ def check_not_equal_encoding(text, encoding): check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters check_not_equal_encoding('北京市', 'gbk') - # CRASHES equaltoutf8(b'abc', b'abc') - # CRASHES equaltoutf8([], b'abc') - # CRASHES equaltoutf8(NULL, b'abc') - # CRASHES equaltoutf8('abc', NULL) + # CRASHES equaltoutf8andsize('abc', b'abc', -1) + # CRASHES equaltoutf8andsize(b'abc', b'abc') + # CRASHES equaltoutf8andsize([], b'abc') + # CRASHES equaltoutf8andsize(NULL, b'abc') + # CRASHES equaltoutf8andsize('abc', NULL) @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index d1ae4e382c9470..2a22f6edbf4761 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -771,6 +771,7 @@ def test_windows_feature_macros(self): "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", "PyUnicode_EqualToUTF8", + "PyUnicode_EqualToUTF8AndSize", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst index b1582bc1591590..9028e35130d50c 100644 --- a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -1 +1 @@ -Add :c:func:`PyUnicode_EqualToUTF8` function. +Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index ae39fea3a66a24..4503a9c45d4ac0 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2462,3 +2462,5 @@ added = '3.13' [function.PyUnicode_EqualToUTF8] added = '3.13' +[function.PyUnicode_EqualToUTF8AndSize] + added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 732d7f48ec49e4..d52d88a65d86fc 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1448,6 +1448,29 @@ unicode_equaltoutf8(PyObject *self, PyObject *args) return PyLong_FromLong(result); } +/* Test PyUnicode_EqualToUTF8AndSize() */ +static PyObject * +unicode_equaltoutf8andsize(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + Py_ssize_t size = -100; + int result; + + if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) { + return NULL; + } + + NULLABLE(left); + if (size == -100) { + size = right_len; + } + result = PyUnicode_EqualToUTF8AndSize(left, right, size); + assert(!PyErr_Occurred()); + return PyLong_FromLong(result); +} + /* Test PyUnicode_RichCompare() */ static PyObject * unicode_richcompare(PyObject *self, PyObject *args) @@ -2064,6 +2087,7 @@ static PyMethodDef TestMethods[] = { {"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, {"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS}, + {"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18e99a500bf3c6..63b65f35f2936a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10675,39 +10675,47 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) int PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) +{ + return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str)); +} + +int +PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size) { assert(_PyUnicode_CHECK(unicode)); assert(str); if (PyUnicode_IS_ASCII(unicode)) { - size_t len = (size_t)PyUnicode_GET_LENGTH(unicode); - return strlen(str) == len && + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + return size == len && memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; } if (PyUnicode_UTF8(unicode) != NULL) { - size_t len = (size_t)PyUnicode_UTF8_LENGTH(unicode); - return strlen(str) == len && + Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode); + return size == len && memcmp(PyUnicode_UTF8(unicode), str, len) == 0; } - const unsigned char *s = (const unsigned char *)str; Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) { + return 0; + } + const unsigned char *s = (const unsigned char *)str; + const unsigned char *ends = s + (size_t)size; int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); /* Compare Unicode string and UTF-8 string */ for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - if (ch == 0) { - return 0; - } - else if (ch < 0x80) { - if (s[0] != ch) { + if (ch < 0x80) { + if (ends == s || s[0] != ch) { return 0; } s += 1; } else if (ch < 0x800) { - if (s[0] != (0xc0 | (ch >> 6)) || + if (ends - s < 2 || + s[0] != (0xc0 | (ch >> 6)) || s[1] != (0x80 | (ch & 0x3f))) { return 0; @@ -10716,6 +10724,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) } else if (ch < 0x10000) { if (Py_UNICODE_IS_SURROGATE(ch) || + ends - s < 3 || s[0] != (0xe0 | (ch >> 12)) || s[1] != (0x80 | ((ch >> 6) & 0x3f)) || s[2] != (0x80 | (ch & 0x3f))) @@ -10726,7 +10735,8 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) } else { assert(ch <= MAX_UNICODE); - if (s[0] != (0xf0 | (ch >> 18)) || + if (ends - s < 4 || + s[0] != (0xf0 | (ch >> 18)) || s[1] != (0x80 | ((ch >> 12) & 0x3f)) || s[2] != (0x80 | ((ch >> 6) & 0x3f)) || s[3] != (0x80 | (ch & 0x3f))) @@ -10736,7 +10746,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) s += 4; } } - return *s == 0; + return s == ends; } int diff --git a/PC/python3dll.c b/PC/python3dll.c index 0beb61f28e0ef8..1fb4c810cf1cfb 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -689,6 +689,7 @@ EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EqualToUTF8) +EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) From b1243770a7bb5d8234708d8070320fd256a8b5df Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 7 Oct 2023 15:43:47 +0300 Subject: [PATCH 12/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Doc/c-api/unicode.rst | 2 +- Lib/test/test_capi/test_unicode.py | 5 ++--- Objects/unicodeobject.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 00f4bac65a252a..5ab9f1cab23ef8 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1411,7 +1411,7 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) - Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute the string + Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string* length using :c:func:`!strlen`. If the Unicode object contains null characters, false (``0``) is returned. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 0bbab9bfc0ec01..e10c4ff3a94f7c 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1372,9 +1372,8 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) # embedded null chars/bytes - self.assertEqual(equaltoutf8andsize('abc', b'abc\0def\0'), 0) - self.assertEqual(equaltoutf8andsize('a\0bc', b'abc'), 0) - self.assertEqual(equaltoutf8andsize('abc', b'a\0bc'), 0) + self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def', 7), 1) + self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0', 8), 1) # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8andsize('\udcfe', diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 63b65f35f2936a..e234277c37513a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10714,7 +10714,7 @@ PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size s += 1; } else if (ch < 0x800) { - if (ends - s < 2 || + if ((ends - s) < 2 || s[0] != (0xc0 | (ch >> 6)) || s[1] != (0x80 | (ch & 0x3f))) { From 029f1a06efd2e41139d2ce9842e6a5511163c74a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 7 Oct 2023 15:48:57 +0300 Subject: [PATCH 13/16] Add more parentheses. --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e234277c37513a..33cbc987d43282 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10724,7 +10724,7 @@ PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size } else if (ch < 0x10000) { if (Py_UNICODE_IS_SURROGATE(ch) || - ends - s < 3 || + (ends - s) < 3 || s[0] != (0xe0 | (ch >> 12)) || s[1] != (0x80 | ((ch >> 6) & 0x3f)) || s[2] != (0x80 | (ch & 0x3f))) @@ -10735,7 +10735,7 @@ PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size } else { assert(ch <= MAX_UNICODE); - if (ends - s < 4 || + if ((ends - s) < 4 || s[0] != (0xf0 | (ch >> 18)) || s[1] != (0x80 | ((ch >> 12) & 0x3f)) || s[2] != (0x80 | ((ch >> 6) & 0x3f)) || From be2ffe844a5f274168f2f7e554f8e1f745e83cb9 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 7 Oct 2023 15:50:58 +0300 Subject: [PATCH 14/16] Remove redundant arguments. --- Lib/test/test_capi/test_unicode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e10c4ff3a94f7c..28ab4ddb46009b 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1372,8 +1372,8 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) # embedded null chars/bytes - self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def', 7), 1) - self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0', 8), 1) + self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1) + self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1) # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8andsize('\udcfe', From 78de49d5f40466abfa88640eb251d956e6ebb855 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 10 Oct 2023 23:34:19 +0300 Subject: [PATCH 15/16] Turn docstrings into comments. --- Lib/test/test_capi/test_unicode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 28ab4ddb46009b..dd0dc950ca0b90 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1300,7 +1300,7 @@ def test_comparewithasciistring(self): @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_equaltoutf8(self): - """Test PyUnicode_EqualToUTF8()""" + # Test PyUnicode_EqualToUTF8() from _testcapi import unicode_equaltoutf8 as equaltoutf8 from _testcapi import unicode_asutf8andsize as asutf8andsize @@ -1341,7 +1341,7 @@ def test_equaltoutf8(self): @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_equaltoutf8andsize(self): - """Test PyUnicode_EqualToUTF8AndSize()""" + # Test PyUnicode_EqualToUTF8AndSize() from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize from _testcapi import unicode_asutf8andsize as asutf8andsize From 19ad12633077bc6122a2a1340d6843f8da241574 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 11 Oct 2023 13:05:25 +0300 Subject: [PATCH 16/16] Add tests for empty strings. --- Lib/test/test_capi/test_unicode.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index dd0dc950ca0b90..a73e669dda7ddc 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1319,12 +1319,17 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8(s2, b), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8(s, b + b'\0'), 1) + self.assertEqual(equaltoutf8(s2, b + b'\0'), 1) self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) self.assertEqual(equaltoutf8(s + '\0', b), 0) self.assertEqual(equaltoutf8(s2, b + b'x'), 0) self.assertEqual(equaltoutf8(s2, b[:-1]), 0) self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + self.assertEqual(equaltoutf8('', b''), 1) + self.assertEqual(equaltoutf8('', b'\0'), 1) + # embedded null chars/bytes self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) @@ -1360,6 +1365,8 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s2, b), 1) self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0) + self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0) self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1) self.assertEqual(equaltoutf8andsize(s + '\0', b), 0) self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0) @@ -1371,6 +1378,10 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1) self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) + self.assertEqual(equaltoutf8andsize('', b''), 1) + self.assertEqual(equaltoutf8andsize('', b'\0'), 0) + self.assertEqual(equaltoutf8andsize('', b'x', 0), 1) + # embedded null chars/bytes self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1) self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)