vstinner · encukou · May 27, 2024 · Jun 9, 2024 · Jun 11, 2024 · Jun 11, 2024
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -341,6 +341,75 @@ APIs:
    .. versionadded:: 3.3
 
 
+.. c:function:: const void* PyUnicode_Export(PyObject *unicode, uint32_t supported_formats,
+                 Py_ssize_t *nbytes, uint32_t *format)
+
+   Get the contents of a string in an “export format”.
+
+   Set *supported_formats* to formats from the following list, OR-ed together:
+
+   .. c:namespace:: NULL
+
+   ========================================  ========  ============================
+   Constant Identifier                       Value     Description
+   ========================================  ========  ============================
+   .. c:macro:: PyUnicode_FORMAT_ASCII       ``0x01``  ASCII string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS1        ``0x02``  UCS-1 string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS2        ``0x04``  UCS-2 string (``Py_UCS2*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS4        ``0x08``  UCS-4 string (``Py_UCS4*``)
+   .. c:macro:: PyUnicode_FORMAT_UTF8        ``0x10``  UTF-8 string (``char*``)
+   ========================================  ========  ============================
+
+   Note that future versions of Python may introduce additional formats.
+
+   On success:
+
+   * Return a buffer containing the string data. Note that the buffer is not
+     necessarily zero-terminated.
+   * Set *\*format* to the buffer's format -- this will be one of the flags
+     set in *supported_formats*.
+   * Set *\*nbytes* to the size of the buffer, in bytes.
+
+   On error, set an exception, set *\*format* and *\*nbytes* to zero, and
+   return ``NULL``.
+
+   The returned buffer must be later released using
+   :c:func:`PyUnicode_ReleaseExport`.
+
+   The returned buffer must not be modified.
+
+   If possible, the export is a zero-copy operation -- for example,
+   the string's underlying storage is returned.
+
+   *unicode*, *nbytes* and *format* must not be NULL.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: void PyUnicode_ReleaseExport(PyObject *unicode, const void* data, uint32_t format)
+
+   Release a string's export buffer. The buffer is invalid after this call.
+
+   Each argument must match the corresponding argument or result of
+   a single earlier call to :c:func:`PyUnicode_Export`.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format)
+
+   Create a string object from a buffer in an “export format”.
+
+   * Return a reference to a new string object on success.
+   * Set an exception and return ``NULL`` on error.
+
+   *data* must not be NULL. *nbytes* must be positive or zero.
+
+   See :c:func:`PyUnicode_Export` for the available export formats.
+
+   .. versionadded:: 3.14
+
+
 .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
                                                     Py_ssize_t size)
 

diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -298,6 +298,11 @@ New Features
 
   (Contributed by Victor Stinner in :gh:`119182`.)
 
+* Add :c:func:`PyUnicode_Import` and :c:func:`PyUnicode_Export` functions to
+  import and export strings from/to buffers in a given format.
+  (Contributed by Victor Stinner in :gh:`119609`.)
+
+
 Porting to Python 3.14
 ----------------------
 

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -248,6 +248,37 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
     const char *u              /* UTF-8 encoded string */
     );
 
+#define PyUnicode_FORMAT_ASCII 0x01  /* ASCII string (Py_UCS1*) */
+#define PyUnicode_FORMAT_UCS1 0x02   /* UCS-1 string (Py_UCS1*) */
+#define PyUnicode_FORMAT_UCS2 0x04   /* UCS-2 string (Py_UCS2*) */
+#define PyUnicode_FORMAT_UCS4 0x08   /* UCS-4 string (Py_UCS4*) */
+#define PyUnicode_FORMAT_UTF8 0x10   /* UTF-8 string (char*) */
+
+// Export the content of a string in one of the 'supported_formats':
+// - Return the buffer, set '*nbytes' and '*format' on success.
+// - Set an exception, zero '*nbytes' and '*format', return NULL on error.
+//
+// The read-only export must be released by PyUnicode_ReleaseExport().
+PyAPI_FUNC(const void*) PyUnicode_Export(
+    PyObject *unicode,
+    uint32_t supported_formats,  /* PyUnicode_FORMAT_* flags, OR-ed together */
+    Py_ssize_t *nbytes,          /* out: size of the buffer, in bytes */
+    uint32_t *format);           /* out: format of the returned buffer */
+
+// Release an export created by a single earlier PyUnicode_Export() call.
+PyAPI_FUNC(void) PyUnicode_ReleaseExport(
+    PyObject *unicode,           /* string passed to PyUnicode_Export() */
+    const void* data,            /* buffer returned by PyUnicode_Export() */
+    uint32_t format);            /* '*format' set by PyUnicode_Export() */
+
+// Create a string object from a buffer in the format 'format'.
+// - Return a reference to a new string object on success.
+// - Set an exception and return NULL on error.
+PyAPI_FUNC(PyObject*) PyUnicode_Import(
+    const void *data,            /* must not be NULL */
+    Py_ssize_t nbytes,           /* must be positive or zero */
+    uint32_t format);            /* a PyUnicode_FORMAT_* constant */
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -24,6 +24,14 @@ class Str(str):
     pass
 
 
+PyUnicode_FORMAT_ASCII = 0x01
+PyUnicode_FORMAT_UCS1 = 0x02
+PyUnicode_FORMAT_UCS2 = 0x04
+PyUnicode_FORMAT_UCS4 = 0x08
+PyUnicode_FORMAT_UTF8 = 0x10
+# Invalid export format: not one of the flags above
+PyUnicode_FORMAT_INVALID = 0x20
+
 class CAPITest(unittest.TestCase):
 
     @support.cpython_only
@@ -1675,6 +1683,163 @@ def test_pep393_utf8_caching_bug(self):
                 # Check that the second call returns the same result
                 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
 
-
-if __name__ == "__main__":
+    def test_unicode_export(self):
+        # Test PyUnicode_Export() and PyUnicode_ReleaseExport()
+        unicode_export = _testlimitedcapi.unicode_export
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        # export to the native format
+        formats = (PyUnicode_FORMAT_ASCII
+                   | PyUnicode_FORMAT_UCS1
+                   | PyUnicode_FORMAT_UCS2
+                   | PyUnicode_FORMAT_UCS4)
+        self.assertEqual(unicode_export("abc", formats),
+                         (b'abc', PyUnicode_FORMAT_ASCII))
+        self.assertEqual(unicode_export("latin1:\xe9", formats),
+                         (b'latin1:\xe9', PyUnicode_FORMAT_UCS1))
+        self.assertEqual(unicode_export('ucs2:\u20ac', formats),
+                         ('ucs2:\u20ac'.encode(ucs2_enc),
+                          PyUnicode_FORMAT_UCS2))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4))
+
+        # export ASCII as UCS1
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1),
+                         (b'abc', PyUnicode_FORMAT_UCS1))
+
+        # always export to UCS4
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
+                         ('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
+                         ('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
+                         ('ucs2:\u20ac'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4))
+
+        # export to UCS2 unless it's UCS4
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS2),
+                         ('abc'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS2),
+                         ('latin1:\xe9'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS2),
+                         ('ucs2:\u20ac'.encode(ucs2_enc),
+                          PyUnicode_FORMAT_UCS2))
+
+        # always export to UTF8
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
+                         ('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
+                         ('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
+                         ('ucs2:\u20ac'.encode('utf8'),
+                          PyUnicode_FORMAT_UTF8))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
+                         ('ucs4:\U0010ffff'.encode('utf8'),
+                          PyUnicode_FORMAT_UTF8))
+
+        # No supported format or invalid format
+        with self.assertRaisesRegex(ValueError,
+                                    "unable to find a matching export format"):
+            unicode_export('abc', 0)
+        with self.assertRaisesRegex(ValueError,
+                                    "unable to find a matching export format"):
+            unicode_export('abc', PyUnicode_FORMAT_INVALID)
+
+    def test_unicode_import(self):
+        # Test PyUnicode_Import()
+        unicode_import = _testlimitedcapi.unicode_import
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
+                         "abc")
+        self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
+                         "latin1:\xe9")
+
+        self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
+                                          PyUnicode_FORMAT_UCS2),
+                         'ucs2:\u20ac')
+
+        self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
+                                          PyUnicode_FORMAT_UCS4),
+                         'ucs4:\U0010ffff')
+
+        text = "abc\xe9\U0010ffff"
+        self.assertEqual(unicode_import(text.encode('utf8'),
+                                          PyUnicode_FORMAT_UTF8),
+                         text)
+
+        # Empty string
+        for native_format in (
+            PyUnicode_FORMAT_ASCII,
+            PyUnicode_FORMAT_UCS1,
+            PyUnicode_FORMAT_UCS2,
+            PyUnicode_FORMAT_UCS4,
+            PyUnicode_FORMAT_UTF8,
+        ):
+            with self.subTest(native_format=native_format):
+                self.assertEqual(unicode_import(b'', native_format),
+                                 '')
+
+        # Invalid format
+        with self.assertRaises(ValueError):
+            unicode_import(b'', PyUnicode_FORMAT_INVALID)
+
+        # Invalid size: truncated buffer, not a multiple of the character size
+        ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
+        ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
+
+    def test_unicode_import_export_roundtrip(self):
+        unicode_export = _testlimitedcapi.unicode_export
+        unicode_import = _testlimitedcapi.unicode_import
+        A = PyUnicode_FORMAT_ASCII
+        CS1 = PyUnicode_FORMAT_UCS1
+        CS2 = PyUnicode_FORMAT_UCS2
+        CS4 = PyUnicode_FORMAT_UCS4
+        TF8 = PyUnicode_FORMAT_UTF8
+        for string, allowed_encodings in (  # (text, formats it can export to)
+            ('', {A, CS1, CS2, CS4, TF8}),
+            ('ascii', {A, CS1, CS2, CS4, TF8}),
+            ('latin1:\xe9', {CS1, CS2, CS4, TF8}),
+            ('ucs2:\u20ac', {CS2, CS4, TF8}),
+            ('ucs4:\U0001f638', {CS4, TF8}),
+        ):
+            for encoding in A, CS1, CS2, CS4, TF8:
+                with self.subTest(string=string, encoding=encoding):
+                    if encoding not in allowed_encodings:
+                        with self.assertRaises(ValueError):
+                            unicode_export(string, encoding)
+                    else:
+                        buf, buf_enc = unicode_export(string, encoding)
+                        restored = unicode_import(buf, buf_enc)
+                        self.assertEqual(restored, string)
+            # Once per string; -1 presumably allows any format (TODO confirm).
+            with self.subTest(string=string, encoding=-1):
+                buf, buf_enc = unicode_export(string, -1)
+                restored = unicode_import(buf, buf_enc)
+                self.assertEqual(restored, string)
+
+
+if __name__ == '__main__':
     unittest.main()
diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py
diff --git a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst
@@ -0,0 +1,2 @@
+Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
+import and export strings from native buffers. Patch by Victor Stinner.
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml
@@ -2510,3 +2510,19 @@
 
 [function.Py_TYPE]
     added = '3.14'
+[function.PyUnicode_Import]
+    added = '3.14'
+[function.PyUnicode_Export]
+    added = '3.14'
+[function.PyUnicode_ReleaseExport]
+    added = '3.14'
+[const.PyUnicode_FORMAT_ASCII]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS1]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS2]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS4]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UTF8]
+    added = '3.14'