Skip to content

Commit

Permalink
pythongh-119609: Add PyUnicode_Export() function
Browse files Browse the repository at this point in the history
Add PyUnicode_Export() and PyUnicode_Import() functions to the C API.
  • Loading branch information
vstinner committed Jun 21, 2024
1 parent 733dac0 commit 32a373a
Show file tree
Hide file tree
Showing 11 changed files with 418 additions and 2 deletions.
55 changes: 55 additions & 0 deletions Doc/c-api/unicode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,61 @@ APIs:
.. versionadded:: 3.3
.. c:function:: const void* PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_ssize_t *size, uint32_t *format)
Export the contents of the *unicode* string in one of the formats requested
by *requested_formats*.
* On success, set *\*size* and *\*format*, and return the contents.
* On error, set an exception and return ``NULL``.
The contents are valid as long as *unicode* is valid.
The export must be released by :c:func:`PyUnicode_ReleaseExport`.
*unicode*, *size* and *format* must not be NULL.
Available formats:
.. c:namespace:: NULL
=================================== ======== ===========================
Constant Identifier Value Description
=================================== ======== ===========================
.. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``)
.. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``)
.. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``)
.. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``)
.. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``)
=================================== ======== ===========================
*requested_formats* can be a single format or a combination of the formats
in the table above.
.. versionadded:: 3.14
.. c:function:: void PyUnicode_ReleaseExport(PyObject *unicode, const void* data, uint32_t format)
Release an export created by :c:func:`PyUnicode_Export`.
.. versionadded:: 3.14
.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t size, uint32_t format)
Create a string object from *data*, a string in the *format* format.
* Return a reference to a new string object on success.
* Set an exception and return ``NULL`` on error.
*data* must not be NULL. *size* must be positive or zero.
See :c:func:`PyUnicode_Export` for the available formats.
.. versionadded:: 3.14
.. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
Py_ssize_t size)
Expand Down
3 changes: 3 additions & 0 deletions Doc/data/stable_abi.dat

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,11 @@ New Features

(Contributed by Victor Stinner in :gh:`119182`.)

* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
export and import strings.
(Contributed by Victor Stinner in :gh:`119609`.)


Porting to Python 3.14
----------------------

Expand Down
31 changes: 31 additions & 0 deletions Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,37 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
const char *u /* UTF-8 encoded string */
);

// Formats accepted by PyUnicode_Export() and PyUnicode_Import().
// Each format uses a distinct bit: 'requested_formats' can be a single format
// or a combination of formats.

// ASCII string (Py_UCS1*)
#define PyUnicode_FORMAT_ASCII 0x01
// UCS-1 string (Py_UCS1*)
#define PyUnicode_FORMAT_UCS1 0x02
// UCS-2 string (Py_UCS2*)
#define PyUnicode_FORMAT_UCS2 0x04
// UCS-4 string (Py_UCS4*)
#define PyUnicode_FORMAT_UCS4 0x08
// UTF-8 string (char*)
#define PyUnicode_FORMAT_UTF8 0x10

// Get the content of a string in one of the requested formats:
// - 'requested_formats' is a single PyUnicode_FORMAT_* value or a
//   combination of them.
// - Return the content, set '*size' and '*format' on success.
// - Set an exception and return NULL on error.
//
// 'unicode', 'size' and 'format' must not be NULL.
// The returned content is valid as long as 'unicode' is valid.
// The export must be released by PyUnicode_ReleaseExport().
PyAPI_FUNC(const void*) PyUnicode_Export(
PyObject *unicode,
uint32_t requested_formats,
Py_ssize_t *size,
uint32_t *format);

// Release an export created by PyUnicode_Export().
// NOTE(review): 'data' and 'format' are presumably the values obtained from
// the matching PyUnicode_Export() call on 'unicode' -- confirm against the
// implementation.
PyAPI_FUNC(void) PyUnicode_ReleaseExport(
PyObject *unicode,
const void* data,
uint32_t format);

// Create a string object from a string in the format 'format'.
// - 'data' must not be NULL; 'size' must be positive or zero.
// - 'format' is one of the PyUnicode_FORMAT_* constants.
// - Return a reference to a new string object on success.
// - Set an exception and return NULL on error.
// NOTE(review): the unit of 'size' (bytes or code units) is not stated here;
// confirm against the implementation.
PyAPI_FUNC(PyObject*) PyUnicode_Import(
const void *data,
Py_ssize_t size,
uint32_t format);

/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H
Expand Down
130 changes: 128 additions & 2 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ class Str(str):
pass


# Format constants: the values mirror the PyUnicode_FORMAT_* macros
# of the C API.
PyUnicode_FORMAT_ASCII = 0x01
PyUnicode_FORMAT_UCS1 = 0x02
PyUnicode_FORMAT_UCS2 = 0x04
PyUnicode_FORMAT_UCS4 = 0x08
PyUnicode_FORMAT_UTF8 = 0x10
# Invalid native format: a bit which is not assigned to any format above
PyUnicode_FORMAT_INVALID = 0x20

class CAPITest(unittest.TestCase):

@support.cpython_only
Expand Down Expand Up @@ -1675,6 +1683,124 @@ def test_pep393_utf8_caching_bug(self):
# Check that the second call returns the same result
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))


if __name__ == "__main__":
def test_unicode_export(self):
# Test PyUnicode_Export() and PyUnicode_ReleaseExport()
unicode_export = _testlimitedcapi.unicode_export
# Expected UCS2/UCS4 bytes are computed with the codec matching the
# byte order of the platform.
if sys.byteorder == 'little':
ucs2_enc = 'utf-16le'
ucs4_enc = 'utf-32le'
else:
ucs2_enc = 'utf-16be'
ucs4_enc = 'utf-32be'

# export to the native format
formats = (PyUnicode_FORMAT_ASCII
| PyUnicode_FORMAT_UCS1
| PyUnicode_FORMAT_UCS2
| PyUnicode_FORMAT_UCS4)
self.assertEqual(unicode_export("abc", formats),
(b'abc', PyUnicode_FORMAT_ASCII))
self.assertEqual(unicode_export("latin1:\xe9", formats),
(b'latin1:\xe9', PyUnicode_FORMAT_UCS1))
self.assertEqual(unicode_export('ucs2:\u20ac', formats),
('ucs2:\u20ac'.encode(ucs2_enc),
PyUnicode_FORMAT_UCS2))
self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4))

# export ASCII as UCS1
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1),
(b'abc', PyUnicode_FORMAT_UCS1))

# always export to UCS4
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
('ucs2:\u20ac'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4))
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4))

# always export to UTF8
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8))
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8))
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
('ucs2:\u20ac'.encode('utf8'),
PyUnicode_FORMAT_UTF8))
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
('ucs4:\U0010ffff'.encode('utf8'),
PyUnicode_FORMAT_UTF8))

# No supported format or invalid format
with self.assertRaisesRegex(ValueError,
"unable to find a matching export format"):
unicode_export('abc', 0)
with self.assertRaisesRegex(ValueError,
"unable to find a matching export format"):
unicode_export('abc', PyUnicode_FORMAT_INVALID)

def test_unicode_import(self):
# Test PyUnicode_Import()
unicode_import = _testlimitedcapi.unicode_import
# Input UCS2/UCS4 bytes are produced with the codec matching the
# byte order of the platform.
if sys.byteorder == 'little':
ucs2_enc = 'utf-16le'
ucs4_enc = 'utf-32le'
else:
ucs2_enc = 'utf-16be'
ucs4_enc = 'utf-32be'

# Import a string from each format
self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
"abc")
self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
"latin1:\xe9")

self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
PyUnicode_FORMAT_UCS2),
'ucs2:\u20ac')

self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4),
'ucs4:\U0010ffff')

text = "abc\xe9\U0010ffff"
self.assertEqual(unicode_import(text.encode('utf8'),
PyUnicode_FORMAT_UTF8),
text)

# Empty string
for native_format in (
PyUnicode_FORMAT_ASCII,
PyUnicode_FORMAT_UCS1,
PyUnicode_FORMAT_UCS2,
PyUnicode_FORMAT_UCS4,
PyUnicode_FORMAT_UTF8,
):
with self.subTest(native_format=native_format):
self.assertEqual(unicode_import(b'', native_format),
'')

# Invalid format
with self.assertRaises(ValueError):
unicode_import(b'', PyUnicode_FORMAT_INVALID)

# Invalid size: drop 1 to 3 trailing bytes so that the last character
# of the UCS2/UCS4 input is truncated
ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
with self.assertRaises(ValueError):
unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
with self.assertRaises(ValueError):
unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
with self.assertRaises(ValueError):
unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
with self.assertRaises(ValueError):
unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)


if __name__ == '__main__':
unittest.main()
3 changes: 3 additions & 0 deletions Lib/test/test_stable_abi_ctypes.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
export and import strings. Patch by Victor Stinner.
6 changes: 6 additions & 0 deletions Misc/stable_abi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2510,3 +2510,9 @@

[function.Py_TYPE]
added = '3.14'
[function.PyUnicode_Export]
added = '3.14'
[function.PyUnicode_ReleaseExport]
added = '3.14'
[function.PyUnicode_Import]
added = '3.14'
40 changes: 40 additions & 0 deletions Modules/_testlimitedcapi/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1837,6 +1837,44 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
#undef CHECK_FORMAT_0
}


// Test PyUnicode_Export(): export the object using the given bitmask of
// formats, build a (bytes, format) result from the exported data, then
// release the export with PyUnicode_ReleaseExport().
static PyObject*
unicode_export(PyObject *self, PyObject *args)
{
    PyObject *str;
    unsigned int formats_mask;
    if (!PyArg_ParseTuple(args, "OI", &str, &formats_mask)) {
        return NULL;
    }

    uint32_t export_format;
    Py_ssize_t export_size;
    const void *buffer = PyUnicode_Export(str, formats_mask,
                                          &export_size, &export_format);
    if (buffer != NULL) {
        // Build the result first: the export is released whether or not
        // Py_BuildValue() succeeds.
        PyObject *result = Py_BuildValue("y#I", buffer, export_size,
                                         (unsigned int)export_format);
        PyUnicode_ReleaseExport(str, buffer, export_format);
        return result;
    }

    // PyUnicode_Export() failed
    return NULL;
}


// Test PyUnicode_Import(): parse a (bytes, format) pair and pass the buffer,
// its length and the format to PyUnicode_Import().
static PyObject*
unicode_import(PyObject *self, PyObject *args)
{
    unsigned int fmt;
    Py_ssize_t length;
    const void *buffer;

    int parsed = PyArg_ParseTuple(args, "y#I", &buffer, &length, &fmt);
    return parsed ? PyUnicode_Import(buffer, length, fmt) : NULL;
}


static PyMethodDef TestMethods[] = {
{"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS},
{"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS},
Expand Down Expand Up @@ -1924,6 +1962,8 @@ static PyMethodDef TestMethods[] = {
{"unicode_format", unicode_format, METH_VARARGS},
{"unicode_contains", unicode_contains, METH_VARARGS},
{"unicode_isidentifier", unicode_isidentifier, METH_O},
{"unicode_export", unicode_export, METH_VARARGS},
{"unicode_import", unicode_import, METH_VARARGS},
{NULL},
};

Expand Down
Loading

0 comments on commit 32a373a

Please sign in to comment.