Skip to content

Commit

Permalink
pythongh-119609: Add PyUnicode_AsNativeFormat() function
Browse files Browse the repository at this point in the history
Add PyUnicode_AsNativeFormat() and PyUnicode_FromNativeFormat()
functions to the C API.
  • Loading branch information
vstinner committed May 27, 2024
1 parent 0bd0d40 commit 2f8bf77
Show file tree
Hide file tree
Showing 11 changed files with 281 additions and 2 deletions.
47 changes: 47 additions & 0 deletions Doc/c-api/unicode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,53 @@ APIs:
.. versionadded:: 3.3
.. c:function:: const void* PyUnicode_AsNativeFormat(PyObject *unicode, Py_ssize_t *size, int *native_format)
Get the contents of a string in its native format.
* Return the contents, set *\*size* to the length of the contents in bytes and *\*native_format* to the format of the contents on success.
* Set an exception and return ``NULL`` on error.
The contents are valid as long as *unicode* is valid.
*unicode*, *size* and *native_format* must not be NULL.
*\*native_format* is set to one of these native formats:
.. c:namespace:: NULL
======================================== ===== ============================
Constant Identifier Value Description
======================================== ===== ============================
.. c:macro:: PyUnicode_NATIVE_ASCII ``1`` ASCII string (``Py_UCS1*``)
.. c:macro:: PyUnicode_NATIVE_UCS1 ``2`` UCS-1 string (``Py_UCS1*``)
.. c:macro:: PyUnicode_NATIVE_UCS2 ``3`` UCS-2 string (``Py_UCS2*``)
.. c:macro:: PyUnicode_NATIVE_UCS4 ``4`` UCS-4 string (``Py_UCS4*``)
.. c:macro:: PyUnicode_NATIVE_UTF8 ``5`` UTF-8 string (``char*``)
======================================== ===== ============================
.. impl-detail::
In CPython, the :c:macro:`PyUnicode_NATIVE_UTF8` format is not used by
:c:func:`PyUnicode_AsNativeFormat`, but it's accepted by
:c:func:`PyUnicode_FromNativeFormat`.
.. versionadded:: 3.14
.. c:function:: PyObject* PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, int native_format)
Create a string object from a native format string.
* Return a reference to a new string object on success.
* Set an exception and return ``NULL`` on error.
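*data* must not be NULL. *size* is the length of *data* in bytes; it must be positive or zero, and a multiple of the code unit size for the UCS-2 (2 bytes) and UCS-4 (4 bytes) formats.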
See :c:func:`PyUnicode_AsNativeFormat` for the available native formats.
.. versionadded:: 3.14
.. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
Py_ssize_t size)
Expand Down
2 changes: 2 additions & 0 deletions Doc/data/stable_abi.dat

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,12 @@ C API Changes
New Features
------------

* Add :c:func:`PyUnicode_AsNativeFormat` and
:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
in their native format.
(Contributed by Victor Stinner in :gh:`119609`.)


Porting to Python 3.14
----------------------

Expand Down
22 changes: 22 additions & 0 deletions Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,28 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
const char *u /* UTF-8 encoded string */
);

// Native formats reported by PyUnicode_AsNativeFormat() and accepted by
// PyUnicode_FromNativeFormat().
#define PyUnicode_NATIVE_ASCII 1  // ASCII string (Py_UCS1*)
#define PyUnicode_NATIVE_UCS1 2  // UCS-1 string (Py_UCS1*)
#define PyUnicode_NATIVE_UCS2 3  // UCS-2 string (Py_UCS2*)
#define PyUnicode_NATIVE_UCS4 4  // UCS-4 string (Py_UCS4*)
#define PyUnicode_NATIVE_UTF8 5  // UTF-8 string (char*)

// Get the content of a string in its native format.
// - Return the content, set '*size' to the length of the content in bytes
//   and '*native_format' to one of the PyUnicode_NATIVE_* constants on
//   success.
// - Set an exception and return NULL on error. A TypeError is raised if
//   'unicode' is not a str object; '*size' and '*native_format' are then
//   set to 0.
// The returned pointer refers to the data of 'unicode' itself: it must not
// be freed and is only valid as long as 'unicode' is valid.
// 'unicode', 'size' and 'native_format' must not be NULL.
PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat(
PyObject *unicode,
Py_ssize_t *size,
int *native_format);

// Create a string object from a native format string.
// - Return a reference to a new string object on success.
// - Set an exception and return NULL on error.
// 'data' must not be NULL. 'size' is the length of 'data' in bytes.
// 'native_format' must be one of the PyUnicode_NATIVE_* constants.
// A ValueError is raised if 'size' is negative, if 'size' is not a multiple
// of 2 for PyUnicode_NATIVE_UCS2 or of 4 for PyUnicode_NATIVE_UCS4, or if
// 'native_format' is unknown.
PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat(
const void *data,
Py_ssize_t size,
int native_format);

/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H
Expand Down
81 changes: 79 additions & 2 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ class Str(str):
pass


# Native format constants: same values as the PyUnicode_NATIVE_* macros
# of the C API.
PyUnicode_NATIVE_ASCII = 1
PyUnicode_NATIVE_UCS1 = 2
PyUnicode_NATIVE_UCS2 = 3
PyUnicode_NATIVE_UCS4 = 4
PyUnicode_NATIVE_UTF8 = 5
# Invalid native format: a value which is not one of the constants above,
# used to check that an unknown format is rejected.
PyUnicode_NATIVE_INVALID = 0

class CAPITest(unittest.TestCase):

@support.cpython_only
Expand Down Expand Up @@ -1675,6 +1683,75 @@ def test_pep393_utf8_caching_bug(self):
# Check that the second call returns the same result
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))


if __name__ == "__main__":
def test_unicode_asnativeformat(self):
    """Check the (bytes, native format) pair exported for each string kind."""
    # Test PyUnicode_AsNativeFormat()
    export = _testlimitedcapi.unicode_asnativeformat

    exported = export("abc")
    self.assertEqual(exported, (b'abc', PyUnicode_NATIVE_ASCII))

    exported = export("latin1:\xe9")
    self.assertEqual(exported, (b'latin1:\xe9', PyUnicode_NATIVE_UCS1))

    # Wider strings are compared with their UTF-16 / UTF-32 encoding in the
    # byte order of the machine.
    if sys.byteorder == 'little':
        utf16 = 'utf-16le'
    else:
        utf16 = 'utf-16be'
    exported = export('ucs2:\u20ac')
    self.assertEqual(exported,
                     ('ucs2:\u20ac'.encode(utf16), PyUnicode_NATIVE_UCS2))

    if sys.byteorder == 'little':
        utf32 = 'utf-32le'
    else:
        utf32 = 'utf-32be'
    exported = export('ucs4:\U0010ffff')
    self.assertEqual(exported,
                     ('ucs4:\U0010ffff'.encode(utf32), PyUnicode_NATIVE_UCS4))

def test_unicode_fromnativeformat(self):
    """Check strings built from each native format, and the error cases."""
    # Test PyUnicode_FromNativeFormat()
    build = _testlimitedcapi.unicode_fromnativeformat

    self.assertEqual(build(b'abc', PyUnicode_NATIVE_ASCII), "abc")
    self.assertEqual(build(b'latin1:\xe9', PyUnicode_NATIVE_UCS1),
                     "latin1:\xe9")

    # UCS-2 and UCS-4 input is produced with the UTF-16 / UTF-32 codec in
    # the byte order of the machine.
    if sys.byteorder == 'little':
        utf16 = 'utf-16le'
    else:
        utf16 = 'utf-16be'
    encoded = 'ucs2:\u20ac'.encode(utf16)
    self.assertEqual(build(encoded, PyUnicode_NATIVE_UCS2), 'ucs2:\u20ac')

    if sys.byteorder == 'little':
        utf32 = 'utf-32le'
    else:
        utf32 = 'utf-32be'
    encoded = 'ucs4:\U0010ffff'.encode(utf32)
    self.assertEqual(build(encoded, PyUnicode_NATIVE_UCS4),
                     'ucs4:\U0010ffff')

    expected = "abc\xe9\U0010ffff"
    encoded = expected.encode('utf8')
    self.assertEqual(build(encoded, PyUnicode_NATIVE_UTF8), expected)

    # Empty string
    valid_formats = (
        PyUnicode_NATIVE_ASCII,
        PyUnicode_NATIVE_UCS1,
        PyUnicode_NATIVE_UCS2,
        PyUnicode_NATIVE_UCS4,
        PyUnicode_NATIVE_UTF8,
    )
    for native_format in valid_formats:
        with self.subTest(native_format=native_format):
            self.assertEqual(build(b'', native_format), '')

    # Invalid format
    with self.assertRaises(ValueError):
        build(b'', PyUnicode_NATIVE_INVALID)

    # Invalid size: the number of bytes is not a multiple of the code unit
    # size (2 bytes for UCS-2, 4 bytes for UCS-4).
    truncated = 'ucs2:\u20ac'.encode(utf16)[:-1]
    with self.assertRaises(ValueError):
        build(truncated, PyUnicode_NATIVE_UCS2)

    ucs4_bytes = 'ucs4:\U0010ffff'.encode(utf32)
    for removed in (1, 2, 3):
        with self.assertRaises(ValueError):
            build(ucs4_bytes[:-removed], PyUnicode_NATIVE_UCS4)


if __name__ == '__main__':
unittest.main()
2 changes: 2 additions & 0 deletions Lib/test/test_stable_abi_ctypes.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add :c:func:`PyUnicode_AsNativeFormat` and
:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
in their native format. Patch by Victor Stinner.
4 changes: 4 additions & 0 deletions Misc/stable_abi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2507,3 +2507,7 @@
added = '3.13'
[function.PyEval_GetFrameLocals]
added = '3.13'
[function.PyUnicode_AsNativeFormat]
added = '3.14'
[function.PyUnicode_FromNativeFormat]
added = '3.14'
31 changes: 31 additions & 0 deletions Modules/_testlimitedcapi/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1837,6 +1837,35 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
#undef CHECK_FORMAT_0
}


// Test PyUnicode_AsNativeFormat(): return a (bytes, native_format) tuple,
// or NULL with an exception set if the export failed.
static PyObject*
unicode_asnativeformat(PyObject *self, PyObject *obj)
{
    int format;
    Py_ssize_t nbytes;
    const void *buffer = PyUnicode_AsNativeFormat(obj, &nbytes, &format);

    if (buffer != NULL) {
        // Pack the exported bytes together with their native format.
        return Py_BuildValue("y#i", buffer, nbytes, format);
    }
    // PyUnicode_AsNativeFormat() already set the exception.
    return NULL;
}


// Test PyUnicode_FromNativeFormat(): 'args' is a (data, native_format) tuple
// where 'data' is a bytes-like object and 'native_format' is an int.
// Return the new string, or NULL with an exception set on error.
static PyObject*
unicode_fromnativeformat(PyObject *self, PyObject *args)
{
    // The "y#" format unit stores its result through a 'const char **' and
    // a 'Py_ssize_t *': declare 'data' with exactly that pointer type so the
    // variadic arguments match what PyArg_ParseTuple() reads.
    const char *data;
    Py_ssize_t size;
    int native_format;
    if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) {
        return NULL;
    }
    // 'size' is the length of 'data' in bytes.
    return PyUnicode_FromNativeFormat(data, size, native_format);
}


static PyMethodDef TestMethods[] = {
{"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS},
{"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS},
Expand Down Expand Up @@ -1924,6 +1953,8 @@ static PyMethodDef TestMethods[] = {
{"unicode_format", unicode_format, METH_VARARGS},
{"unicode_contains", unicode_contains, METH_VARARGS},
{"unicode_isidentifier", unicode_isidentifier, METH_O},
{"unicode_asnativeformat", unicode_asnativeformat, METH_O},
{"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS},
{NULL},
};

Expand Down
83 changes: 83 additions & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2094,6 +2094,89 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
return res;
}

// Export the data of the string 'unicode' without copying it.
//
// On success, return a pointer to the string data, store the length of the
// data in bytes in '*size' and one of the PyUnicode_NATIVE_* constants in
// '*native_format'.
//
// If 'unicode' is not a str object, set '*size' and '*native_format' to 0,
// raise a TypeError and return NULL.
const void*
PyUnicode_AsNativeFormat(PyObject *unicode,
                         Py_ssize_t *size, int *native_format)
{
    if (!PyUnicode_Check(unicode)) {
        *size = 0;
        *native_format = 0;
        PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode);
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    const void *data;
    Py_ssize_t nbytes;
    int format;

    // ASCII is checked before the kind so that it gets its own format
    // rather than being reported as UCS-1.
    if (PyUnicode_IS_ASCII(unicode)) {
        format = PyUnicode_NATIVE_ASCII;
        nbytes = nchars;
        data = PyUnicode_1BYTE_DATA(unicode);
    }
    else {
        int kind = PyUnicode_KIND(unicode);

        if (kind == PyUnicode_1BYTE_KIND) {
            format = PyUnicode_NATIVE_UCS1;
            nbytes = nchars;
            data = PyUnicode_1BYTE_DATA(unicode);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            format = PyUnicode_NATIVE_UCS2;
            // 2 bytes per character
            nbytes = nchars * 2;
            data = PyUnicode_2BYTE_DATA(unicode);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            format = PyUnicode_NATIVE_UCS4;
            // 4 bytes per character
            nbytes = nchars * 4;
            data = PyUnicode_4BYTE_DATA(unicode);
        }
    }

    *native_format = format;
    *size = nbytes;
    return data;
}

// Create a string object from 'size' bytes of 'data' stored in the format
// 'native_format' (one of the PyUnicode_NATIVE_* constants).
//
// Return a new string object on success. Raise a ValueError and return NULL
// if 'size' is negative, if 'size' is not a multiple of the code unit size
// for the UCS-2 and UCS-4 formats, or if 'native_format' is unknown. The
// functions which are called to build the string report their own errors.
PyObject*
PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size,
                           int native_format)
{
    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "Negative size");
        return NULL;
    }

    if (native_format == PyUnicode_NATIVE_ASCII) {
        return PyUnicode_DecodeASCII((const char*)data, size, NULL);
    }

    if (native_format == PyUnicode_NATIVE_UCS1) {
        return _PyUnicode_FromUCS1(data, size);
    }

    if (native_format == PyUnicode_NATIVE_UCS2) {
        // 'size' is a number of bytes: convert it to a number of 2-byte
        // code units.
        if (size % 2 != 0) {
            PyErr_Format(PyExc_ValueError, "size must be a multiple of 2: %zd",
                         size);
            return NULL;
        }
        return _PyUnicode_FromUCS2(data, size / 2);
    }

    if (native_format == PyUnicode_NATIVE_UCS4) {
        // 'size' is a number of bytes: convert it to a number of 4-byte
        // code units.
        if (size % 4 != 0) {
            PyErr_Format(PyExc_ValueError, "size must be a multiple of 4: %zd",
                         size);
            return NULL;
        }
        return _PyUnicode_FromUCS4(data, size / 4);
    }

    if (native_format == PyUnicode_NATIVE_UTF8) {
        return PyUnicode_DecodeUTF8((const char*)data, size, NULL);
    }

    PyErr_Format(PyExc_ValueError, "unknown native format %i",
                 native_format);
    return NULL;
}

PyObject*
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
{
Expand Down
2 changes: 2 additions & 0 deletions PC/python3dll.c

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2f8bf77

Please sign in to comment.