Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-111089: Add cache to PyUnicode_AsUTF8() for embedded NUL #111587

Closed
wants to merge 9 commits into from
9 changes: 8 additions & 1 deletion Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1120,6 +1120,13 @@ New Features
* Add :c:func:`PyUnicode_AsUTF8` function to the limited C API.
(Contributed by Victor Stinner in :gh:`111089`.)

* Add ``PyASCIIObject.state.embed_null`` member to Python :class:`str` objects.
It is used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a
string contains a null character. Strings created by
:c:func:`PyUnicode_FromString` initializes *embed_null* to 0 since the string
cannot contain a null character.
(Contributed by Victor Stinner in :gh:`111089`.)


Porting to Python 3.13
----------------------
Expand Down Expand Up @@ -1192,7 +1199,7 @@ Porting to Python 3.13

* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string
contains embedded null characters. To accept embedded null characters and
truncate on purpose at the first null byte,
truncate on purpose at the first null character,
``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead.
(Contributed by Victor Stinner in :gh:`111089`.)

Expand Down
9 changes: 8 additions & 1 deletion Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,16 @@ typedef struct {
unsigned int ascii:1;
/* The object is statically allocated. */
unsigned int statically_allocated:1;
// Does the string embed null characters? Possible values:
// 0: No
// 1: Yes
// 2: Unknown, the string must be scanned
serhiy-storchaka marked this conversation as resolved.
Show resolved Hide resolved
// 3: Invalid state (must not be used)
// Cache used by PyUnicode_AsUTF8() to avoid calling strlen().
unsigned int embed_null:2;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
unsigned int :22;
} state;
} PyASCIIObject;

Expand Down
12 changes: 9 additions & 3 deletions Include/internal/pycore_runtime_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ extern PyTypeObject _PyExc_MemoryError;
_PyBytes_SIMPLE_INIT((CH), 1) \
}

#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \
#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII, EMBED_NUL) \
{ \
.ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type), \
.length = sizeof(LITERAL) - 1, \
Expand All @@ -225,11 +225,17 @@ extern PyTypeObject _PyExc_MemoryError;
.compact = 1, \
.ascii = (ASCII), \
.statically_allocated = 1, \
.embed_null = (EMBED_NUL), \
}, \
}
#define _PyASCIIObject_INIT(LITERAL) \
{ \
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 0), \
._data = (LITERAL) \
}
#define _PyASCIIObject_INIT_embed_null(LITERAL) \
{ \
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 1), \
._data = (LITERAL) \
}
#define INIT_STR(NAME, LITERAL) \
Expand All @@ -239,7 +245,7 @@ extern PyTypeObject _PyExc_MemoryError;
#define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \
{ \
._latin1 = { \
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0, 0), \
.utf8 = (UTF8), \
.utf8_length = sizeof(UTF8) - 1, \
}, \
Expand Down
2 changes: 1 addition & 1 deletion Include/internal/pycore_runtime_init_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is
used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string
contains a null character. Strings created by :c:func:`PyUnicode_FromString`
initializes *embed_null* to 0 since the string cannot contain a null character.
Patch by Victor Stinner.
64 changes: 61 additions & 3 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
# define OVERALLOCATE_FACTOR 4
#endif

#define EMBED_NULL_UNKNOWN 2

/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Expand All @@ -205,6 +207,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
#endif
static inline Py_ssize_t
findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
int direction);


// Return a reference to the immortal empty string singleton.
Expand Down Expand Up @@ -623,6 +629,15 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
}
CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
}

if (_PyUnicode_STATE(ascii).embed_null != EMBED_NULL_UNKNOWN) {
Py_ssize_t pos = findchar(PyUnicode_DATA(ascii),
PyUnicode_KIND(ascii),
PyUnicode_GET_LENGTH(ascii),
0, 1);
CHECK(_PyUnicode_STATE(ascii).embed_null == (pos >= 0));
}

return 1;

#undef CHECK
Expand Down Expand Up @@ -1000,6 +1015,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
}
_PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN;
#ifdef Py_TRACE_REFS
_Py_ForgetReference(unicode);
#endif
Expand Down Expand Up @@ -1053,6 +1069,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
}
_PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN;

data = (PyObject *)PyObject_Realloc(data, new_size);
if (data == NULL) {
Expand Down Expand Up @@ -1253,6 +1270,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
_PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
_PyUnicode_STATE(unicode).statically_allocated = 0;
_PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN;
if (is_ascii) {
((char*)data)[size] = 0;
}
Expand Down Expand Up @@ -1777,6 +1795,8 @@ unicode_char(Py_UCS4 ch)
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
PyUnicode_4BYTE_DATA(unicode)[0] = ch;
}
// ch >= 256 and so cannot be 0
_PyUnicode_STATE(unicode).embed_null = 0;
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
}
Expand All @@ -1793,8 +1813,13 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
return NULL;
}

unsigned int embed_null;
if (size == -1) {
size = wcslen(u);
embed_null = 0;
}
else {
embed_null = EMBED_NULL_UNKNOWN;
}

/* If the Unicode data is known at construction time, we can apply
Expand Down Expand Up @@ -1859,6 +1884,7 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
default:
Py_UNREACHABLE();
}
_PyUnicode_STATE(unicode).embed_null = embed_null;

return unicode_result(unicode);
}
Expand Down Expand Up @@ -1890,7 +1916,16 @@ PyUnicode_FromString(const char *u)
PyErr_SetString(PyExc_OverflowError, "input too long");
return NULL;
}
return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
PyObject *unicode;
unicode = PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
if (unicode != NULL) {
// PyUnicode_DecodeUTF8Stateful(u, strlen(u)) cannot create NUL
// characters: the UTF-8 decoder with the strict error handler only
// creates a NUL character if the input string contains a NUL byte
// which cannot be the case here.
_PyUnicode_STATE(unicode).embed_null = 0;
}
return unicode;
}


Expand Down Expand Up @@ -1932,6 +1967,7 @@ _PyUnicode_FromId(_Py_Identifier *id)
if (!obj) {
return NULL;
}
_PyUnicode_STATE(obj).embed_null = 0;
PyUnicode_InternInPlace(&obj);

if (index >= ids->size) {
Expand Down Expand Up @@ -2204,6 +2240,7 @@ _PyUnicode_Copy(PyObject *unicode)

memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
length * PyUnicode_KIND(unicode));
_PyUnicode_STATE(copy).embed_null = _PyUnicode_STATE(unicode).embed_null;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_PyUnicode_Copy() makes a modifiable unicode object. It is legal to embed a null character in it after creation or replace an embeded null character with non-null character. In particular, it creates a new copy even from Latin1 character singletons.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, technically, any string can be modified anytime by the C API. People "should not do that", but since it's possible, I'm not sure if it's safe to make the assumption that people will not mutate a string long after its creation: after the cache is initialized.

assert(_PyUnicode_CheckConsistency(copy, 1));
return copy;
}
Expand Down Expand Up @@ -3846,10 +3883,29 @@ PyUnicode_AsUTF8(PyObject *unicode)
{
Py_ssize_t size;
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
if (utf8 != NULL && strlen(utf8) != (size_t)size) {
PyErr_SetString(PyExc_ValueError, "embedded null character");
if (utf8 == NULL) {
return NULL;
}

// Cache to avoid calling O(n) strlen() operation at every
// PyUnicode_AsUTF8() call on the same object.
if (_PyUnicode_STATE(unicode).embed_null != 0) {
if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) {
if (strlen(utf8) != (size_t)size) {
_PyUnicode_STATE(unicode).embed_null = 1;
}
else {
_PyUnicode_STATE(unicode).embed_null = 0;
}
}

if (_PyUnicode_STATE(unicode).embed_null == 1) {
PyErr_SetString(PyExc_ValueError,
"embedded null character");
return NULL;
}
}

return utf8;
}

Expand Down Expand Up @@ -11039,6 +11095,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
Py_DECREF(left);
*p_left = res;
}
assert(_PyUnicode_STATE(*p_left).embed_null == EMBED_NULL_UNKNOWN);
assert(_PyUnicode_CheckConsistency(*p_left, 1));
return;

Expand Down Expand Up @@ -14627,6 +14684,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_STATE(self).embed_null = EMBED_NULL_UNKNOWN;
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
_PyUnicode_DATA_ANY(self) = NULL;
Expand Down
13 changes: 12 additions & 1 deletion Tools/build/generate_global_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ def open_for_changes(filename, orig):
def generate_global_strings(identifiers, strings):
filename = os.path.join(INTERNAL, 'pycore_global_strings.h')

# NUL characters are not supported; see _PyASCIIObject_INIT_embed_null().
for identifier in identifiers:
if "\0" in identifier:
raise Exception(f"an identifier contains a null character: {identifier!r}")
for string in strings:
if "\0" in string:
raise Exception(f"a string contains a null character: {string!r}")

# Read the non-generated part of the file.
with open(filename) as infile:
orig = infile.read()
Expand Down Expand Up @@ -321,7 +329,10 @@ def generate_runtime_init(identifiers, strings):
printer.write('')
with printer.block('#define _Py_str_ascii_INIT', continuation=True):
for i in range(128):
printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
if i == 0:
printer.write(f'_PyASCIIObject_INIT_embed_null("\\x{i:02x}"),')
else:
printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]')
printer.write('')
with printer.block('#define _Py_str_latin1_INIT', continuation=True):
Expand Down
Loading