Skip to content

Commit

Permalink
pythongh-111089: Add cache to PyUnicode_AsUTF8() for embedded NUL
Browse files Browse the repository at this point in the history
Add PyASCIIObject.state.embed_null member to Python str objects. It
is used as a cache by PyUnicode_AsUTF8() to only check once if a
string contains a null character. Strings created by
PyUnicode_FromString() initialize *embed_null* to 0 ("No"), since such a
string cannot contain a null character.

Global static strings now also initialize the *embed_null* member.
The chr(0) singleton ("\0" string) is the only static string which
contains a null character.
  • Loading branch information
vstinner committed Nov 1, 2023
1 parent 102685c commit c026b35
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 10 deletions.
9 changes: 8 additions & 1 deletion Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,16 @@ typedef struct {
unsigned int ascii:1;
/* The object is statically allocated. */
unsigned int statically_allocated:1;
// Does the string embed null characters? Possible values:
// 0: No
// 1: Yes
// 2: Unknown, the string must be scanned
// 3: Invalid state (must not be used)
// Cache used by PyUnicode_AsUTF8() to avoid calling strlen().
unsigned int embed_null:2;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
unsigned int :22;
} state;
} PyASCIIObject;

Expand Down
12 changes: 9 additions & 3 deletions Include/internal/pycore_runtime_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ extern PyTypeObject _PyExc_MemoryError;
_PyBytes_SIMPLE_INIT((CH), 1) \
}

#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \
#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII, EMBED_NUL) \
{ \
.ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type), \
.length = sizeof(LITERAL) - 1, \
Expand All @@ -225,11 +225,17 @@ extern PyTypeObject _PyExc_MemoryError;
.compact = 1, \
.ascii = (ASCII), \
.statically_allocated = 1, \
.embed_null = (EMBED_NUL), \
}, \
}
#define _PyASCIIObject_INIT(LITERAL) \
{ \
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 0), \
._data = (LITERAL) \
}
#define _PyASCIIObject_INIT_embed_null(LITERAL) \
{ \
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 1), \
._data = (LITERAL) \
}
#define INIT_STR(NAME, LITERAL) \
Expand All @@ -239,7 +245,7 @@ extern PyTypeObject _PyExc_MemoryError;
#define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \
{ \
._latin1 = { \
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0, 0), \
.utf8 = (UTF8), \
.utf8_length = sizeof(UTF8) - 1, \
}, \
Expand Down
2 changes: 1 addition & 1 deletion Include/internal/pycore_runtime_init_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is
used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string
contains a null character. Strings created by :c:func:`PyUnicode_FromString`
initialize *embed_null* to 0, since such a string cannot contain a null character.
Patch by Victor Stinner.
7 changes: 6 additions & 1 deletion Modules/_testcapi/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,12 @@ unicode_fromstring(PyObject *self, PyObject *arg)
if (!PyArg_Parse(arg, "z#", &s, &size)) {
return NULL;
}
return PyUnicode_FromString(s);
PyObject *unicode = PyUnicode_FromString(s);
if (unicode == NULL) {
return NULL;
}
assert(((PyASCIIObject*)unicode)->state.embed_null == 0);
return unicode;
}

/* Test PyUnicode_FromKindAndData() */
Expand Down
47 changes: 44 additions & 3 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
#endif
static inline Py_ssize_t
findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
int direction);


// Return a reference to the immortal empty string singleton.
Expand Down Expand Up @@ -623,6 +627,15 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
}
CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
}

if (_PyUnicode_STATE(ascii).embed_null != 2) {
Py_ssize_t pos = findchar(PyUnicode_DATA(ascii),
PyUnicode_KIND(ascii),
PyUnicode_GET_LENGTH(ascii),
0, 1);
assert(_PyUnicode_STATE(ascii).embed_null == (pos >= 0));
}

return 1;

#undef CHECK
Expand Down Expand Up @@ -1253,6 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
_PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
_PyUnicode_STATE(unicode).statically_allocated = 0;
_PyUnicode_STATE(unicode).embed_null = 2;
if (is_ascii) {
((char*)data)[size] = 0;
}
Expand Down Expand Up @@ -1890,7 +1904,16 @@ PyUnicode_FromString(const char *u)
PyErr_SetString(PyExc_OverflowError, "input too long");
return NULL;
}
return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
PyObject *unicode;
unicode = PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
if (unicode != NULL) {
// PyUnicode_DecodeUTF8Stateful(u, strlen(u)) cannot create NUL
// characters: the UTF-8 decoder with the strict error handler only
// creates a NUL character if the input string contains a NUL byte
// which cannot be the case here.
_PyUnicode_STATE(unicode).embed_null = 0;
}
return unicode;
}


Expand Down Expand Up @@ -1932,6 +1955,7 @@ _PyUnicode_FromId(_Py_Identifier *id)
if (!obj) {
return NULL;
}
_PyUnicode_STATE(obj).embed_null = 0;
PyUnicode_InternInPlace(&obj);

if (index >= ids->size) {
Expand Down Expand Up @@ -3846,10 +3870,27 @@ PyUnicode_AsUTF8(PyObject *unicode)
{
Py_ssize_t size;
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
if (utf8 != NULL && strlen(utf8) != (size_t)size) {
PyErr_SetString(PyExc_ValueError, "embedded null character");
if (utf8 == NULL) {
return NULL;
}

// Cache to avoid calling O(n) strlen() operation at every
// PyUnicode_AsUTF8() call on the same object.
if (_PyUnicode_STATE(unicode).embed_null == 2) {
if (strlen(utf8) != (size_t)size) {
_PyUnicode_STATE(unicode).embed_null = 1;
}
else {
_PyUnicode_STATE(unicode).embed_null = 0;
}
}

if (_PyUnicode_STATE(unicode).embed_null == 1) {
PyErr_SetString(PyExc_ValueError,
"embedded null character");
return NULL;
}

return utf8;
}

Expand Down
13 changes: 12 additions & 1 deletion Tools/build/generate_global_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ def open_for_changes(filename, orig):
def generate_global_strings(identifiers, strings):
filename = os.path.join(INTERNAL, 'pycore_global_strings.h')

# NUL characters are not supported; see _PyASCIIObject_INIT_embed_null().
for identifier in identifiers:
if "\0" in identifier:
raise Exception(f"identifier contains embedded null character: {identifier!r}")
for string in strings:
if "\0" in string:
raise Exception(f"string contains embedded null character: {string!r}")

# Read the non-generated part of the file.
with open(filename) as infile:
orig = infile.read()
Expand Down Expand Up @@ -321,7 +329,10 @@ def generate_runtime_init(identifiers, strings):
printer.write('')
with printer.block('#define _Py_str_ascii_INIT', continuation=True):
for i in range(128):
printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
if i == 0:
printer.write(f'_PyASCIIObject_INIT_embed_null("\\x{i:02x}"),')
else:
printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]')
printer.write('')
with printer.block('#define _Py_str_latin1_INIT', continuation=True):
Expand Down

0 comments on commit c026b35

Please sign in to comment.