From 935fef29bd825eb99638247ac614f4f0da2229b1 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Tue, 22 Oct 2024 07:18:36 +0200
Subject: [PATCH] Optimize DOM HTML serialization for UTF-8 (#16376)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Use a direct call for decoding the UTF-8 buffer

* Add fast path for UTF-8 HTML serialization

This patch adds a fast path to HTML serialization for the case where the
output encoding is UTF-8. Because the DOM internally represents all strings
as UTF-8, no transcoding is needed and we only need to validate the data.

Tested on Wikipedia English home page on an i7-4790:
```
Benchmark 1: ./sapi/cli/php x.php
  Time (mean ± σ):     516.0 ms ±   6.4 ms    [User: 511.2 ms, System: 3.5 ms]
  Range (min … max):   506.0 ms … 527.1 ms    10 runs

Benchmark 2: ./sapi/cli/php_old x.php
  Time (mean ± σ):     682.8 ms ±   6.5 ms    [User: 676.8 ms, System: 3.8 ms]
  Range (min … max):   675.8 ms … 695.6 ms    10 runs

Summary
  ./sapi/cli/php x.php ran
    1.32 ± 0.02 times faster than ./sapi/cli/php_old x.php
```

(And if you're interested: it takes over a second on my machine using the old DOMDocument class.)

Future optimizations are certainly possible, but let's start here.
---
 ext/dom/html_document.c | 78 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 5 deletions(-)

diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c
index 3ca812bde50b..3e4ceeb14f2b 100644
--- a/ext/dom/html_document.c
+++ b/ext/dom/html_document.c
@@ -570,12 +570,11 @@ static bool dom_decode_encode_fast_path(
 		const lxb_char_t *buf_ref_backup = buf_ref;
 		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
 		if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
-			size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
 			if (!dom_process_parse_chunk(
 				ctx,
 				document,
 				parser,
-				buf_ref - last_output - skip,
+				buf_ref_backup - last_output,
 				last_output,
 				buf_ref - last_output,
 				tokenizer_error_offset,
@@ -1208,6 +1207,68 @@ static zend_result dom_write_output_stream(void *application_data, const char *b
 	return SUCCESS;
 }
 
+/* Fast path when the output encoding is UTF-8: buf is only validated, valid byte runs are passed to write_output unchanged and each invalid sequence is written as LXB_ENCODING_REPLACEMENT_BYTES. Returns FAILURE as soon as write_output does not return SUCCESS. */
+static zend_result dom_saveHTML_write_string_len_utf8_output(void *application_data, const char *buf, size_t len)
+{
+	dom_output_ctx *output = (dom_output_ctx *) application_data;
+
+	output->decode->status = LXB_STATUS_OK;
+
+	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
+	const lxb_char_t *last_output = buf_ref; /* Start of the bytes not yet passed to write_output */
+	const lxb_char_t *buf_end = buf_ref + len;
+
+	while (buf_ref != buf_end) {
+		const lxb_char_t *buf_ref_backup = buf_ref; /* Where the sequence decoded in this iteration starts */
+		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(output->decode, &buf_ref, buf_end);
+		if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) { /* Not a code point: more input is needed or the bytes are invalid */
+			if (UNEXPECTED(output->write_output( /* Write the pending bytes that precede this sequence */
+				output->output_data,
+				(const char *) last_output,
+				buf_ref_backup - last_output
+			) != SUCCESS)) {
+				return FAILURE;
+			}
+
+			if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
+				ZEND_ASSERT(buf_ref == buf_end);
+				/* The decoder needs more data, but the entire buffer has been consumed.
+				 * All valid data has been output; if the remaining data for the code point
+				 * turns out to be invalid, the next call will output the replacement bytes. */
+				output->decode->status = LXB_STATUS_CONTINUE;
+				return SUCCESS;
+			}
+
+			if (UNEXPECTED(output->write_output( /* Substitute the replacement bytes for the invalid sequence */
+				output->output_data,
+				(const char *) LXB_ENCODING_REPLACEMENT_BYTES,
+				LXB_ENCODING_REPLACEMENT_SIZE
+			) != SUCCESS)) {
+				return FAILURE;
+			}
+
+			last_output = buf_ref; /* The next pending run starts after the bytes the decoder consumed */
+		}
+	}
+
+	if (buf_ref != last_output) { /* Write whatever is still pending at the end of the buffer */
+		if (UNEXPECTED(output->write_output(
+			output->output_data,
+			(const char *) last_output,
+			buf_ref - last_output
+		) != SUCCESS)) {
+			return FAILURE;
+		}
+	}
+
+	return SUCCESS;
+}
+
+static zend_result dom_saveHTML_write_string_utf8_output(void *application_data, const char *buf)
+{ /* Variant for NUL-terminated input: the length is taken with strlen() and the work is delegated to the length-taking fast path. */
+	return dom_saveHTML_write_string_len_utf8_output(application_data, buf, strlen(buf));
+}
+
 static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
 {
 	dom_output_ctx *output = (dom_output_ctx *) application_data;
@@ -1216,7 +1277,7 @@ static zend_result dom_saveHTML_write_string_len(void *application_data, const c
 	const lxb_char_t *buf_end = buf_ref + len;
 
 	do {
-		decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
+		decode_status = lxb_encoding_decode_utf_8(output->decode, &buf_ref, buf_end);
 
 		const lxb_codepoint_t *codepoints_ref = output->codepoints;
 		const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
@@ -1272,8 +1333,15 @@ static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *inter
 	output_ctx->encoding_output = encoding_output;
 
 	dom_html5_serialize_context ctx;
-	ctx.write_string_len = dom_saveHTML_write_string_len;
-	ctx.write_string = dom_saveHTML_write_string;
+	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
+		/* Fast path */
+		ctx.write_string_len = dom_saveHTML_write_string_len_utf8_output;
+		ctx.write_string = dom_saveHTML_write_string_utf8_output;
+	} else {
+		/* Slow path */
+		ctx.write_string_len = dom_saveHTML_write_string_len;
+		ctx.write_string = dom_saveHTML_write_string;
+	}
 	ctx.application_data = output_ctx;
 	ctx.private_data = php_dom_get_private_data(intern);
 	if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {