Add UTF-8 to UTF-32 decoding

Add internal functions to convert UTF-32 to UTF-8, with corresponding tests.
xkbcommon · Sep 23, 2024 · 98dee22 · 98dee22
1 parent b5f0779
commit 98dee22
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 1 deletion.
diff --git a/meson.build b/meson.build
@@ -753,7 +753,13 @@ test(
 )
 test(
     'utf8',
-    executable('test-utf8', 'test/utf8.c', dependencies: test_dep),
+    executable(
+        'test-utf8',
+        'test/utf8.c',
+        'src/utf8-decoding.c',
+        'src/utf8-decoding.h',
+        dependencies: test_dep
+    ),
     env: test_env,
 )
 test(

diff --git a/src/utf8-decoding.c b/src/utf8-decoding.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2024 Pierre Le Marre <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+
+#include "utf8-decoding.h"
+
+/* Array mapping the leading byte to the length of a UTF-8 sequence.
+ * A value of zero indicates that the byte can not begin a UTF-8 sequence. */
+static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
+};
+
+/* Length of next utf-8 sequence */
+uint8_t
+utf8_sequence_length(const char *s)
+{
+    return utf8_sequence_length_by_leading_byte[(unsigned char)s[0]];
+}
+
+/* Reads the next UTF-8 sequence in a string */
+uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
+{
+    uint32_t cp = 0;
+    uint8_t len = utf8_sequence_length(s);
+    *size_out = 0;
+
+    if (!max_size || len > max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Handle leading byte */
+    switch (len) {
+    case 1:
+        *size_out = 1;
+        return (uint32_t)s[0];
+    case 2:
+        cp = (uint32_t)s[0] & 0x1f;
+        break;
+    case 3:
+        cp = (uint32_t)s[0] & 0x0f;
+        break;
+    case 4:
+        cp = (uint32_t)s[0] & 0x07;
+        break;
+    default:
+        return INVALID_UTF8_CODE_POINT;
+    }
+
+    /* Process remaining bytes of the UTF-8 sequence */
+    for (size_t k = 1; k < len; k++) {
+        if (((uint32_t)s[k] & 0xc0) != 0x80)
+            return INVALID_UTF8_CODE_POINT;
+        cp <<= 6;
+        cp |= (uint32_t)s[k] & 0x3f;
+    }
+
+    /* Check surrogates */
+    if (cp >= 0xd800 && cp <= 0xdfff)
+        return INVALID_UTF8_CODE_POINT;
+
+    *size_out = len;
+    return cp;
+}
diff --git a/src/utf8-decoding.h b/src/utf8-decoding.h
@@ -0,0 +1,20 @@
+
+#ifndef UTF8_DECODING_H
+#define UTF8_DECODING_H
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Check if a char is the start of a UTF-8 sequence */
+#define is_utf8_start(c) (((c) & 0xc0) != 0x80)
+#define INVALID_UTF8_CODE_POINT UINT32_MAX
+
+uint8_t
+utf8_sequence_length(const char *s);
+
+uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out);
+
+#endif
diff --git a/test/utf8.c b/test/utf8.c
@@ -32,6 +32,7 @@
 #include "src/keysym.h"
 #include "test.h"
 #include "utf8.h"
+#include "utf8-decoding.h"
 #include "utils.h"
 
 #define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
@@ -179,13 +180,34 @@ test_utf32_to_utf8(void)
     check_utf32_to_utf8(0xffffffff, 0, "");
 }
 
+static void
+/* Check roundtrip UTF-32 → UTF-8 → UTF-32 */
+test_utf8_to_utf32(void)
+{
+    char buffer[XKB_KEYSYM_UTF8_MAX_SIZE];
+    for (uint32_t cp = 0; cp < 0x10ffff; cp++) {
+        int length = utf32_to_utf8(cp, buffer) - 1;
+        /* Check surrogates */
+        if (cp >= 0xd800 && cp <= 0xdfff) {
+            assert(length == -1);
+        } else {
+            assert(length > 0);
+            size_t length2 = 0;
+            uint32_t cp2 = utf8_next_code_point(buffer, (size_t)length, &length2);
+            assert(cp2 != INVALID_UTF8_CODE_POINT && cp2 == cp &&
+                   length2 == (size_t)length);
+        }
+    }
+}
+
 int
 main(void)
 {
     test_init();
 
     test_is_valid_utf8();
     test_utf32_to_utf8();
+    test_utf8_to_utf32();
 
     return 0;
 }