Skip to content

Commit

Permalink
Add UTF-8 to UTF-32 decoding
Browse files Browse the repository at this point in the history
Add internal functions to convert UTF-8 to UTF-32, with corresponding
tests.
  • Loading branch information
wismill committed Sep 23, 2024
1 parent b5f0779 commit 98dee22
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 1 deletion.
8 changes: 7 additions & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,13 @@ test(
)
test(
'utf8',
executable('test-utf8', 'test/utf8.c', dependencies: test_dep),
executable(
'test-utf8',
'test/utf8.c',
'src/utf8-decoding.c',
'src/utf8-decoding.h',
dependencies: test_dep
),
env: test_env,
)
test(
Expand Down
99 changes: 99 additions & 0 deletions src/utf8-decoding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Copyright © 2024 Pierre Le Marre <[email protected]>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

#include "config.h"

#include "utf8-decoding.h"

/* Array mapping the leading byte to the length of a UTF-8 sequence.
 * A value of zero indicates that the byte can not begin a UTF-8 sequence.
 *
 * Layout of the table:
 *   0x00-0x7F: 1 (single-byte sequences, i.e. ASCII)
 *   0x80-0xBF: 0 (continuation bytes of the form 10xxxxxx)
 *   0xC0-0xC1: 0 (could only start an overlong 2-byte encoding)
 *   0xC2-0xDF: 2
 *   0xE0-0xEF: 3
 *   0xF0-0xF4: 4
 *   0xF5-0xFF: 0 (would encode values above U+10FFFF, or are unused)
 *
 * The table is indexed by the byte value, so callers must convert a plain
 * `char` to `unsigned char` before using it as an index. */
static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
};

/* Return the length in bytes of the UTF-8 sequence introduced by the first
 * byte of `s`, or 0 if that byte cannot begin a sequence.
 *
 * Only s[0] is inspected: the following bytes are neither read nor
 * validated, so `s` must point to at least one readable byte. */
uint8_t
utf8_sequence_length(const char *s)
{
    /* `char` may be signed: go through `unsigned char` to get a table index
     * in the range 0..255. */
    const unsigned char lead = (unsigned char) *s;

    return utf8_sequence_length_by_leading_byte[lead];
}

/* Reads the next UTF-8 sequence in a string.
 *
 * s:        buffer to decode from; it does not need to be NUL-terminated.
 * max_size: number of readable bytes in `s`. Nothing is read when it is 0,
 *           and no byte at or past s[max_size] is ever read.
 * size_out: set to the number of bytes consumed on success, and to 0 on
 *           failure. Must not be NULL.
 *
 * Returns the decoded code point, or INVALID_UTF8_CODE_POINT if the buffer
 * is empty, the sequence is truncated, or the sequence is not well-formed
 * UTF-8: invalid leading byte, invalid continuation byte, overlong
 * encoding, surrogate (U+D800..U+DFFF) or value above U+10FFFF. */
uint32_t
utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
{
    /* Smallest code point that requires a sequence of the given length
     * (index = length). Any smaller value is an overlong encoding. */
    static const uint32_t min_code_point[5] = {
        0, 0, 0x80, 0x800, 0x10000
    };

    *size_out = 0;

    /* Must be checked before looking at s[0]: the buffer may be empty */
    if (!max_size)
        return INVALID_UTF8_CODE_POINT;

    /* Work on unsigned bytes so that no value is ever sign-extended */
    const unsigned char *bytes = (const unsigned char *) s;
    const uint8_t len = utf8_sequence_length(s);

    /* Invalid leading byte (len == 0) or truncated sequence */
    if (!len || len > max_size)
        return INVALID_UTF8_CODE_POINT;

    /* Handle leading byte */
    uint32_t cp;
    switch (len) {
    case 1:
        /* ASCII: the byte is the code point */
        *size_out = 1;
        return bytes[0];
    case 2:
        cp = bytes[0] & 0x1f; /* 110xxxxx */
        break;
    case 3:
        cp = bytes[0] & 0x0f; /* 1110xxxx */
        break;
    case 4:
        cp = bytes[0] & 0x07; /* 11110xxx */
        break;
    default:
        return INVALID_UTF8_CODE_POINT;
    }

    /* Process remaining bytes of the UTF-8 sequence: each must be of the
     * form 10xxxxxx and contributes 6 bits */
    for (size_t k = 1; k < len; k++) {
        if ((bytes[k] & 0xc0) != 0x80)
            return INVALID_UTF8_CODE_POINT;
        cp = (cp << 6) | (uint32_t) (bytes[k] & 0x3f);
    }

    /* Reject overlong encodings, e.g. E0 80 80 or F0 80 80 80 for U+0000 */
    if (cp < min_code_point[len])
        return INVALID_UTF8_CODE_POINT;

    /* Reject values beyond the Unicode range, e.g. F4 90 80 80 */
    if (cp > 0x10ffff)
        return INVALID_UTF8_CODE_POINT;

    /* Check surrogates */
    if (cp >= 0xd800 && cp <= 0xdfff)
        return INVALID_UTF8_CODE_POINT;

    *size_out = len;
    return cp;
}
20 changes: 20 additions & 0 deletions src/utf8-decoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

#ifndef UTF8_DECODING_H
#define UTF8_DECODING_H

#include "config.h"

#include <stddef.h>
#include <stdint.h>

/* Check if a char is the start of a UTF-8 sequence, i.e. that it is not a
 * continuation byte of the form 10xxxxxx. Only the two highest bits of the
 * low byte are examined, so this does not guarantee that the byte is a
 * valid leading byte. */
#define is_utf8_start(c) (((c) & 0xc0) != 0x80)
/* Value returned by utf8_next_code_point() when decoding fails. It is
 * larger than any Unicode code point (maximum: 0x10ffff). */
#define INVALID_UTF8_CODE_POINT UINT32_MAX

/* Length in bytes of the UTF-8 sequence introduced by the leading byte
 * s[0], or 0 if that byte cannot begin a sequence. Only s[0] is read. */
uint8_t
utf8_sequence_length(const char *s);

/* Decode the UTF-8 sequence at the start of `s`, which holds `max_size`
 * bytes. On success, return the code point and store the number of bytes
 * consumed in `*size_out`. On failure, return INVALID_UTF8_CODE_POINT and
 * store 0 in `*size_out`. */
uint32_t
utf8_next_code_point(const char *s, size_t max_size, size_t *size_out);

#endif /* UTF8_DECODING_H */
22 changes: 22 additions & 0 deletions test/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "src/keysym.h"
#include "test.h"
#include "utf8.h"
#include "utf8-decoding.h"
#include "utils.h"

#define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
Expand Down Expand Up @@ -179,13 +180,34 @@ test_utf32_to_utf8(void)
check_utf32_to_utf8(0xffffffff, 0, "");
}

/* Check roundtrip UTF-32 → UTF-8 → UTF-32 over the whole Unicode range
 * U+0000..U+10FFFF: every code point that utf32_to_utf8() encodes must be
 * decoded back to the same value by utf8_next_code_point(), consuming
 * exactly the bytes that were produced. Surrogates must fail to encode. */
static void
test_utf8_to_utf32(void)
{
    char buffer[XKB_KEYSYM_UTF8_MAX_SIZE];

    /* Inclusive upper bound: U+10FFFF is itself a valid code point */
    for (uint32_t cp = 0; cp <= 0x10ffff; cp++) {
        /* NOTE(review): utf32_to_utf8() presumably counts the NUL
         * terminator and returns 0 on failure, hence the "- 1" and the
         * expected -1 for surrogates -- confirm against its definition. */
        const int length = utf32_to_utf8(cp, buffer) - 1;

        /* Check surrogates */
        if (cp >= 0xd800 && cp <= 0xdfff) {
            assert(length == -1);
            continue;
        }

        assert(length > 0);

        size_t length2 = 0;
        const uint32_t cp2 =
            utf8_next_code_point(buffer, (size_t) length, &length2);

        /* Separate assertions so that a failure pinpoints the cause */
        assert(cp2 != INVALID_UTF8_CODE_POINT);
        assert(cp2 == cp);
        assert(length2 == (size_t) length);
    }
}

/* Test program entry point: calls test_init(), then runs each UTF-8 test
 * function in turn. The visible test reports failures through assert(), so
 * reaching the final return means no assertion aborted the program. */
int
main(void)
{
/* NOTE(review): presumably sets up the shared test environment; defined
 * outside this file -- confirm in test.h. */
test_init();

test_is_valid_utf8();
test_utf32_to_utf8();
/* Roundtrip UTF-32 → UTF-8 → UTF-32 through utf8_next_code_point() */
test_utf8_to_utf32();

return 0;
}

0 comments on commit 98dee22

Please sign in to comment.