Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<format> assumes strings are encoded in the active code page #1834

Merged
merged 7 commits into from
Apr 20, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions stl/inc/format
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
if (_Ch < _Bound) {
return _Result;
}
_Result ^= 1;
_Result ^= 0b11u; // Flip between 1 and 2 on each iteration
}

return 1;
Expand Down Expand Up @@ -564,7 +564,7 @@ private:
}

public:
_NODISCARD int _Units_in_next_character(const char* _First, const char* _Last) const noexcept {
_NODISCARD int _Units_in_next_character(const char* const _First, const char* const _Last) const noexcept {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is not a valid lead byte.
Expand All @@ -589,7 +589,7 @@ public:
}
}

_NODISCARD const char* _Find_encoded(const char* _First, const char* _Last, const char _Val) const {
_NODISCARD const char* _Find_encoded(const char* _First, const char* const _Last, const char _Val) const {
// Returns the first occurrence of _Val as an encoded character (and not, for example, as a
// continuation byte) in [_First, _Last).
if constexpr (_Statically_Utf8) {
Expand Down Expand Up @@ -632,7 +632,7 @@ public:
template <bool _Statically_Utf8>
class _Fmt_codec<wchar_t, _Statically_Utf8> {
public:
_NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* _Last) const noexcept {
_NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* const _Last) const noexcept {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is an unpaired surrogate.
Expand All @@ -653,7 +653,8 @@ public:
return 2; // surrogate pair
}

_NODISCARD const wchar_t* _Find_encoded(const wchar_t* _First, const wchar_t* _Last, const wchar_t _Val) const {
_NODISCARD const wchar_t* _Find_encoded(
const wchar_t* const _First, const wchar_t* const _Last, const wchar_t _Val) const {
// Searches [_First, _Last) for _Val by forwarding straight to _Find_unchecked and returns
// whatever that call returns; this overload does no decoding of its own.
return _Find_unchecked(_First, _Last, _Val);
}

Expand All @@ -666,7 +667,7 @@ public:
}

// surrogate pair
_Ch = (_Ch - 0xD8000u) << 10;
_Ch = (_Ch - 0xD800u) << 10;
_Ch += static_cast<char32_t>(_Ptr[1]) - 0xDC00u;
_Ch += 0x10000u;
return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,36 @@ void test_parse_align() {
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv});
}

// Validates the width estimation applied when a string argument is formatted with a precision.
//
// Every test string ends with a single-code-unit delimiter ("\x58"), and `width` is the expected
// estimated width of the whole string including that delimiter. Formatting with a precision of
// `width - 1` must therefore produce exactly the string minus its trailing delimiter.
void test_width_estimation() {
    struct test_case {
        const char* str; // test string, terminated by the "\x58" delimiter
        int width; // expected estimated width of `str`, delimiter included
    };
    constexpr test_case test_cases[] = {
        // Pick "short" and "long" codepoints (\x20 and \x96\x7b), then form
        // all permutations of 3-codepoint prefixes. This gives us coverage of
        // all transitions (e.g. short-to-long, long-to-long).
        {"\x20\x20\x20\x58", 4},
        {"\x20\x20\x96\x7b\x58", 5},
        {"\x20\x96\x7b\x20\x58", 5},
        {"\x96\x7b\x20\x20\x58", 5},
        {"\x20\x96\x7b\x96\x7b\x58", 6},
        {"\x96\x7b\x20\x96\x7b\x58", 6},
        {"\x96\x7b\x96\x7b\x20\x58", 6},
        {"\x96\x7b\x96\x7b\x96\x7b\x58", 7},
    };

    for (const auto& test : test_cases) {
        basic_string_view sv{test.str};
        sv = sv.substr(0, sv.size() - 1); // expected output: everything except the delimiter
        assert(format("{:.{}}", test.str, test.width - 1) == sv);
    }
}

// Entry point: invokes the test functions below one after another, in this fixed order.
int main() {
test_multibyte_format_strings();
test_parse_align();
test_width_estimation();
}
119 changes: 105 additions & 14 deletions tests/std/tests/P0645R10_text_formatting_utf8/test.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#define _FORMAT_CODEPAGE __std_code_page::_Utf8

#include <cassert>
#include <clocale>
#include <format>
Expand All @@ -14,12 +12,12 @@ using namespace std;

void test_multibyte_format_strings() {
{
// Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL)
assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32");
// Filling with footballs ("\U0001f3c8" is U+1F3C8 AMERICAN FOOTBALL)
assert(format("{:\U0001f3c8>4}"sv, 42) == "\U0001f3c8\U0001f3c8\x34\x32");

assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv);
assert(format("{:\U0001f3c8<4.2}", "1") == "\x31\U0001f3c8\U0001f3c8\U0001f3c8"sv);
assert(format("{:\U0001f3c8^4.2}", "1") == "\U0001f3c8\x31\U0001f3c8\U0001f3c8"sv);
assert(format("{:\U0001f3c8>4.2}", "1") == "\U0001f3c8\U0001f3c8\U0001f3c8\x31"sv);
}

{
Expand All @@ -35,19 +33,112 @@ void test_parse_align() {
auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;

{
// "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88<X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88>X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\U0001f3c8<X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\U0001f3c8"sv});
test_parse_helper(parse_align_fn, "\U0001f3c8>X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\U0001f3c8"sv});
test_parse_helper(parse_align_fn, "\U0001f3c8^X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\U0001f3c8"sv});
}
}

// Validates the width estimation applied when a string argument is formatted with a precision,
// for the character type CharT.
//
// Every test string ends with a single-code-unit delimiter ("\x58"), and `width` is the expected
// estimated width of the whole string including that delimiter. Formatting with a precision of
// `width - 1` must therefore produce exactly the string minus its trailing delimiter.
template <class CharT>
void test_width_estimation() {
    struct test_case {
        const CharT* str; // test string, terminated by the "\x58" delimiter
        int width; // expected estimated width of `str`, delimiter included
    };
    constexpr test_case test_cases[] = {
        {TYPED_LITERAL(CharT, "\x58"), 1},
        {TYPED_LITERAL(CharT, "x\x58"), 2},
        // Test the boundaries of the intervals defined in N4885 [format.str.std]/11.
        {TYPED_LITERAL(CharT, "\u10ff\x58"), 2},
        {TYPED_LITERAL(CharT, "\u1100\x58"), 3},
        {TYPED_LITERAL(CharT, "\u115f\x58"), 3},
        {TYPED_LITERAL(CharT, "\u1160\x58"), 2},
        {TYPED_LITERAL(CharT, "\u2328\x58"), 2},
        {TYPED_LITERAL(CharT, "\u2329\x58"), 3},
        {TYPED_LITERAL(CharT, "\u232a\x58"), 3},
        {TYPED_LITERAL(CharT, "\u232b\x58"), 2},
        {TYPED_LITERAL(CharT, "\u2e7f\x58"), 2},
        {TYPED_LITERAL(CharT, "\u2e80\x58"), 3},
        {TYPED_LITERAL(CharT, "\u303e\x58"), 3},
        {TYPED_LITERAL(CharT, "\u303f\x58"), 2},
        {TYPED_LITERAL(CharT, "\u3040\x58"), 3},
        {TYPED_LITERAL(CharT, "\ua4cf\x58"), 3},
        {TYPED_LITERAL(CharT, "\ua4d0\x58"), 2},
        {TYPED_LITERAL(CharT, "\uabff\x58"), 2},
        {TYPED_LITERAL(CharT, "\uac00\x58"), 3},
        {TYPED_LITERAL(CharT, "\ud7a3\x58"), 3},
        {TYPED_LITERAL(CharT, "\ud7a4\x58"), 2},
        {TYPED_LITERAL(CharT, "\ud7ff\x58"), 2},
        // skip over the surrogate pair range (\ud800-\udfff)
        {TYPED_LITERAL(CharT, "\ue000\x58"), 2},
        {TYPED_LITERAL(CharT, "\uf8ff\x58"), 2},
        {TYPED_LITERAL(CharT, "\uf900\x58"), 3},
        {TYPED_LITERAL(CharT, "\ufaff\x58"), 3},
        {TYPED_LITERAL(CharT, "\ufb00\x58"), 2},
        {TYPED_LITERAL(CharT, "\ufe0f\x58"), 2},
        {TYPED_LITERAL(CharT, "\ufe10\x58"), 3},
        {TYPED_LITERAL(CharT, "\ufe19\x58"), 3},
        {TYPED_LITERAL(CharT, "\ufe1a\x58"), 2},
        {TYPED_LITERAL(CharT, "\ufe2f\x58"), 2},
        {TYPED_LITERAL(CharT, "\ufe30\x58"), 3},
        {TYPED_LITERAL(CharT, "\ufe6f\x58"), 3},
        {TYPED_LITERAL(CharT, "\ufe70\x58"), 2},
        {TYPED_LITERAL(CharT, "\ufeff\x58"), 2},
        {TYPED_LITERAL(CharT, "\uff00\x58"), 3},
        {TYPED_LITERAL(CharT, "\uff60\x58"), 3},
        {TYPED_LITERAL(CharT, "\uff61\x58"), 2},
        {TYPED_LITERAL(CharT, "\uffdf\x58"), 2},
        {TYPED_LITERAL(CharT, "\uffe0\x58"), 3},
        {TYPED_LITERAL(CharT, "\uffe6\x58"), 3},
        {TYPED_LITERAL(CharT, "\uffe7\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0001f2ff\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0001f300\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0001f64f\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0001f650\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0001f8ff\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0001f900\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0001f9ff\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0001fa00\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0001ffff\x58"), 2},
        {TYPED_LITERAL(CharT, "\U00020000\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0002fffd\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0002fffe\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0002ffff\x58"), 2},
        {TYPED_LITERAL(CharT, "\U00030000\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0003fffd\x58"), 3},
        {TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2},
        {TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2},

        // Pick "short" and "long" codepoints (\u2000 and \ufe40), then form
        // all permutations of 3-codepoint prefixes. This gives us coverage of
        // all transitions (e.g. short-to-long, long-to-long).
        {TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4},
        {TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5},
        {TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5},
        {TYPED_LITERAL(CharT, "\ufe40\u2000\u2000\x58"), 5},
        {TYPED_LITERAL(CharT, "\u2000\ufe40\ufe40\x58"), 6},
        {TYPED_LITERAL(CharT, "\ufe40\u2000\ufe40\x58"), 6},
        {TYPED_LITERAL(CharT, "\ufe40\ufe40\u2000\x58"), 6},
        {TYPED_LITERAL(CharT, "\ufe40\ufe40\ufe40\x58"), 7},
    };

    for (const auto& test : test_cases) {
        basic_string_view sv{test.str};
        sv = sv.substr(0, sv.size() - 1); // expected output: everything except the delimiter
        assert(format(TYPED_LITERAL(CharT, "{:.{}}"), test.str, test.width - 1) == sv);
    }
}

// Runs the test groups in order; width estimation is exercised once for char and once for
// wchar_t.
void run_tests() {
test_multibyte_format_strings();
test_parse_align();
test_width_estimation<char>();
test_width_estimation<wchar_t>();
}

int main() {
Expand Down