-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement the `?` format specifier for strings and characters (#3656)
This implements [\[format.string.escaped\]], which is part of WG21-P2286R8 "Formatting Ranges" and modified by WG21-P2713R1 "Escaping Improvements In `std::format`". Works towards #2919.

To implement this feature, two arrays, `__printable_ranges` and `_Grapheme_Extend_ranges`, are added to `__msvc_format_ucd_tables.hpp`.

- `__printable_ranges` represents code points whose [`General_Category`] is in the groups `L`, `M`, `N`, `P`, `S` (that is, code points that are *not* from categories `Z` or `C`), plus the ASCII space character.
  - Characters outside of these ranges are always escaped, usually using the `\u{hex-digit-sequence}` format. ([\[format.string.escaped\]/(2.2.1.2.1)])
  - It might make sense to store the unmodified `General_Category`, instead of this invented property. This requires more storage and a new data structure, though.
- `_Grapheme_Extend_ranges` represents code points with the Unicode property `Grapheme_Extend=Yes`.
  - Characters in these ranges are escaped unless they immediately follow an unescaped character. ([\[format.string.escaped\]/(2.2.1.2.2)])
  - It would be more space efficient to reuse the existing data for `Grapheme_Cluster_Break`: `Grapheme_Extend=Yes` is `Grapheme_Cluster_Break=Extend` minus `Emoji_Modifier=Yes`, and `Emoji_Modifier=Yes` is just `1F3FB..1F3FF`. I chose to define a new array for simplicity.

When the literal encoding is not UTF-8, UTF-16, or UTF-32, the set of "separator or non-printable characters" is implementation-defined. In this implementation, the set consists of all characters that correspond to non-printable Unicode code points (that is, code points outside of `__printable_ranges`, see above). If a character is non-printable, it is translated into `\u{XXXX}`, where `XXXX` is the hex value of the Unicode code point (not the original value). If a code unit sequence cannot be converted to a Unicode scalar value, the `\x{XX}` escape sequence is used.
[`General_Category`]: https://www.unicode.org/reports/tr44/#GC_Values_Table
[\[format.string.escaped\]]: http://eel.is/c++draft/format.string.escaped
[\[format.string.escaped\]/(2.2.1.2.1)]: http://eel.is/c++draft/format.string.escaped#2.2.1.2.1
[\[format.string.escaped\]/(2.2.1.2.2)]: http://eel.is/c++draft/format.string.escaped#2.2.1.2.2
- Loading branch information
1 parent
a0f0367
commit 9fdc896
Showing
13 changed files
with
1,757 additions
and
1,095 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
RUNALL_INCLUDE ..\concepts_latest_matrix.lst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
#include <cassert> | ||
#include <format> | ||
#include <string> | ||
#include <string_view> | ||
#include <type_traits> | ||
#include <utility> | ||
|
||
using namespace std; | ||
|
||
// Picks the narrow or the wide spelling of a literal depending on CharT.
// `str` is returned when CharT is char; every other CharT receives `wstr`.
template <typename CharT>
[[nodiscard]] constexpr const CharT* choose_literal(const char* const str, const wchar_t* const wstr) noexcept {
    if constexpr (!std::is_same_v<CharT, char>) {
        return wstr;
    } else {
        return str;
    }
}

// Expands to `Literal` as a `const CharT*`; a type named CharT must be visible where the macro is used.
#define STR(Literal) (choose_literal<CharT>(Literal, L##Literal))
|
||
// Builds the format-argument store that matches charT:
// make_wformat_args for wchar_t, make_format_args for everything else.
//
// The arguments are deliberately passed on as lvalues instead of being forwarded:
// since WG21-P2905R2, make_format_args and make_wformat_args take `Args&...`,
// so forwarding an rvalue such as `T('\n')` no longer compiles. Passing lvalues
// is accepted by every revision of those signatures.
//
// NOTE: the returned store may refer to `vals`; consume it within the same
// full-expression as the call (as `vformat(fmt, make_testing_format_args<C>(x))` does).
template <class charT, class... Args>
auto make_testing_format_args(Args&&... vals) {
    if constexpr (std::is_same_v<charT, wchar_t>) {
        return std::make_wformat_args(vals...);
    } else {
        return std::make_format_args(vals...);
    }
}
|
||
// Verifies the `?` (escaped) presentation of a single character of type T,
// formatted with a CharT format string.
template <class CharT, class T>
void test_escaped_character() {
    // Formats one value with "{:?}" in the CharT flavor of std::format.
    const auto escaped = [](const auto& value) { return std::format(STR("{:?}"), value); };

    // Characters with a dedicated escape sequence; a double quote is left alone inside single quotes.
    assert(escaped(T('\t')) == STR(R"('\t')"));
    assert(escaped(T('\n')) == STR(R"('\n')"));
    assert(escaped(T('\r')) == STR(R"('\r')"));
    assert(escaped(T('\"')) == STR(R"('"')"));
    assert(escaped(T('\'')) == STR(R"('\'')"));
    assert(escaped(T('\\')) == STR(R"('\\')"));

    // Remaining control characters are written as \u{hex}.
    assert(escaped(T('\0')) == STR(R"('\u{0}')"));
    assert(escaped(T('\v')) == STR(R"('\u{b}')"));
    assert(escaped(T('\f')) == STR(R"('\u{c}')"));
    assert(escaped(T('\x7F')) == STR(R"('\u{7f}')"));

    // Both ends of the printable ASCII range are copied through.
    assert(escaped(T(' ')) == STR("' '"));
    assert(escaped(T('~')) == STR("'~'"));

    // Non-ASCII code units can only be spelled portably as wide characters.
    if constexpr (std::is_same_v<CharT, wchar_t> && std::is_same_v<T, wchar_t>) {
        assert(escaped(L'\xA0') == LR"('\u{a0}')"); // U+00A0 NO-BREAK SPACE
        assert(escaped(L'\x300') == LR"('\u{300}')"); // U+0300 COMBINING GRAVE ACCENT

        assert(escaped(L'\xA1') == L"'\xA1'"); // U+00A1 INVERTED EXCLAMATION MARK

        // Lone surrogates are not scalar values, so they use the \x{hex} form.
        assert(escaped(L'\xD800') == LR"('\x{d800}')");
        assert(escaped(L'\xDFFF') == LR"('\x{dfff}')");
    }
}
|
||
// Verifies the `?` (escaped) presentation of strings formatted with a CharT format string.
template <class CharT>
void test_escaped_string() {
    // Formats one value with "{:?}" in the CharT flavor of std::format.
    const auto escaped = [](const auto& value) { return std::format(STR("{:?}"), value); };

    // Characters with a dedicated escape sequence; a single quote is left alone inside double quotes.
    assert(escaped(STR("\t")) == STR(R"("\t")"));
    assert(escaped(STR("\n")) == STR(R"("\n")"));
    assert(escaped(STR("\r")) == STR(R"("\r")"));
    assert(escaped(STR("\"")) == STR("\"\\\"\""));
    assert(escaped(STR("\'")) == STR(R"("'")"));
    assert(escaped(STR("\\")) == STR(R"("\\")"));

    // Remaining control characters are written as \u{hex}.
    assert(escaped(STR("\v")) == STR(R"("\u{b}")"));
    assert(escaped(STR("\f")) == STR(R"("\u{c}")"));
    assert(escaped(STR("\x7F")) == STR(R"("\u{7f}")"));

    // Both ends of the printable ASCII range are copied through.
    assert(escaped(STR(" ")) == STR("\" \""));
    assert(escaped(STR("~")) == STR("\"~\""));

    // Strings with an embedded NUL need an explicit length (9 code units), for both owning and view types.
    const std::basic_string<CharT> owned_input(STR("\0 \n \t \x02 \x1b"), 9);
    const std::basic_string_view<CharT> viewed_input(STR("\0 \n \t \x02 \x1b"), 9);
    assert(std::format(STR("[{:?}]"), owned_input) == STR(R"(["\u{0} \n \t \u{2} \u{1b}"])"));
    assert(std::format(STR("[{:?}]"), viewed_input) == STR(R"(["\u{0} \n \t \u{2} \u{1b}"])"));

    // Non-ASCII content can only be spelled portably as wide strings.
    if constexpr (std::is_same_v<CharT, wchar_t>) {
        assert(escaped(L"\xA0") == LR"("\u{a0}")"); // U+00A0 NO-BREAK SPACE
        assert(escaped(L"\U0010FFFF") == LR"("\u{10ffff}")"); // noncharacter
        assert(escaped(L"\x300") == LR"("\u{300}")"); // U+0300 COMBINING GRAVE ACCENT

        assert(escaped(L"\xA1") == L"\"\xA1\""); // U+00A1 INVERTED EXCLAMATION MARK
        assert(escaped(L"\U00010000") == L"\"\U00010000\""); // U+10000 LINEAR B SYLLABLE B008 A

        // Unpaired or wrongly ordered surrogates use the \x{hex} form, one escape per code unit.
        assert(escaped(L"\xD800") == L"\"\\x{d800}\"");
        assert(escaped(L"\xDFFF") == L"\"\\x{dfff}\"");
        assert(escaped(L"\xDFFF\xD800") == L"\"\\x{dfff}\\x{d800}\"");

        // U+0300 is escaped after an escaped character, but kept after an unescaped one.
        assert(escaped(L"\xA0\x300") == L"\"\\u{a0}\\u{300}\"");
        assert(escaped(L" \x300") == L"\" \x300\"");
        assert(escaped(L"~\x300") == L"\"~\x300\"");
    }
}
|
||
// Verifies how width, alignment and precision combine with the `?` presentation type,
// for a character of type T and for a string, both formatted with a CharT format string.
template <class CharT, class T>
void test_format_specs() {
    // '\n' escapes to the 4 characters `'\n'`, so width 5 adds one fill character.
    assert(std::format(STR("{:5?}"), T('\n')) == STR(R"('\n' )"));

    // A precision on a character argument is rejected. vformat is used so that the
    // error surfaces as a format_error at run time instead of a compile error.
    try {
        (void) std::vformat(STR("{:.3?}"), make_testing_format_args<CharT>(T('\n')));
        assert(false);
    } catch (const std::format_error&) {
    }

    // "\n" escapes to the 4 characters `"\n"`, so width 5 adds one fill character.
    assert(std::format(STR("{:5?}"), STR("\n")) == STR(R"("\n" )"));
    // Precision 3 cuts the escaped text down to `"\n`.
    assert(std::format(STR("{:.3?}"), STR("\n")) == STR(R"("\n)"));
    // Width 5 on top of precision 3 needs TWO fill characters: after the text by default, before it with `>`.
    assert(std::format(STR("{:5.3?}"), STR("\n")) == STR(R"("\n  )"));
    assert(std::format(STR("{:>5.3?}"), STR("\n")) == STR(R"(  "\n)"));
}
|
||
// Runs every test for each supported combination of format-string character type
// and character-argument type.
int main() {
    // char format strings
    test_escaped_character<char, char>();
    test_escaped_string<char>();
    test_format_specs<char, char>();

    // wchar_t format strings, with both char and wchar_t character arguments
    test_escaped_character<wchar_t, char>();
    test_escaped_character<wchar_t, wchar_t>();
    test_escaped_string<wchar_t>();
    test_format_specs<wchar_t, char>();
    test_format_specs<wchar_t, wchar_t>();

    return 0;
}
28 changes: 28 additions & 0 deletions
28
tests/std/tests/P2286R8_formatting_ranges_legacy_text_encoding/env.lst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
# This is `concepts_latest_matrix.lst` with `/execution-charset:.932` added. | ||
# clang is excluded since it doesn't support non-UTF-8 execution charsets. | ||
|
||
RUNALL_INCLUDE ..\prefix.lst | ||
RUNALL_CROSSLIST | ||
PM_CL="/w14640 /Zc:threadSafeInit- /EHsc /std:c++latest /execution-charset:.932" | ||
RUNALL_CROSSLIST | ||
PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:noexceptTypes-" | ||
PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" | ||
PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:char8_t- /Zc:preprocessor" | ||
PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:wchar_t-" | ||
PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" | ||
PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /fp:except /Zc:preprocessor" | ||
PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive-" | ||
PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /analyze:only /analyze:autolog-" | ||
PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" | ||
PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /fp:strict" | ||
PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" | ||
PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive" | ||
PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /analyze:only /analyze:autolog-" | ||
# PM_CL="/permissive- /BE /c /MD" | ||
# PM_CL="/permissive- /BE /c /MTd" | ||
# PM_COMPILER="clang-cl" PM_CL="-fno-ms-compatibility -fno-delayed-template-parsing -Wno-unqualified-std-cast-call /permissive- /MD" | ||
# PM_COMPILER="clang-cl" PM_CL="-fno-ms-compatibility -fno-delayed-template-parsing -Wno-unqualified-std-cast-call /permissive- /MTd /fp:strict" | ||
# PM_COMPILER="clang-cl" PM_CL="-fno-ms-compatibility -fno-delayed-template-parsing -Wno-unqualified-std-cast-call /permissive- /MT /fp:strict -fsanitize=undefined -fno-sanitize-recover=undefined" |
22 changes: 22 additions & 0 deletions
22
tests/std/tests/P2286R8_formatting_ranges_legacy_text_encoding/test.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
#include <cassert> | ||
#include <format> | ||
|
||
using namespace std; | ||
|
||
// Verifies the `?` (escaped) presentation of narrow strings whose bytes are given as
// double-byte sequences rather than UTF-8.
// NOTE(review): the accompanying env.lst builds this test with /execution-charset:.932 - the
// expectations below presumably depend on that setting.
void test_escaped_string() {
    // Formats one value with "{:?}".
    const auto escaped = [](const auto& text) { return std::format("{:?}", text); };

    // A two-byte sequence expected to be escaped by its code point.
    assert(escaped("\x81\x40") == "\"\\u{3000}\""); // U+3000 IDEOGRAPHIC SPACE

    // A two-byte sequence expected to be copied through unchanged.
    assert(escaped("\x81\x41") == "\"\x81\x41\"");

    // Bytes expected to be escaped individually with the \x{hex} form.
    assert(escaped("\x81") == "\"\\x{81}\"");
    assert(escaped("\xEB!") == "\"\\x{eb}!\"");

    // All three outcomes in one string.
    assert(escaped("\x81\x40\x40\x81") == "\"\\u{3000}\x40\\x{81}\"");
}
|
||
// Entry point: runs the escaped-string checks once.
int main() {
    test_escaped_string();
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
RUNALL_INCLUDE ..\concepts_latest_matrix.lst | ||
RUNALL_CROSSLIST | ||
PM_CL="/utf-8" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
#include <cassert> | ||
#include <clocale> | ||
#include <format> | ||
|
||
using namespace std; | ||
|
||
// Verifies the `?` (escaped) presentation of narrow strings written with universal
// character names and raw bytes.
// NOTE(review): the accompanying env.lst adds /utf-8 - the expectations below presumably
// rely on a UTF-8 literal encoding.
void test_escaped_string() {
    // Formats one value with "{:?}".
    const auto escaped = [](const auto& text) { return std::format("{:?}", text); };

    // Code points expected to be escaped as \u{hex}.
    assert(escaped("\u00A0") == "\"\\u{a0}\""); // U+00A0 NO-BREAK SPACE
    assert(escaped("\u0300") == "\"\\u{300}\""); // U+0300 COMBINING GRAVE ACCENT

    // Code points expected to be copied through unchanged.
    assert(escaped("\u00A1") == "\"\u00A1\""); // U+00A1 INVERTED EXCLAMATION MARK
    assert(escaped("\U00010000") == "\"\U00010000\""); // U+10000 LINEAR B SYLLABLE B008 A

    // Each byte of an invalid sequence gets its own \x{hex} escape.
    assert(escaped("\xC0\x80") == "\"\\x{c0}\\x{80}\""); // ill-formed code unit sequence

    // U+0300 is escaped after an escaped character, but kept after an unescaped one.
    assert(escaped("\u00A0\u0300") == "\"\\u{a0}\\u{300}\"");
    assert(escaped(" \u0300") == "\" \u0300\"");
    assert(escaped("a\u0300") == "\"a\u0300\"");
}
|
||
// Runs the escaped-string checks under the startup locale, then again after switching
// the C locale to ".1252", ".932" and ".UTF-8" in turn.
int main() {
    test_escaped_string();

    // setlocale is called OUTSIDE of assert: an assert argument is not evaluated when
    // NDEBUG is defined, which would silently skip the locale switch and re-run the
    // checks under the previous locale.
    [[maybe_unused]] const char* locale_result = setlocale(LC_ALL, ".1252");
    assert(locale_result != nullptr);
    test_escaped_string();

    locale_result = setlocale(LC_ALL, ".932");
    assert(locale_result != nullptr);
    test_escaped_string();

    locale_result = setlocale(LC_ALL, ".UTF-8");
    assert(locale_result != nullptr);
    test_escaped_string();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.