From 8d298aada04645ffcaf54463117c97b6d3e8ef70 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Tue, 13 Apr 2021 13:57:32 -0700 Subject: [PATCH 1/7] `format` assumes strings are encoded in the active code page Refactors all `format` functionality with knowledge of character encodings into a new class `_Fmt_codec`. `_Fmt_codec` internally caches the `_Cvtvec` structure that other functions previously passed around, and makes use of a new "pure C ABI function" `__std_get_cvt` to retrieve character conversion info for the active codepage. The ABI function reuses the `__std_code_page` and `__std_win_error` types from ``. These could be extracted into a more general `__msvc_win_abi.hpp` header, but I think `` is lightweight enough to just include directly for now. Drive-by: * BUG: `_Parse_precision` was not skipping the trailing `}` in a dynamic precision. * Improvement: `test_parse_helper` now verifies that the parse consumes the entire input when passed an expected length of `npos`. --- stl/CMakeLists.txt | 1 + stl/inc/format | 397 ++++++++++-------- stl/inc/xfilesystem_abi.h | 2 +- stl/src/format.cpp | 41 ++ tests/std/include/test_format_support.hpp | 5 +- .../test.cpp | 27 -- .../env.lst | 7 +- .../test.cpp | 67 ++- .../P0645R10_text_formatting_parsing/test.cpp | 18 +- .../P0645R10_text_formatting_utf8/env.lst | 26 +- .../P0645R10_text_formatting_utf8/test.cpp | 4 + 11 files changed, 326 insertions(+), 269 deletions(-) create mode 100644 stl/src/format.cpp diff --git a/stl/CMakeLists.txt b/stl/CMakeLists.txt index 751c6ad78a..12fa2b9255 100644 --- a/stl/CMakeLists.txt +++ b/stl/CMakeLists.txt @@ -249,6 +249,7 @@ endforeach() # Objs that exist in both libcpmt[d][01].lib and msvcprt[d].lib. set(IMPLIB_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/filesystem.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/format.cpp ${CMAKE_CURRENT_LIST_DIR}/src/locale0_implib.cpp ${CMAKE_CURRENT_LIST_DIR}/src/nothrow.cpp ${CMAKE_CURRENT_LIST_DIR}/src/sharedmutex.cpp diff --git a/stl/inc/format b/stl/inc/format index 939581aff9..dea6e05e8c 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -53,6 +53,7 @@ #include #include #include +#include #include #pragma pack(push, _CRT_PACKING) @@ -62,6 +63,9 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new +extern "C" _NODISCARD __std_win_error __stdcall __std_get_cvt( + const __std_code_page _Codepage, _Cvtvec* const _Pcvt) noexcept; + _STD_BEGIN class format_error : public runtime_error { @@ -444,95 +448,231 @@ _NODISCARD constexpr bool _Is_execution_charset_utf8() { #pragma warning(pop) } -inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8(); +inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11 + 0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u, + 0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u}; -_NODISCARD constexpr int _Utf8_code_units_in_next_character( - const char* const _First, const char* const _Last) noexcept { - // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last), - // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. - const auto _Ch = static_cast(*_First); - if (_Ch < 0b1000'0000u) { - return 1; +inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11 + 0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu}; + +template +_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { + // Computes the width estimation for Unicode characters from N4885 [format.string.std]/11 + int _Result = 1; + for (const auto& _Bound : _Bounds) { + if (_Ch < _Bound) { + return _Result; + } + _Result ^= 1; } - const auto _Len = static_cast(_Last - _First); + return 1; +} + +template +class _Fmt_codec; + +template +class _Fmt_codec_base {}; - if (_Ch < 0b1110'0000u) { - // check for non-lead byte or partial 2-byte encoded character - return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1; +template <> +class _Fmt_codec_base { +protected: + _Cvtvec _Cvt; + + _NODISCARD int _Double_byte_encoding_code_units_in_next_character( + const char* const _First, const char* const _Last) const { + // Returns a count of the number of code units that compose the first encoded character in [_First, _Last), + // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. + wchar_t _Wide; + mbstate_t _St{}; + const auto _Len = static_cast(_Last - _First); + const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); + if (_Result > 0) { + return _Result; + } else if (_Result < 0) { // invalid or incomplete encoded character + return -1; + } else { // next code unit is '\0' + return 1; + } } - if (_Ch < 0b1111'0000u) { - // check for partial 3-byte encoded character - return (_Len >= 3) ? 3 : -1; + _Fmt_codec_base() { + if (__std_get_cvt(__std_code_page::_Acp, &_Cvt) != __std_win_error::_Success) { + _THROW(format_error("FIXME")); + } } +}; - // check for partial 4-byte encoded character - return (_Len >= 4) ? 4 : -1; -} +template +class _Fmt_codec : private _Fmt_codec_base<_Statically_Utf8> { +private: + _NODISCARD static constexpr int _Utf8_code_units_in_next_character( + const char* const _First, const char* const _Last) noexcept { + // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, + // _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead + // byte. + const auto _Ch = static_cast(*_First); + if (_Ch < 0b1000'0000u) { + return 1; + } -_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character( - const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) { - // Returns a count of the number of code units that compose the first encoded character in [_First, _Last), - // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. - wchar_t _Wide; - mbstate_t _St{}; - const auto _Len = static_cast(_Last - _First); - const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); - if (_Result > 0) { - return _Result; - } else if (_Result < 0) { // invalid or incomplete encoded character - return -1; - } else { // next code unit is '\0' - return 1; - } -} + const auto _Len = static_cast(_Last - _First); -_NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) { - // Returns a count of the number of code units that compose the first encoded character in - // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or - // *_First is not a valid lead byte. - _STL_INTERNAL_CHECK(_First < _Last); + if (_Ch < 0b1110'0000u) { + // check for non-lead byte or partial 2-byte encoded character + return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1; + } - if constexpr (_Is_execution_charset_utf8_v) { - return _Utf8_code_units_in_next_character(_First, _Last); - } else { - switch (_Cvt._Mbcurmax) { + if (_Ch < 0b1111'0000u) { + // check for partial 3-byte encoded character + return (_Len >= 3) ? 3 : -1; + } + + // check for partial 4-byte encoded character + return (_Len >= 4) ? 4 : -1; + } + + _NODISCARD static constexpr int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + auto _Ch = static_cast(*_Ptr); + switch (_Units) { default: - _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page"); - [[fallthrough]]; case 1: - return 1; // all characters have only one code unit - case 2: - return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt); + return 1; + case 3: + _Ch &= 0b1111u; + break; + case 4: + _Ch &= 0b111u; + break; + } + + for (int _Idx = 1; _Idx < _Units; ++_Idx) { + _Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u); + } - case 4: // Assume UTF-8 (as does _Mbrtowc) + if (_Units == 3) { + return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); + } + + return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); + } + +public: + _NODISCARD int _Units_in_next_character(const char* _First, const char* _Last) const noexcept { + // Returns a count of the number of code units that compose the first encoded character in + // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or + // *_First is not a valid lead byte. + _STL_INTERNAL_CHECK(_First < _Last); + + if constexpr (_Statically_Utf8) { return _Utf8_code_units_in_next_character(_First, _Last); + } else { + switch (this->_Cvt._Mbcurmax) { + default: + _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page"); + [[fallthrough]]; + case 1: + return 1; // all characters have only one code unit + + case 2: + return this->_Double_byte_encoding_code_units_in_next_character(_First, _Last); + + case 4: // Assume UTF-8 (as does _Mbrtowc) + return _Utf8_code_units_in_next_character(_First, _Last); + } } } -} -_NODISCARD inline int _Code_units_in_next_character(const wchar_t* _First, const wchar_t* _Last, const _Cvtvec&) { - // Returns a count of the number of code units that compose the first encoded character in - // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or - // *_First is an unpaired surrogate. - _STL_INTERNAL_CHECK(_First < _Last); + _NODISCARD const char* _Find_encoded(const char* _First, const char* _Last, const char _Val) const { + // Returns the first occurrence of _Val as an encoded character (and not, for example, as a + // continuation byte) in [_First, _Last). + if constexpr (_Statically_Utf8) { + return _Find_unchecked(_First, _Last, _Val); + } else { + if (this->_Cvt._Mbcurmax == 1 || this->_Cvt._Mbcurmax == 4) { + // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8 + return _Find_unchecked(_First, _Last, _Val); + } - if (*_First < 0xD800u || *_First >= 0xE000u) { - return 1; + while (_First != _Last && *_First != _Val) { + const int _Units = _Units_in_next_character(_First, _Last); + if (_Units < 0) { + _THROW(format_error("Invalid encoded character in format string.")); + } + _First += _Units; + } + + return _First; + } } - if (*_First >= 0xDC00u) { // unpaired low surrogate - return -1; + _NODISCARD int _Estimate_width(const char* const _Ptr, const int _Units) const { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + if constexpr (_Statically_Utf8) { + return _Estimate_utf8_character_width(_Ptr, _Units); + } else { + if (this->_Cvt._Mbcurmax != 4) { + // not a Unicode encoding; estimate width == number of code units + return _Units; + } + + // assume UTF-8 + return _Estimate_utf8_character_width(_Ptr, _Units); + } } +}; - if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate - return -1; +template +class _Fmt_codec { +private: +public: + _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* _Last) const noexcept { + // Returns a count of the number of code units that compose the first encoded character in + // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or + // *_First is an unpaired surrogate. + _STL_INTERNAL_CHECK(_First < _Last); + + if (*_First < 0xD800u || *_First >= 0xE000u) { + return 1; + } + + if (*_First >= 0xDC00u) { // unpaired low surrogate + return -1; + } + + if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate + return -1; + } + + return 2; // surrogate pair } - return 2; // surrogate pair -} + _NODISCARD const wchar_t* _Find_encoded(const wchar_t* _First, const wchar_t* _Last, const wchar_t _Val) const { + // Returns the first occurrence of _Val as an encoded character (and not, for example, as a + // continuation byte) in [_First, _Last). + return _Find_unchecked(_First, _Last, _Val); + } + + _NODISCARD int _Estimate_width(const wchar_t* const _Ptr, const int _Units) const { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + auto _Ch = static_cast(*_Ptr); + if (_Units == 1) { + return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); + } + + // surrogate pair + _Ch = (_Ch - 0xD8000u) << 10; + _Ch += static_cast(_Ptr[1]) - 0xDC00u; + _Ch += 0x10000u; + return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); + } +}; template _Callbacks_type> _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End, _Callbacks_type&& _Callbacks) { @@ -540,7 +680,7 @@ _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End, _STL_INTERNAL_CHECK(_Begin != _End && *_Begin != '}'); auto _Parsed_align = _Fmt_align::_None; - const int _Units = _Code_units_in_next_character(_Begin, _End, _Getcvt()); + const int _Units = _Fmt_codec<_CharT>{}._Units_in_next_character(_Begin, _End); if (_Units < 0) { // invalid fill character encoding _THROW(format_error("Invalid format string.")); } @@ -800,44 +940,19 @@ _NODISCARD constexpr const _CharT* _Parse_replacement_field( return _Begin + 1; } -template -_NODISCARD const _CharT* _Find_encoded( - const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) { - // Returns the first occurrence of _Val as an encoded character (and not, for example, as a - // continuation byte) in [_First, _Last). - if constexpr (_Is_execution_charset_utf8_v) { - return _Find_unchecked(_First, _Last, _Val); - } else { - if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) { - // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8 - return _Find_unchecked(_First, _Last, _Val); - } - - while (_First != _Last && *_First != _Val) { - const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt); - if (_Units < 0) { - _THROW(format_error("Invalid encoded character in format string.")); - } - _First += _Units; - } - - return _First; - } -} - template _HandlerT> void _Parse_format_string(basic_string_view<_CharT> _Format_str, _HandlerT&& _Handler) { - auto _Begin = _Format_str.data(); - auto _End = _Begin + _Format_str.size(); - const _Cvtvec& _Cvt = _Getcvt(); + auto _Begin = _Format_str.data(); + auto _End = _Begin + _Format_str.size(); + const _Fmt_codec<_CharT> _Counter{}; while (_Begin != _End) { const _CharT* _OpeningCurl = _Begin; if (*_Begin != '{') { - _OpeningCurl = _Find_encoded(_Begin, _End, _CharT{'{'}, _Cvt); + _OpeningCurl = _Counter._Find_encoded(_Begin, _End, _CharT{'{'}); for (;;) { - const _CharT* _ClosingCurl = _Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}, _Cvt); + const _CharT* _ClosingCurl = _Counter._Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}); // In this case there are neither closing nor opening curls in [_Begin, _OpeningCurl) // Write the whole thing out. @@ -2214,95 +2329,15 @@ _NODISCARD _OutputIt _Fmt_write( return _Fmt_write(_STD move(_Out), basic_string_view<_CharT>{_Value}, _Specs, _Locale); } -inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11 - 0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u, - 0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u}; - -inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11 - 0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu}; - -template -_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { - // Computes the width estimation for Unicode characters from N4885 [format.string.std]/11 - int _Result = 1; - for (const auto& _Bound : _Bounds) { - if (_Ch < _Bound) { - return _Result; - } - _Result ^= 1; - } - - return 1; -} - -_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { - // Return an estimate for the width of the character composed of _Units code units, - // whose first code unit is denoted by _Ptr. - auto _Ch = static_cast(*_Ptr); - switch (_Units) { - default: - case 1: - case 2: - return 1; - case 3: - _Ch &= 0b1111u; - break; - case 4: - _Ch &= 0b111u; - break; - } - - for (int _Idx = 1; _Idx < _Units; ++_Idx) { - _Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u); - } - - if (_Units == 3) { - return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); - } - - return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); -} - -_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) { - // Return an estimate for the width of the character composed of _Units code units, - // whose first code unit is denoted by _Ptr. - if constexpr (_Is_execution_charset_utf8_v) { - return _Estimate_utf8_character_width(_Ptr, _Units); - } else { - if (_Cvt._Mbcurmax != 4) { - // not a Unicode encoding; estimate width == number of code units - return _Units; - } - - // assume UTF-8 - return _Estimate_utf8_character_width(_Ptr, _Units); - } -} - -_NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) { - // Return an estimate for the width of the character composed of _Units code units, - // whose first code unit is denoted by _Ptr. - auto _Ch = static_cast(*_Ptr); - if (_Units == 1) { - return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); - } - - // surrogate pair - _Ch = (_Ch - 0xD8000u) << 10; - _Ch += static_cast(_Ptr[1]) - 0xDC00u; - _Ch += 0x10000u; - return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); -} - template _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT> _Value, int& _Width) { // Returns a pointer past-the-end of the largest prefix of _Value that fits in _Width, or all // of _Value if _Width is negative. Updates _Width to the estimated width of that prefix. - const int _Max_width = _Width; - auto _Pos = _Value.data(); - const auto _Last = _Pos + _Value.size(); - int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos) - const _Cvtvec& _Cvt = _Getcvt(); + const int _Max_width = _Width; + auto _Pos = _Value.data(); + const auto _Last = _Pos + _Value.size(); + int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos) + const _Fmt_codec<_CharT> _Codec; constexpr auto _Max_int = (numeric_limits::max)(); while (_Pos != _Last) { @@ -2312,8 +2347,8 @@ _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT> } // TRANSITION, extended grapheme clustering - const int _Units = _Code_units_in_next_character(_Pos, _Last, _Cvt); - const int _Character_width = _Estimate_character_width(_Pos, _Units, _Cvt); + const int _Units = _Codec._Units_in_next_character(_Pos, _Last); + const int _Character_width = _Codec._Estimate_width(_Pos, _Units); if (_Max_int - _Character_width < _Estimated_width) { // avoid overflow // Either _Max_width isn't set, or adding this character will exceed it. diff --git a/stl/inc/xfilesystem_abi.h b/stl/inc/xfilesystem_abi.h index 052af6cc93..f0b456bd69 100644 --- a/stl/inc/xfilesystem_abi.h +++ b/stl/inc/xfilesystem_abi.h @@ -205,7 +205,7 @@ _BITMASK_OPS(__std_fs_file_flags) enum class __std_fs_file_handle : intptr_t { _Invalid = -1 }; -enum class __std_code_page : unsigned int { _Utf8 = 65001 }; +enum class __std_code_page : unsigned int { _Acp = 0, _Utf8 = 65001 }; struct __std_fs_convert_result { int _Len; diff --git a/stl/src/format.cpp b/stl/src/format.cpp new file mode 100644 index 0000000000..03bee36200 --- /dev/null +++ b/stl/src/format.cpp @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +#include + +static_assert(__std_code_page::_Acp == __std_code_page{CP_ACP}); + +// CODEPAGE is conditionally defined so tests can override it. +#ifndef CODEPAGE +#define CODEPAGE(x) static_cast(x) +#endif + +extern "C" [[nodiscard]] __std_win_error __stdcall __std_get_cvt( + const __std_code_page _Codepage, _Cvtvec* const _Pcvt) noexcept { + // get conversion info for an arbitrary codepage + *_Pcvt = {}; + + CPINFOEXW _Info{}; + const DWORD _Flags = 0; // reserved, must be zero + if (GetCPInfoExW(CODEPAGE(_Codepage), _Flags, &_Info) == 0) { + return __std_win_error{GetLastError()}; + } + + _Pcvt->_Page = _Info.CodePage; + _Pcvt->_Mbcurmax = _Info.MaxCharSize; + + for (int _Idx = 0; _Idx < MAX_LEADBYTES; _Idx += 2) { + if (_Info.LeadByte[_Idx] == 0 && _Info.LeadByte[_Idx + 1] == 0) { + break; + } + + for (unsigned char _First = _Info.LeadByte[_Idx], _Last = _Info.LeadByte[_Idx + 1]; _First != _Last; ++_First) { + _Pcvt->_Isleadbyte[_First >> 3] |= 1u << (_First & 0b111u); + } + } + + return __std_win_error::_Success; +} diff --git a/tests/std/include/test_format_support.hpp b/tests/std/include/test_format_support.hpp index 89dd541e85..77cc36eb85 100644 --- a/tests/std/include/test_format_support.hpp +++ b/tests/std/include/test_format_support.hpp @@ -117,9 +117,10 @@ void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback callback_type&& callbacks = {}) { try { auto end = func(view.data(), view.data() + view.size(), std::move(callbacks)); - if (expected_end_position != std::basic_string_view::npos) { - assert(end == view.data() + expected_end_position); + if (expected_end_position == std::basic_string_view::npos) { + expected_end_position = view.size(); } + assert(end == view.data() + expected_end_position); assert(!err_expected); } catch (const std::format_error&) { assert(err_expected); diff --git a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp index ab37ef41f5..abee200168 100644 --- a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp @@ -975,31 +975,6 @@ void test_size() { test_size_helper(8, STR("{:8}"), STR("scully")); } -void test_multibyte_format_strings() { -#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8" - { - assert(setlocale(LC_ALL, ".UTF-8") != nullptr); - // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL) - assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32"); - - assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv); - } - - { - assert(setlocale(LC_ALL, ".UTF-8") != nullptr); - try { - (void) format("{:\x9f\x8f\x88<10}"sv, 42); // Bad fill character encoding: missing lead byte before \x9f - assert(false); - } catch (const format_error&) { - } - } -#endif // MSVC_INTERNAL_TESTING - - assert(setlocale(LC_ALL, "C") != nullptr); -} - // The libfmt_ tests are derived from tests in // libfmt, Copyright (c) 2012 - present, Victor Zverovich // See NOTICE.txt for more information. @@ -1318,8 +1293,6 @@ void test() { test_size(); test_size(); - test_multibyte_format_strings(); - libfmt_formatter_test_escape(); libfmt_formatter_test_escape(); diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst index 9aa5f2a5cd..8be668471a 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# This is `concepts_matrix.lst` with `/execution-charset:.932` added. +# This is `concepts_matrix.lst` with `/execution-charset:.932` added, and with +# `/Za` configurations disabled to support inclusion of ``. # clang is excluded since it doesn't support non-UTF-8 execution charsets. RUNALL_INCLUDE ..\prefix.lst @@ -21,7 +22,7 @@ PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /fp:strict" PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive" PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /analyze:only /analyze:autolog-" -PM_CL="/permissive- /Za /MD" -PM_CL="/permissive- /Za /MDd" +# PM_CL="/permissive- /Za /MD" +# PM_CL="/permissive- /Za /MDd" # PM_CL="/permissive- /BE /c /MD" # PM_CL="/permissive- /BE /c /MTd" diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index beb4a499c6..4a2a2e8057 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -8,55 +8,48 @@ #include "test_format_support.hpp" +#pragma warning(disable : 5105) // macro expansion producing 'defined' has undefined behavior +#define CODEPAGE(x) (static_cast(x) ? static_cast(x) : 932u) +#include "../../../../stl/src/format.cpp" + using namespace std; void test_multibyte_format_strings() { - { - assert(setlocale(LC_ALL, ".932") != nullptr); - const auto s = - "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) - assert(format(s) == s); - - assert(format("{:.2}", s) == "\x93\xfa"sv); - assert(format("{:4.2}", s) == "\x93\xfa "sv); + const auto s = "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) + assert(format(s) == s); - assert(format("{:<4.2}", s) == "\x93\xfa "sv); - assert(format("{:^4.2}", s) == " \x93\xfa "sv); - assert(format("{:>4.2}", s) == " \x93\xfa"sv); + assert(format("{:.2}", s) == "\x93\xfa"sv); + assert(format("{:4.2}", s) == "\x93\xfa "sv); - assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv); - assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv); - assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv); + assert(format("{:<4.2}", s) == "\x93\xfa "sv); + assert(format("{:^4.2}", s) == " \x93\xfa "sv); + assert(format("{:>4.2}", s) == " \x93\xfa"sv); - assert(format("{:.3}", s) == "\x93\xfa"sv); - assert(format("{:4.3}", s) == "\x93\xfa "sv); + assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv); + assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv); + assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv); - assert(format("{:<4.3}", s) == "\x93\xfa "sv); - assert(format("{:^4.3}", s) == " \x93\xfa "sv); - assert(format("{:>4.3}", s) == " \x93\xfa"sv); + assert(format("{:.3}", s) == "\x93\xfa"sv); + assert(format("{:4.3}", s) == "\x93\xfa "sv); - assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv); - assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv); - assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); - } + assert(format("{:<4.3}", s) == "\x93\xfa "sv); + assert(format("{:^4.3}", s) == " \x93\xfa "sv); + assert(format("{:>4.3}", s) == " \x93\xfa"sv); - assert(setlocale(LC_ALL, "C") != nullptr); + assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv); + assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv); + assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); } void test_parse_align() { - auto parse_align_fn = _Parse_align>; - - { - assert(setlocale(LC_ALL, ".932") != nullptr); - test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, - {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv}); - test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3, - {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv}); - } - - assert(setlocale(LC_ALL, "C") != nullptr); + const auto parse_align_fn = _Parse_align>; + + test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, + {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv}); + test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3, + {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv}); } int main() { diff --git a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp index ddf204c0db..39fe7bd217 100644 --- a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp @@ -36,7 +36,7 @@ bool test_parse_align() { // \x343E (which is from CJK unified ideographs extension A) and similar characters to parse as // an alignment specifier. auto s4 = L"*\x343E"sv; - test_parse_helper(parse_align_fn, s4, false, view_typ::npos, {.expected_fill = L"*"sv}); + test_parse_helper(parse_align_fn, s4, false, 0, {.expected_fill = L"*"sv}); // test multi-code-unit fill characters { @@ -47,22 +47,6 @@ bool test_parse_align() { test_parse_helper(parse_align_fn, L"\U0001F3C8^X"sv, false, 3, {.expected_alignment = _Fmt_align::_Center, .expected_fill = L"\U0001F3C8"sv}); } - } else { - // test multibyte fill characters -#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8" - { - assert(setlocale(LC_ALL, ".UTF-8") != nullptr); - // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv}); - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv}); - } -#endif // MSVC_INTERNAL_TESTING - - assert(setlocale(LC_ALL, "C") != nullptr); } return true; diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/env.lst b/tests/std/tests/P0645R10_text_formatting_utf8/env.lst index 42da0946d2..d5bd9f429e 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/env.lst +++ b/tests/std/tests/P0645R10_text_formatting_utf8/env.lst @@ -1,6 +1,30 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -RUNALL_INCLUDE ..\concepts_matrix.lst +# This is `concepts_matrix.lst` with `/utf-8` added, and with +# `/Za` configurations disabled to support inclusion of ``. +# clang is excluded since it doesn't support non-UTF-8 execution charsets. + +RUNALL_INCLUDE ..\prefix.lst +RUNALL_CROSSLIST +PM_CL="/w14640 /Zc:threadSafeInit- /EHsc /std:c++latest" +RUNALL_CROSSLIST +PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:noexceptTypes-" +PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:char8_t- /Zc:preprocessor" +PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:wchar_t-" +PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /fp:except /Zc:preprocessor" +PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive-" +PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /analyze:only /analyze:autolog-" +PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /fp:strict" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /analyze:only /analyze:autolog-" +# PM_CL="/permissive- /Za /MD" +# PM_CL="/permissive- /Za /MDd" +# PM_CL="/permissive- /BE /c /MD" +# PM_CL="/permissive- /BE /c /MTd" RUNALL_CROSSLIST PM_CL="/utf-8" diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index 5b98879423..f071e56642 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -8,6 +8,10 @@ #include "test_format_support.hpp" +#pragma warning(disable : 5105) // macro expansion producing 'defined' has undefined behavior +#define CODEPAGE(x) (static_cast(x) ? static_cast(x) : 65001u) +#include "../../../../stl/src/format.cpp" + using namespace std; void test_multibyte_format_strings() { From eaa454f39e7300d90bf1fde516bec954f65f1bc5 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Wed, 14 Apr 2021 08:39:44 -0700 Subject: [PATCH 2/7] More hygienic testing method --- stl/inc/format | 8 ++++-- stl/src/format.cpp | 2 ++ .../env.lst | 7 +++-- .../test.cpp | 6 ++--- .../P0645R10_text_formatting_utf8/env.lst | 26 +------------------ .../P0645R10_text_formatting_utf8/test.cpp | 6 ++--- 6 files changed, 16 insertions(+), 39 deletions(-) diff --git a/stl/inc/format b/stl/inc/format index dea6e05e8c..b8e193a55a 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -498,9 +498,13 @@ protected: } _Fmt_codec_base() { - if (__std_get_cvt(__std_code_page::_Acp, &_Cvt) != __std_win_error::_Success) { - _THROW(format_error("FIXME")); +#ifndef _FORMAT_CODEPAGE +#define _FORMAT_CODEPAGE __std_code_page::_Acp +#endif // _FORMAT_CODEPAGE + if (__std_get_cvt(_FORMAT_CODEPAGE, &_Cvt) != __std_win_error::_Success) { + _THROW(format_error("failed to retrieve codepage info")); } +#undef _FORMAT_CODEPAGE } }; diff --git a/stl/src/format.cpp b/stl/src/format.cpp index 03bee36200..98966a180e 100644 --- a/stl/src/format.cpp +++ b/stl/src/format.cpp @@ -21,6 +21,8 @@ extern "C" [[nodiscard]] __std_win_error __stdcall __std_get_cvt( CPINFOEXW _Info{}; const DWORD _Flags = 0; // reserved, must be zero if (GetCPInfoExW(CODEPAGE(_Codepage), _Flags, &_Info) == 0) { + // NB: the only documented failure mode for GetCPInfoExW is ERROR_INVALID_PARAMETER, + // so in practice it should never fail for CP_ACP. return __std_win_error{GetLastError()}; } diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst index 8be668471a..9aa5f2a5cd 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst @@ -1,8 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# This is `concepts_matrix.lst` with `/execution-charset:.932` added, and with -# `/Za` configurations disabled to support inclusion of ``. +# This is `concepts_matrix.lst` with `/execution-charset:.932` added. # clang is excluded since it doesn't support non-UTF-8 execution charsets. RUNALL_INCLUDE ..\prefix.lst @@ -22,7 +21,7 @@ PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /fp:strict" PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive" PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /analyze:only /analyze:autolog-" -# PM_CL="/permissive- /Za /MD" -# PM_CL="/permissive- /Za /MDd" +PM_CL="/permissive- /Za /MD" +PM_CL="/permissive- /Za /MDd" # PM_CL="/permissive- /BE /c /MD" # PM_CL="/permissive- /BE /c /MTd" diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index 4a2a2e8057..9ed6a591d8 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#define _FORMAT_CODEPAGE (__std_code_page{932}) + #include #include #include @@ -8,10 +10,6 @@ #include "test_format_support.hpp" -#pragma warning(disable : 5105) // macro expansion producing 'defined' has undefined behavior -#define CODEPAGE(x) (static_cast(x) ? static_cast(x) : 932u) -#include "../../../../stl/src/format.cpp" - using namespace std; void test_multibyte_format_strings() { diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/env.lst b/tests/std/tests/P0645R10_text_formatting_utf8/env.lst index d5bd9f429e..42da0946d2 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/env.lst +++ b/tests/std/tests/P0645R10_text_formatting_utf8/env.lst @@ -1,30 +1,6 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# This is `concepts_matrix.lst` with `/utf-8` added, and with -# `/Za` configurations disabled to support inclusion of ``. -# clang is excluded since it doesn't support non-UTF-8 execution charsets. - -RUNALL_INCLUDE ..\prefix.lst -RUNALL_CROSSLIST -PM_CL="/w14640 /Zc:threadSafeInit- /EHsc /std:c++latest" -RUNALL_CROSSLIST -PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:noexceptTypes-" -PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" -PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:char8_t- /Zc:preprocessor" -PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:wchar_t-" -PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" -PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /fp:except /Zc:preprocessor" -PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive-" -PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /analyze:only /analyze:autolog-" -PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" -PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /fp:strict" -PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" -PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive" -PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /analyze:only /analyze:autolog-" -# PM_CL="/permissive- /Za /MD" -# PM_CL="/permissive- /Za /MDd" -# PM_CL="/permissive- /BE /c /MD" -# PM_CL="/permissive- /BE /c /MTd" +RUNALL_INCLUDE ..\concepts_matrix.lst RUNALL_CROSSLIST PM_CL="/utf-8" diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index f071e56642..2fa79b83e0 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#define _FORMAT_CODEPAGE __std_code_page::_Utf8 + #include #include #include @@ -8,10 +10,6 @@ #include "test_format_support.hpp" -#pragma warning(disable : 5105) // macro expansion producing 'defined' has undefined behavior -#define CODEPAGE(x) (static_cast(x) ? static_cast(x) : 65001u) -#include "../../../../stl/src/format.cpp" - using namespace std; void test_multibyte_format_strings() { From 9494a9cc80121e766cacb0d07652128d14bfb9a7 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Wed, 14 Apr 2021 11:35:38 -0700 Subject: [PATCH 3/7] Review comments --- stl/inc/format | 17 +++++++---------- stl/msbuild/stl_base/stl.files.settings.targets | 1 + stl/src/format.cpp | 14 ++++++++------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/stl/inc/format b/stl/inc/format index b8e193a55a..fb942333a3 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -63,8 +63,7 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new -extern "C" _NODISCARD __std_win_error __stdcall __std_get_cvt( - const __std_code_page _Codepage, _Cvtvec* const _Pcvt) noexcept; +extern "C" _NODISCARD __std_win_error __stdcall __std_get_cvt(__std_code_page _Codepage, _Cvtvec* _Pcvt) noexcept; _STD_BEGIN @@ -501,9 +500,8 @@ protected: #ifndef _FORMAT_CODEPAGE #define _FORMAT_CODEPAGE __std_code_page::_Acp #endif // _FORMAT_CODEPAGE - if (__std_get_cvt(_FORMAT_CODEPAGE, &_Cvt) != __std_win_error::_Success) { - _THROW(format_error("failed to retrieve codepage info")); - } + [[maybe_unused]] const __std_win_error _Result = __std_get_cvt(_FORMAT_CODEPAGE, &_Cvt); + _STL_INTERNAL_CHECK(_Result == __std_win_error::_Success); #undef _FORMAT_CODEPAGE } }; @@ -537,7 +535,7 @@ private: return (_Len >= 4) ? 4 : -1; } - _NODISCARD static constexpr int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { + _NODISCARD static int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { // Return an estimate for the width of the character composed of _Units code units, // whose first code unit is denoted by _Ptr. auto _Ch = static_cast(*_Ptr); @@ -633,7 +631,6 @@ public: template class _Fmt_codec { -private: public: _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* _Last) const noexcept { // Returns a count of the number of code units that compose the first encoded character in @@ -948,15 +945,15 @@ template _HandlerT> void _Parse_format_string(basic_string_view<_CharT> _Format_str, _HandlerT&& _Handler) { auto _Begin = _Format_str.data(); auto _End = _Begin + _Format_str.size(); - const _Fmt_codec<_CharT> _Counter{}; + const _Fmt_codec<_CharT> _Codec; while (_Begin != _End) { const _CharT* _OpeningCurl = _Begin; if (*_Begin != '{') { - _OpeningCurl = _Counter._Find_encoded(_Begin, _End, _CharT{'{'}); + _OpeningCurl = _Codec._Find_encoded(_Begin, _End, _CharT{'{'}); for (;;) { - const _CharT* _ClosingCurl = _Counter._Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}); + const _CharT* _ClosingCurl = _Codec._Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}); // In this case there are neither closing nor opening curls in [_Begin, _OpeningCurl) // Write the whole thing out. diff --git a/stl/msbuild/stl_base/stl.files.settings.targets b/stl/msbuild/stl_base/stl.files.settings.targets index 05ff1f5ddb..15aa0fbc46 100644 --- a/stl/msbuild/stl_base/stl.files.settings.targets +++ b/stl/msbuild/stl_base/stl.files.settings.targets @@ -161,6 +161,7 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception (controlled by IncludeInLink and IncludeInImportLib). --> (_Codepage), _Flags, &_Info) == 0) { // NB: the only documented failure mode for GetCPInfoExW is ERROR_INVALID_PARAMETER, // so in practice it should never fail for CP_ACP. return __std_win_error{GetLastError()}; From 799a55a525fd9c5e083f1b6c2b9dbc55561956a7 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Thu, 15 Apr 2021 14:20:43 -0700 Subject: [PATCH 4/7] Remove confusing comment --- stl/inc/format | 2 -- 1 file changed, 2 deletions(-) diff --git a/stl/inc/format b/stl/inc/format index fb942333a3..ed5c45dae7 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -654,8 +654,6 @@ public: } _NODISCARD const wchar_t* _Find_encoded(const wchar_t* _First, const wchar_t* _Last, const wchar_t _Val) const { - // Returns the first occurrence of _Val as an encoded character (and not, for example, as a - // continuation byte) in [_First, _Last). return _Find_unchecked(_First, _Last, _Val); } From c6efbdba4d3b69a1fc06af86a03f63cb9536cb96 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Fri, 16 Apr 2021 11:07:10 -0700 Subject: [PATCH 5/7] Add test coverage for width estimation ... fix 2 bugs exposed by that coverage, and address review comments. Drive-by: Remove the `_FORMAT_CODEPAGE` override from the `formatting_utf8` test which is unnecessary since we compile it with `/utf-8`. --- stl/inc/format | 11 +- .../test.cpp | 29 +++++ .../P0645R10_text_formatting_utf8/test.cpp | 119 +++++++++++++++--- 3 files changed, 140 insertions(+), 19 deletions(-) diff --git a/stl/inc/format b/stl/inc/format index ed5c45dae7..4275f77a71 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -462,7 +462,7 @@ _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { if (_Ch < _Bound) { return _Result; } - _Result ^= 1; + _Result ^= 0b11u; // Flip between 1 and 2 on each iteration } return 1; @@ -564,7 +564,7 @@ private: } public: - _NODISCARD int _Units_in_next_character(const char* _First, const char* _Last) const noexcept { + _NODISCARD int _Units_in_next_character(const char* const _First, const char* const _Last) const noexcept { // Returns a count of the number of code units that compose the first encoded character in // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or // *_First is not a valid lead byte. @@ -589,7 +589,7 @@ public: } } - _NODISCARD const char* _Find_encoded(const char* _First, const char* _Last, const char _Val) const { + _NODISCARD const char* _Find_encoded(const char* _First, const char* const _Last, const char _Val) const { // Returns the first occurrence of _Val as an encoded character (and not, for example, as a // continuation byte) in [_First, _Last). if constexpr (_Statically_Utf8) { @@ -632,7 +632,7 @@ public: template class _Fmt_codec { public: - _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* _Last) const noexcept { + _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* const _Last) const noexcept { // Returns a count of the number of code units that compose the first encoded character in // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or // *_First is an unpaired surrogate. @@ -653,7 +653,8 @@ public: return 2; // surrogate pair } - _NODISCARD const wchar_t* _Find_encoded(const wchar_t* _First, const wchar_t* _Last, const wchar_t _Val) const { + _NODISCARD const wchar_t* _Find_encoded( + const wchar_t* const _First, const wchar_t* const _Last, const wchar_t _Val) const { return _Find_unchecked(_First, _Last, _Val); } diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index 9ed6a591d8..bd81958f9c 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -50,7 +50,36 @@ void test_parse_align() { {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv}); } +void test_width_estimation() { + // Format strings of known width with a trailing delimiter using a precision large enough to + // include all but the delimiter to validate the width estimation code. + struct test_case { + const char* str; + int width; + }; + constexpr test_case test_cases[] = { + // Pick a "short" and "long" codepoints (\x20 and \x96\x7b), then form + // all permutations of 3-codepoint prefixes. This gives us coverage of + // all transitions (e.g. short-to-long, long-to-long). + {"\x20\x20\x20\x58", 4}, + {"\x20\x20\x96\x7b\x58", 5}, + {"\x20\x96\x7b\x20\x58", 5}, + {"\x96\x7b\x20\x20\x58", 5}, + {"\x20\x96\x7b\x96\x7b\x58", 6}, + {"\x96\x7b\x20\x96\x7b\x58", 6}, + {"\x96\x7b\x96\x7b\x20\x58", 6}, + {"\x96\x7b\x96\x7b\x96\x7b\x58", 7}, + }; + + for (const auto& test : test_cases) { + basic_string_view sv{test.str}; + sv = sv.substr(0, sv.size() - 1); + assert(format("{:.{}}", test.str, test.width - 1) == sv); + } +} + int main() { test_multibyte_format_strings(); test_parse_align(); + test_width_estimation(); } diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index 2fa79b83e0..9f33b368de 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#define _FORMAT_CODEPAGE __std_code_page::_Utf8 - #include #include #include @@ -14,12 +12,12 @@ using namespace std; void test_multibyte_format_strings() { { - // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL) - assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32"); + // Filling with footballs ("\U0001f3c8" is U+1F3C8 AMERICAN FOOTBALL) + assert(format("{:\U0001f3c8>4}"sv, 42) == "\U0001f3c8\U0001f3c8\x34\x32"); - assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv); + assert(format("{:\U0001f3c8<4.2}", "1") == "\x31\U0001f3c8\U0001f3c8\U0001f3c8"sv); + assert(format("{:\U0001f3c8^4.2}", "1") == "\U0001f3c8\x31\U0001f3c8\U0001f3c8"sv); + assert(format("{:\U0001f3c8>4.2}", "1") == "\U0001f3c8\U0001f3c8\U0001f3c8\x31"sv); } { @@ -35,19 +33,112 @@ void test_parse_align() { auto parse_align_fn = _Parse_align>; { - // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv}); - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv}); + test_parse_helper(parse_align_fn, "\U0001f3c8X"sv, false, 5, + {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\U0001f3c8"sv}); + test_parse_helper(parse_align_fn, "\U0001f3c8^X"sv, false, 5, + {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\U0001f3c8"sv}); + } +} + +template +void test_width_estimation() { + // Format strings of known width with a trailing delimiter using a precision large enough to + // include all but the delimiter to validate the width estimation code. + struct test_case { + const CharT* str; + int width; + }; + constexpr test_case test_cases[] = { + {TYPED_LITERAL(CharT, "\x58"), 1}, + {TYPED_LITERAL(CharT, "x\x58"), 2}, + // test the boundaries of the intervals defined in n4885 [format.str.std]/11 + {TYPED_LITERAL(CharT, "\u10ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\u1100\x58"), 3}, + {TYPED_LITERAL(CharT, "\u115f\x58"), 3}, + {TYPED_LITERAL(CharT, "\u1160\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2328\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2329\x58"), 3}, + {TYPED_LITERAL(CharT, "\u232a\x58"), 3}, + {TYPED_LITERAL(CharT, "\u232b\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2e7f\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2e80\x58"), 3}, + {TYPED_LITERAL(CharT, "\u303e\x58"), 3}, + {TYPED_LITERAL(CharT, "\u303f\x58"), 2}, + {TYPED_LITERAL(CharT, "\u3040\x58"), 3}, + {TYPED_LITERAL(CharT, "\ua4cf\x58"), 3}, + {TYPED_LITERAL(CharT, "\ua4d0\x58"), 2}, + {TYPED_LITERAL(CharT, "\uabff\x58"), 2}, + {TYPED_LITERAL(CharT, "\uac00\x58"), 3}, + {TYPED_LITERAL(CharT, "\ud7a3\x58"), 3}, + {TYPED_LITERAL(CharT, "\ud7a4\x58"), 2}, + {TYPED_LITERAL(CharT, "\ud7ff\x58"), 2}, + // skip over the surrogate pair range (\ud800-\udfff) + {TYPED_LITERAL(CharT, "\ue000\x58"), 2}, + {TYPED_LITERAL(CharT, "\uf8ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\uf900\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufaff\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufb00\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe0f\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe10\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe19\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe1a\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe2f\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe30\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe6f\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe70\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufeff\x58"), 2}, + {TYPED_LITERAL(CharT, "\uff00\x58"), 3}, + {TYPED_LITERAL(CharT, "\uff60\x58"), 3}, + {TYPED_LITERAL(CharT, "\uff61\x58"), 2}, + {TYPED_LITERAL(CharT, "\uffdf\x58"), 2}, + {TYPED_LITERAL(CharT, "\uffe0\x58"), 3}, + {TYPED_LITERAL(CharT, "\uffe6\x58"), 3}, + {TYPED_LITERAL(CharT, "\uffe7\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f2ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f300\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001f64f\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001f650\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f8ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f900\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001f9ff\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001fa00\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001ffff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U00020000\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0002fffd\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0002fffe\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0002ffff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U00030000\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0003fffd\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2}, + + // Pick a "short" and "long" codepoints (\u2000 and \ufe40), then form + // all permutations of 3-codepoint prefixes. This gives us coverage of + // all transitions (e.g. short-to-long, long-to-long). + {TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4}, + {TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5}, + {TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5}, + {TYPED_LITERAL(CharT, "\ufe40\u2000\u2000\x58"), 5}, + {TYPED_LITERAL(CharT, "\u2000\ufe40\ufe40\x58"), 6}, + {TYPED_LITERAL(CharT, "\ufe40\u2000\ufe40\x58"), 6}, + {TYPED_LITERAL(CharT, "\ufe40\ufe40\u2000\x58"), 6}, + {TYPED_LITERAL(CharT, "\ufe40\ufe40\ufe40\x58"), 7}, + }; + + for (const auto& test : test_cases) { + basic_string_view sv{test.str}; + sv = sv.substr(0, sv.size() - 1); + assert(format(TYPED_LITERAL(CharT, "{:.{}}"), test.str, test.width - 1) == sv); } } void run_tests() { test_multibyte_format_strings(); test_parse_align(); + test_width_estimation(); + test_width_estimation(); } int main() { From 16c599c712bb99400ade37f705a84e20965ad64f Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Fri, 16 Apr 2021 13:14:38 -0700 Subject: [PATCH 6/7] *sigh* commit the fix for the second bug --- stl/inc/format | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/format b/stl/inc/format index 4275f77a71..d09850c03f 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -667,7 +667,7 @@ public: } // surrogate pair - _Ch = (_Ch - 0xD8000u) << 10; + _Ch = (_Ch - 0xD800u) << 10; _Ch += static_cast(_Ptr[1]) - 0xDC00u; _Ch += 0x10000u; return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); From 3f2944f6860bb8ce9ea19c1db899a358a0bc8825 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Fri, 16 Apr 2021 15:15:50 -0700 Subject: [PATCH 7/7] More review comments --- .../test.cpp | 11 +++++++---- .../std/tests/P0645R10_text_formatting_utf8/test.cpp | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index bd81958f9c..081eecd7ce 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -58,9 +58,12 @@ void test_width_estimation() { int width; }; constexpr test_case test_cases[] = { - // Pick a "short" and "long" codepoints (\x20 and \x96\x7b), then form - // all permutations of 3-codepoint prefixes. This gives us coverage of - // all transitions (e.g. short-to-long, long-to-long). + {"\x58", 1}, + {"x\x58", 2}, + + // Pick "short" and "long" codepoints (\x20 and \x96\x7b), then form all permutations of + // 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of + // all adjacent pairings (short/short, short/long, long/short, long/long). {"\x20\x20\x20\x58", 4}, {"\x20\x20\x96\x7b\x58", 5}, {"\x20\x96\x7b\x20\x58", 5}, @@ -72,7 +75,7 @@ void test_width_estimation() { }; for (const auto& test : test_cases) { - basic_string_view sv{test.str}; + string_view sv{test.str}; sv = sv.substr(0, sv.size() - 1); assert(format("{:.{}}", test.str, test.width - 1) == sv); } diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index 9f33b368de..08a70afaac 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -53,7 +53,8 @@ void test_width_estimation() { constexpr test_case test_cases[] = { {TYPED_LITERAL(CharT, "\x58"), 1}, {TYPED_LITERAL(CharT, "x\x58"), 2}, - // test the boundaries of the intervals defined in n4885 [format.str.std]/11 + + // test the boundaries of the intervals defined in n4885 [format.string.std]/11 {TYPED_LITERAL(CharT, "\u10ff\x58"), 2}, {TYPED_LITERAL(CharT, "\u1100\x58"), 3}, {TYPED_LITERAL(CharT, "\u115f\x58"), 3}, @@ -74,7 +75,9 @@ void test_width_estimation() { {TYPED_LITERAL(CharT, "\ud7a3\x58"), 3}, {TYPED_LITERAL(CharT, "\ud7a4\x58"), 2}, {TYPED_LITERAL(CharT, "\ud7ff\x58"), 2}, + // skip over the surrogate pair range (\ud800-\udfff) + {TYPED_LITERAL(CharT, "\ue000\x58"), 2}, {TYPED_LITERAL(CharT, "\uf8ff\x58"), 2}, {TYPED_LITERAL(CharT, "\uf900\x58"), 3}, @@ -114,9 +117,9 @@ void test_width_estimation() { {TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2}, {TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2}, - // Pick a "short" and "long" codepoints (\u2000 and \ufe40), then form - // all permutations of 3-codepoint prefixes. This gives us coverage of - // all transitions (e.g. short-to-long, long-to-long). + // Pick "short" and "long" codepoints (\u2000 and \ufe40), then form all permutations of + // 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of + // all adjacent pairings (short/short, short/long, long/short, long/long). {TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4}, {TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5}, {TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5},