microsoft · StephanTLavavej · Apr 20, 2021 · Apr 13, 2021 · Apr 14, 2021 · Apr 14, 2021
@@ -462,7 +462,7 @@ _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
         if (_Ch < _Bound) {
             return _Result;
         }
-        _Result ^= 1;
+        _Result ^= 0b11u; // Flip between 1 and 2 on each iteration
     }
 
     return 1;
@@ -564,7 +564,7 @@ private:
     }
 
 public:
-    _NODISCARD int _Units_in_next_character(const char* _First, const char* _Last) const noexcept {
+    _NODISCARD int _Units_in_next_character(const char* const _First, const char* const _Last) const noexcept {
         // Returns a count of the number of code units that compose the first encoded character in
         // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
         // *_First is not a valid lead byte.
@@ -589,7 +589,7 @@ public:
         }
     }
 
-    _NODISCARD const char* _Find_encoded(const char* _First, const char* _Last, const char _Val) const {
+    _NODISCARD const char* _Find_encoded(const char* _First, const char* const _Last, const char _Val) const {
         // Returns the first occurrence of _Val as an encoded character (and not, for example, as a
         // continuation byte) in [_First, _Last).
         if constexpr (_Statically_Utf8) {
@@ -632,7 +632,7 @@ public:
 template <bool _Statically_Utf8>
 class _Fmt_codec<wchar_t, _Statically_Utf8> {
 public:
-    _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* _Last) const noexcept {
+    _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* const _Last) const noexcept {
         // Returns a count of the number of code units that compose the first encoded character in
         // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
         // *_First is an unpaired surrogate.
@@ -653,7 +653,8 @@ public:
         return 2; // surrogate pair
     }
 
-    _NODISCARD const wchar_t* _Find_encoded(const wchar_t* _First, const wchar_t* _Last, const wchar_t _Val) const {
+    _NODISCARD const wchar_t* _Find_encoded(
+        const wchar_t* const _First, const wchar_t* const _Last, const wchar_t _Val) const {
         return _Find_unchecked(_First, _Last, _Val);
     }
 
@@ -666,7 +667,7 @@ public:
         }
 
         // surrogate pair
-        _Ch = (_Ch - 0xD8000u) << 10;
+        _Ch = (_Ch - 0xD800u) << 10;
         _Ch += static_cast<char32_t>(_Ptr[1]) - 0xDC00u;
         _Ch += 0x10000u;
         return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);

@@ -50,7 +50,36 @@ void test_parse_align() {
         {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv});
 }
 
+void test_width_estimation() {
+    // Each test string has a known estimated width and ends with a one-column delimiter \x58 ('X').
+    // Formatting with precision (width - 1) must keep everything except that delimiter.
+    struct test_case {
+        const char* str;
+        int width;
+    };
+    constexpr test_case test_cases[] = {
+        // Pick a "short" and a "long" codepoint (\x20 and \x96\x7b), then form
+        // all 8 possible 3-codepoint prefixes. This gives us coverage of
+        // all transitions (e.g. short-to-long, long-to-long).
+        {"\x20\x20\x20\x58", 4},
+        {"\x20\x20\x96\x7b\x58", 5},
+        {"\x20\x96\x7b\x20\x58", 5},
+        {"\x96\x7b\x20\x20\x58", 5},
+        {"\x20\x96\x7b\x96\x7b\x58", 6},
+        {"\x96\x7b\x20\x96\x7b\x58", 6},
+        {"\x96\x7b\x96\x7b\x20\x58", 6},
+        {"\x96\x7b\x96\x7b\x96\x7b\x58", 7},
+    };
+
+    for (const auto& test : test_cases) {
+        basic_string_view sv{test.str};
+        sv = sv.substr(0, sv.size() - 1); // expected output: the input minus its trailing delimiter
+        assert(format("{:.{}}", test.str, test.width - 1) == sv);
+    }
+}
+
 int main() {
     test_multibyte_format_strings();
     test_parse_align();
+    test_width_estimation();
 }
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#define _FORMAT_CODEPAGE __std_code_page::_Utf8
-
 #include <cassert>
 #include <clocale>
 #include <format>
@@ -14,12 +12,12 @@ using namespace std;
 
 void test_multibyte_format_strings() {
     {
-        // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL)
-        assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32");
+        // Filling with footballs ("\U0001f3c8" is U+1F3C8 AMERICAN FOOTBALL)
+        assert(format("{:\U0001f3c8>4}"sv, 42) == "\U0001f3c8\U0001f3c8\x34\x32");
 
-        assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
-        assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
-        assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv);
+        assert(format("{:\U0001f3c8<4.2}", "1") == "\x31\U0001f3c8\U0001f3c8\U0001f3c8"sv);
+        assert(format("{:\U0001f3c8^4.2}", "1") == "\U0001f3c8\x31\U0001f3c8\U0001f3c8"sv);
+        assert(format("{:\U0001f3c8>4.2}", "1") == "\U0001f3c8\U0001f3c8\U0001f3c8\x31"sv);
     }
 
     {
@@ -35,19 +33,112 @@ void test_parse_align() {
     auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;
 
     {
-        // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL
-        test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88<X"sv, false, 5,
-            {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-        test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88>X"sv, false, 5,
-            {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-        test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5,
-            {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv});
+        test_parse_helper(parse_align_fn, "\U0001f3c8<X"sv, false, 5,
+            {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\U0001f3c8"sv});
+        test_parse_helper(parse_align_fn, "\U0001f3c8>X"sv, false, 5,
+            {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\U0001f3c8"sv});
+        test_parse_helper(parse_align_fn, "\U0001f3c8^X"sv, false, 5,
+            {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\U0001f3c8"sv});
+    }
+}
+
+template <class CharT>
+void test_width_estimation() {
+    // Each test string has a known estimated width and ends with a one-column delimiter \x58 ('X').
+    // Formatting with precision (width - 1) must keep everything except that delimiter.
+    struct test_case {
+        const CharT* str;
+        int width;
+    };
+    constexpr test_case test_cases[] = {
+        {TYPED_LITERAL(CharT, "\x58"), 1},
+        {TYPED_LITERAL(CharT, "x\x58"), 2},
+        // Test both sides of each boundary of the width-2 intervals listed in N4885 [format.string.std]/11.
+        {TYPED_LITERAL(CharT, "\u10ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u1100\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u115f\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u1160\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2328\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2329\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u232a\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u232b\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2e7f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2e80\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u303e\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u303f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u3040\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ua4cf\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ua4d0\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uabff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uac00\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ud7a3\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ud7a4\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ud7ff\x58"), 2},
+        // Skip over the surrogate code point range (U+D800 through U+DFFF); no test cases use it.
+        {TYPED_LITERAL(CharT, "\ue000\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uf8ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uf900\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufaff\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufb00\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe0f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe10\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe19\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe1a\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe2f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe30\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe6f\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe70\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufeff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uff00\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uff60\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uff61\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uffdf\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uffe0\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uffe6\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uffe7\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f2ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f300\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001f64f\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001f650\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f8ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f900\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001f9ff\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001fa00\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001ffff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U00020000\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0002fffd\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0002fffe\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0002ffff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U00030000\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0003fffd\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2},
+
+        // Pick a "short" and a "long" codepoint (\u2000 and \ufe40), then form
+        // all 8 possible 3-codepoint prefixes. This gives us coverage of
+        // all transitions (e.g. short-to-long, long-to-long).
+        {TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4},
+        {TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5},
+        {TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5},
+        {TYPED_LITERAL(CharT, "\ufe40\u2000\u2000\x58"), 5},
+        {TYPED_LITERAL(CharT, "\u2000\ufe40\ufe40\x58"), 6},
+        {TYPED_LITERAL(CharT, "\ufe40\u2000\ufe40\x58"), 6},
+        {TYPED_LITERAL(CharT, "\ufe40\ufe40\u2000\x58"), 6},
+        {TYPED_LITERAL(CharT, "\ufe40\ufe40\ufe40\x58"), 7},
+    };
+
+    for (const auto& test : test_cases) {
+        basic_string_view sv{test.str};
+        sv = sv.substr(0, sv.size() - 1); // expected output: the input minus its trailing delimiter
+        assert(format(TYPED_LITERAL(CharT, "{:.{}}"), test.str, test.width - 1) == sv);
     }
 }
 
 void run_tests() {
     test_multibyte_format_strings();
     test_parse_align();
+    test_width_estimation<char>();
+    test_width_estimation<wchar_t>();
 }
 
 int main() {