Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<format>: Compile-time string size estimation #2437

Closed
256 changes: 228 additions & 28 deletions stl/inc/format
Original file line number Diff line number Diff line change
Expand Up @@ -2763,12 +2763,39 @@ struct _Arg_formatter {
}
};

// Classifies a formatting argument type for the purpose of size estimation:
//   _Not_a_string          - the length cannot be derived from the argument
//   _Known_size_string     - the argument reports its own length through size()
//   _Nullterminated_string - the length has to be measured by scanning for the terminator
enum class _String_type : uint8_t { _Not_a_string, _Known_size_string, _Nullterminated_string };

// Primary template: every type without a specialization below is not treated as a string.
template <class _Ty>
inline constexpr _String_type _String_type_of = _String_type::_Not_a_string;

template <_Format_supported_charT _CharT, class _Traits>
inline constexpr _String_type _String_type_of<basic_string_view<_CharT, _Traits>> = _String_type::_Known_size_string;

template <_Format_supported_charT _CharT, class _Traits, class _Allocator>
inline constexpr _String_type _String_type_of<basic_string<_CharT, _Traits, _Allocator>> =
    _String_type::_Known_size_string;

template <_Format_supported_charT _CharT>
inline constexpr _String_type _String_type_of<_CharT*> = _String_type::_Nullterminated_string;

template <_Format_supported_charT _CharT>
inline constexpr _String_type _String_type_of<const _CharT*> = _String_type::_Nullterminated_string;

// Character arrays (string literals in particular). The trait is looked up with cv-qualifiers already removed
// (through remove_cvref_t, or through deduction against `const _Ty&`), and removing const from
// `const _CharT[_Nx]` yields `_CharT[_Nx]`, so the unqualified array form is the one that has to be matched.
template <_Format_supported_charT _CharT, size_t _Nx>
inline constexpr _String_type _String_type_of<_CharT[_Nx]> = _String_type::_Nullterminated_string;

// Kept for lookups that pass the array type with its const qualifier intact.
template <_Format_supported_charT _CharT, size_t _Nx>
inline constexpr _String_type _String_type_of<const _CharT[_Nx]> = _String_type::_Nullterminated_string;

// Satisfied by any type that exposes a nested type named _Is_formatter_base_specialization.
// _Formatter_base declares that member, so classes inheriting it from there satisfy the concept as well.
template <class _Ty>
concept _Derived_from_formatter_base = requires {
typename _Ty::_Is_formatter_base_specialization;
};

// Special compile time version of _Parse_format_specs. This version is parameterized on
// the type of the argument associated with the format specifier, since we don't really
// care about avoiding code bloat for code that never runs at runtime, and we can't form
// the erased basic_format_args structure at compile time.
template <class _Ty, class _ParseContext>
consteval typename _ParseContext::iterator _Compile_time_parse_format_specs(_ParseContext& _Pc) {
consteval pair<typename _ParseContext::iterator, _Dynamic_format_specs<typename _ParseContext::char_type>>
_Compile_time_parse_format_specs(_ParseContext& _Pc) {
using _CharT = typename _ParseContext::char_type;
using _Context = basic_format_context<back_insert_iterator<_Fmt_buffer<_CharT>>, _CharT>;
using _ArgTraits = _Format_arg_traits<_Context>;
Expand All @@ -2779,31 +2806,91 @@ consteval typename _ParseContext::iterator _Compile_time_parse_format_specs(_Par
using _FormattedType = conditional_t<is_same_v<_FormattedTypeMapping, typename basic_format_arg<_Context>::handle>,
_Ty, _FormattedTypeMapping>;
formatter<_FormattedType, _CharT> _Formatter{};
return _Formatter.parse(_Pc);
auto _Iter = _Formatter.parse(_Pc);
if constexpr (_Derived_from_formatter_base<formatter<_FormattedType, _CharT>>) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

User-defined formatters that derive from built-in ones don't necessarily populate their format specs; they are not obligated to call their base parse method (though that would be a somewhat odd formatter).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a user-defined formatter never calls the base parse, then _Specs stays in its default-constructed state, which is what this function returns for formatters that do not derive from _Formatter_base anyway. If the user derives from a standard formatter and does call its parse, then _Specs can help with the estimation. This is useful for users' enum formatters, which often derive from the basic_string_view formatter to have width and precision handled automatically.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if they call parse but don't actually output the implied number of characters in format?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After realizing my mistake with the flow through _On_format_specs I think things are OK even if they lie

return {_Iter, _Formatter._Specs};
AdamBucior marked this conversation as resolved.
Show resolved Hide resolved
} else {
return {_Iter, {}};
}
}

// set of format parsing actions that only checks for validity
template <class _CharT, class... _Args>
struct _Format_checker {
using _ParseContext = basic_format_parse_context<_CharT>;
using _ParseFunc = typename _ParseContext::iterator (*)(_ParseContext&);
using _ParseFunc = pair<typename _ParseContext::iterator, _Dynamic_format_specs<_CharT>> (*)(_ParseContext&);

static constexpr size_t _Num_args = sizeof...(_Args);
static constexpr size_t _Num_args = sizeof...(_Args);
static constexpr size_t _Num_args_non_zero = _Num_args > 0 ? _Num_args : 1;
_ParseContext _Parse_context;
_ParseFunc _Parse_funcs[_Num_args > 0 ? _Num_args : 1];
_ParseFunc _Parse_funcs[_Num_args_non_zero];
size_t _Arg_use_count[_Num_args_non_zero]{}; // only used for strings
_String_type _Arg_type[_Num_args_non_zero];
size_t _Estimated_size = 0;
bool _Is_estimation_exact = true;

consteval explicit _Format_checker(basic_string_view<_CharT> _Fmt) noexcept
: _Parse_context(_Fmt, _Num_args), _Parse_funcs{&_Compile_time_parse_format_specs<_Args, _ParseContext>...} {}
constexpr void _On_text(const _CharT*, const _CharT*) const noexcept {}
constexpr void _On_replacement_field(size_t, const _CharT*) const noexcept {}
: _Parse_context(_Fmt, _Num_args), _Parse_funcs{&_Compile_time_parse_format_specs<_Args, _ParseContext>...},
_Arg_type{_String_type_of<_Args>...} {}

// Adds the length of the text run [_First, _Last) to the running size estimate.
constexpr void _On_text(const _CharT* _First, const _CharT* _Last) noexcept {
    const auto _Text_length = _Last - _First;
    _Estimated_size += _Text_length;
}

// Accounts for one use of argument _Id in a replacement field.
constexpr void _On_replacement_field(const size_t _Id, const _CharT*) noexcept {
    switch (_Arg_type[_Id]) {
    case _String_type::_Known_size_string:
    case _String_type::_Nullterminated_string:
        // string arguments are only counted here; their real length is added later, which keeps the
        // estimation exact
        ++_Arg_use_count[_Id];
        break;
    default:
        // every other argument contributes a fixed guess of 8, so the total is no longer exact
        _Estimated_size += 8;
        _Is_estimation_exact = false;
        break;
    }
}

// Processes the format specs of the replacement field that refers to argument _Id.
// Repositions _Parse_context at _First, parses the specs through the matching entry of _Parse_funcs and folds
// the parsed width/precision into _Estimated_size, _Arg_use_count and _Is_estimation_exact.
// Returns the position at which spec parsing stopped, or _First when _Id does not name an argument.
constexpr const _CharT* _On_format_specs(const size_t _Id, const _CharT* _First, const _CharT*) {
// move the parse context forward so that it begins at _First
_Parse_context.advance_to(_Parse_context.begin() + (_First - _Parse_context.begin()._Unwrapped()));
// NOTE(review): the `if (_Id < _Num_args) { ... } else {` lines below look like the pre-change body listed
// together with its replacement (`if (_Id >= _Num_args)`); as written the braces do not balance - confirm
// which statements are meant to be live.
if (_Id < _Num_args) {
auto _Iter = _Parse_funcs[_Id](_Parse_context); // TRANSITION, VSO-1451773 (workaround: named variable)
return _Iter._Unwrapped();
} else {
if (_Id >= _Num_args) {
return _First;
}

auto [_Iter, _Specs] = _Parse_funcs[_Id](_Parse_context);
if (_Arg_type[_Id] == _String_type::_Nullterminated_string) {
// the checks below are ordered: constant precision, constant width, dynamic precision, dynamic width
if (_Specs._Precision >= 0) {
// if precision (the maximum length) is specified as a constant, add it
_Estimated_size += _Specs._Precision;
_Is_estimation_exact = false;
} else if (_Specs._Width > 0) {
// otherwise, if width (the minimum length) is specified as a constant, add it
_Estimated_size += _Specs._Width;
_Is_estimation_exact = false;
} else if (_Specs._Dynamic_precision_index >= 0) {
// a dynamic precision cannot be predicted here, so fall back to a fixed guess of 32
_Estimated_size += 32;
_Is_estimation_exact = false;
} else if (_Specs._Dynamic_width_index >= 0) {
// if precision is not specified and width is dynamic, the length of the argument is measured later
// and added to the estimation; the width may still enlarge the output, so the result is not exact
++_Arg_use_count[_Id];
_Is_estimation_exact = false;
} else {
// if precision and width are not specified we will calculate the length of the argument and add
// it to estimation. The estimation will remain exact
++_Arg_use_count[_Id];
}
} else if (_Arg_type[_Id] == _String_type::_Known_size_string) {
// if the length of the string is known we will add it to estimation
++_Arg_use_count[_Id];
// any width or precision (constant or dynamic) can change the output length, so exactness is lost
if (_Specs._Precision >= 0 || _Specs._Width > 0 || _Specs._Dynamic_precision_index >= 0
|| _Specs._Dynamic_width_index >= 0) {
_Is_estimation_exact = false;
}
} else {
// for all other arguments use the largest of precision, width, and 8
_Estimated_size += (_STD max)((_STD max)(_Specs._Precision, _Specs._Width), 8);
_Is_estimation_exact = false;
}
return _Iter._Unwrapped();
}
};

Expand Down Expand Up @@ -2869,6 +2956,8 @@ template <class _Ty, class _CharT, _Basic_format_arg_type _ArgType>
struct _Formatter_base {
using _Pc = basic_format_parse_context<_CharT>;

using _Is_formatter_base_specialization = void;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should really check for formatters that we control, not just anything derived from _Formatter_base.

Also, if we want to check for children of _Formatter_base, why can't we just use is_base_of?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's useful to account for width and precision from user defined formatters that derive from standard formatters. See #2437 (comment). Also can't use is_base_of because _Formatter_base is a template.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, the way to go is to do the _From_primary thing that allocator does.

I guess I agree that it's useful, but it adds a footgun if we keep the exact estimation paths

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I agree that it's useful, but it adds a footgun if we keep the exact estimation paths

I don't think users can specialize formatter for basic_string_view, basic_string or (const) char* so there isn't really any footgun.


constexpr typename _Pc::iterator parse(_Pc& _ParseCtx) {
_Specs_checker<_Dynamic_specs_handler<_Pc>> _Handler(_Dynamic_specs_handler<_Pc>{_Specs, _ParseCtx}, _ArgType);
const auto _It = _Parse_format_specs(_ParseCtx._Unchecked_begin(), _ParseCtx._Unchecked_end(), _Handler);
Expand Down Expand Up @@ -2898,6 +2987,10 @@ struct _Formatter_base {

private:
AdamBucior marked this conversation as resolved.
Show resolved Hide resolved
_Dynamic_format_specs<_CharT> _Specs;

template <class _Ty2, class _ParseContext>
friend consteval pair<typename _ParseContext::iterator, _Dynamic_format_specs<typename _ParseContext::char_type>>
_Compile_time_parse_format_specs(_ParseContext& _Pc);
};

#define _FORMAT_SPECIALIZE_FOR(_Type, _ArgType) \
Expand Down Expand Up @@ -2952,15 +3045,54 @@ struct formatter<basic_string_view<_CharT, _Traits>, _CharT>

template <class _CharT, class... _Args>
struct _Basic_format_string {
static constexpr size_t _Num_args = sizeof...(_Args);

basic_string_view<_CharT> _Str;
size_t _Arg_use_count[_Num_args > 0 ? _Num_args : 1]{};
size_t _Estimated_size = 0;
bool _Is_estimation_exact = false;

template <class _Ty>
requires convertible_to<const _Ty&, basic_string_view<_CharT>>
consteval _Basic_format_string(const _Ty& _Str_val) : _Str(_Str_val) {
if (_Is_execution_charset_self_synchronizing()) {
#ifndef __clang__ // TRANSITION, clang consteval bug (likely LLVM-52648)
_Format_checker<_CharT, remove_cvref_t<_Args>...> _Handler{_Str};
_Parse_format_string(_Str, _Handler);
_RANGES copy(_Handler._Arg_use_count, _Arg_use_count);
_Estimated_size = _Handler._Estimated_size;
_Is_estimation_exact = _Handler._Is_estimation_exact;
#else // ^^^ no workaround ^^^ / vvv workaround vvv
_Parse_format_string(_Str, _Format_checker<_CharT, remove_cvref_t<_Args>...>{_Str});
_Estimated_size = _Str.size() + _Num_args * 8;
AdamBucior marked this conversation as resolved.
Show resolved Hide resolved
_RANGES fill(_Arg_use_count, static_cast<size_t>(1));
#endif // ^^^ workaround ^^^
} else {
// fallback to assumption that all arguments are used once
_Estimated_size = _Str.size() + _Num_args * 8;
_RANGES fill(_Arg_use_count, static_cast<size_t>(1));
}
}

template <size_t... _Ids>
_NODISCARD constexpr size_t _Estimate_required_capacity_helper(
const _Args&... _Arg_values, integer_sequence<size_t, _Ids...>) const noexcept {
const auto _Visitor = [this]<class _ArgTy>(const _ArgTy& _Arg, size_t _Id) noexcept {
if constexpr (_String_type_of<_ArgTy> == _String_type::_Known_size_string) {
return _Arg_use_count[_Id] * _Arg.size();
} else if constexpr (_String_type_of<_ArgTy> == _String_type::_Nullterminated_string) {
// don't bother with calculating the length if we don't need to
return _Arg_use_count[_Id] > 0 ? _Arg_use_count[_Id] * char_traits<_CharT>::length(_Arg) : 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be slower than just doing the formatting and counting, but maybe it's still worth it.

} else {
return 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we estimating zero for non-strings?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For non-strings estimation has been done before:

STL/stl/inc/format

Lines 2876 to 2877 in 3d74470

// for all other arguments use the largest of precision, width and 8
_Estimated_size += (_STD max)((_STD max)(_Specs._Precision, _Specs._Width), 8);

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, it's still somewhat confusing to split the estimation up like this, but I can see why it's done.

}
};
return (_Estimated_size + ... + _Visitor(_Arg_values, _Ids));
}

// Returns the capacity estimate for one concrete set of arguments by forwarding them, together with their
// positional indices, to _Estimate_required_capacity_helper.
_NODISCARD constexpr size_t _Estimate_required_capacity(const _Args&... _Arg_values) const noexcept {
    using _Arg_indices = make_index_sequence<sizeof...(_Args)>;
    return _Estimate_required_capacity_helper(_Arg_values..., _Arg_indices{});
}
};

template <class... _Args>
Expand Down Expand Up @@ -3092,22 +3224,74 @@ _NODISCARD wstring vformat(const locale& _Loc, const wstring_view _Fmt, const wf

template <class... _Types>
_NODISCARD string format(const _Fmt_string<_Types...> _Fmt, _Types&&... _Args) {
return _STD vformat(_Fmt._Str, _STD make_format_args(_Args...));
const size_t _Estimated_size = _Fmt._Estimate_required_capacity(_Args...);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be better with a helper _Vformat that takes the estimated capacity.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once that's done maybe this stuff should be done through _Fmt_iterator_buffer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't understand what you mean. How exactly should this be restructured?

string _Str;
if (_Fmt._Is_estimation_exact) {
// The Standard does not allow op passed to resize_and_overwrite to throw an exception, but it's not a concern
// for our implementation
_Str._Resize_and_overwrite(_Estimated_size, [&](char* _Out, size_t _Size) {
_STD format_to(_Out, _Fmt, _STD forward<_Types>(_Args)...);
return _Size;
});
} else {
_Str.reserve(_Estimated_size);
_STD format_to(back_insert_iterator{_Str}, _Fmt, _STD forward<_Types>(_Args)...);
}
return _Str;
}

// format() overload for wide format strings; returns the formatted text as a wstring.
// NOTE(review): as listed, the function returns from the vformat() call on its first statement, so the
// estimation-based statements after it are never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD wstring format(const _Fmt_wstring<_Types...> _Fmt, _Types&&... _Args) {
return _STD vformat(_Fmt._Str, _STD make_wformat_args(_Args...));
// ask the checked format string how many characters the output is expected to need
const size_t _Estimated_size = _Fmt._Estimate_required_capacity(_Args...);
wstring _Str;
if (_Fmt._Is_estimation_exact) {
// The Standard does not allow op passed to resize_and_overwrite to throw an exception, but it's not a concern
// for our implementation
// The callback formats straight into the buffer and reports the full _Size back regardless of how much
// format_to wrote, so this path relies on the estimate really being exact.
_Str._Resize_and_overwrite(_Estimated_size, [&](wchar_t* _Out, size_t _Size) {
_STD format_to(_Out, _Fmt, _STD forward<_Types>(_Args)...);
return _Size;
});
} else {
// inexact estimate: only pre-reserve, then append through a back_insert_iterator
_Str.reserve(_Estimated_size);
_STD format_to(back_insert_iterator{_Str}, _Fmt, _STD forward<_Types>(_Args)...);
}
return _Str;
}

// format() overload taking a locale and a narrow format string; returns the formatted text as a string.
// NOTE(review): as listed, the function returns from the vformat() call on its first statement, so the
// estimation-based statements after it are never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD string format(const locale& _Loc, const _Fmt_string<_Types...> _Fmt, _Types&&... _Args) {
return _STD vformat(_Loc, _Fmt._Str, _STD make_format_args(_Args...));
// ask the checked format string how many characters the output is expected to need
const size_t _Estimated_size = _Fmt._Estimate_required_capacity(_Args...);
string _Str;
if (_Fmt._Is_estimation_exact) {
// The Standard does not allow op passed to resize_and_overwrite to throw an exception, but it's not a concern
// for our implementation
// The callback formats straight into the buffer and reports the full _Size back regardless of how much
// format_to wrote, so this path relies on the estimate really being exact.
_Str._Resize_and_overwrite(_Estimated_size, [&](char* _Out, size_t _Size) {
_STD format_to(_Out, _Loc, _Fmt, _STD forward<_Types>(_Args)...);
return _Size;
});
} else {
// inexact estimate: only pre-reserve, then append through a back_insert_iterator
_Str.reserve(_Estimated_size);
_STD format_to(back_insert_iterator{_Str}, _Loc, _Fmt, _STD forward<_Types>(_Args)...);
}
return _Str;
}

// format() overload taking a locale and a wide format string; returns the formatted text as a wstring.
// NOTE(review): as listed, the function returns from the vformat() call on its first statement, so the
// estimation-based statements after it are never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD wstring format(const locale& _Loc, const _Fmt_wstring<_Types...> _Fmt, _Types&&... _Args) {
return _STD vformat(_Loc, _Fmt._Str, _STD make_wformat_args(_Args...));
// ask the checked format string how many characters the output is expected to need
const size_t _Estimated_size = _Fmt._Estimate_required_capacity(_Args...);
wstring _Str;
if (_Fmt._Is_estimation_exact) {
// The Standard does not allow op passed to resize_and_overwrite to throw an exception, but it's not a concern
// for our implementation
// The callback formats straight into the buffer and reports the full _Size back regardless of how much
// format_to wrote, so this path relies on the estimate really being exact.
_Str._Resize_and_overwrite(_Estimated_size, [&](wchar_t* _Out, size_t _Size) {
_STD format_to(_Out, _Loc, _Fmt, _STD forward<_Types>(_Args)...);
return _Size;
});
} else {
// inexact estimate: only pre-reserve, then append through a back_insert_iterator
_Str.reserve(_Estimated_size);
_STD format_to(back_insert_iterator{_Str}, _Loc, _Fmt, _STD forward<_Types>(_Args)...);
}
return _Str;
}

template <class _OutputIt>
Expand Down Expand Up @@ -3150,30 +3334,46 @@ format_to_n_result<_OutputIt> format_to_n(_OutputIt _Out, const iter_difference_

// formatted_size() for narrow format strings: returns the number of characters the formatted output takes.
// NOTE(review): as listed, the first three statements always format into a counting buffer and return, so the
// _Is_estimation_exact branch after them is never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD size_t formatted_size(const _Fmt_string<_Types...> _Fmt, _Types&&... _Args) {
_Fmt_counting_buffer<char> _Buf;
_STD vformat_to(_Fmt_it{_Buf}, _Fmt._Str, _STD make_format_args(_Args...));
return _Buf._Count();
if (_Fmt._Is_estimation_exact) {
// an exact estimate is the answer itself, no formatting pass is needed
return _Fmt._Estimate_required_capacity(_Args...);
} else {
// otherwise format into a counting buffer and report its count
_Fmt_counting_buffer<char> _Buf;
_STD vformat_to(_Fmt_it{_Buf}, _Fmt._Str, _STD make_format_args(_Args...));
return _Buf._Count();
}
}

// formatted_size() for wide format strings: returns the number of characters the formatted output takes.
// NOTE(review): as listed, the first three statements always format into a counting buffer and return, so the
// _Is_estimation_exact branch after them is never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD size_t formatted_size(const _Fmt_wstring<_Types...> _Fmt, _Types&&... _Args) {
_Fmt_counting_buffer<wchar_t> _Buf;
_STD vformat_to(_Fmt_wit{_Buf}, _Fmt._Str, _STD make_wformat_args(_Args...));
return _Buf._Count();
if (_Fmt._Is_estimation_exact) {
// an exact estimate is the answer itself, no formatting pass is needed
return _Fmt._Estimate_required_capacity(_Args...);
} else {
// otherwise format into a counting buffer and report its count
_Fmt_counting_buffer<wchar_t> _Buf;
_STD vformat_to(_Fmt_wit{_Buf}, _Fmt._Str, _STD make_wformat_args(_Args...));
return _Buf._Count();
}
}

// formatted_size() taking a locale and a narrow format string: returns the number of characters the formatted
// output takes.
// NOTE(review): as listed, the first three statements always format into a counting buffer and return, so the
// _Is_estimation_exact branch after them is never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD size_t formatted_size(const locale& _Loc, const _Fmt_string<_Types...> _Fmt, _Types&&... _Args) {
_Fmt_counting_buffer<char> _Buf;
_STD vformat_to(_Fmt_it{_Buf}, _Loc, _Fmt._Str, _STD make_format_args(_Args...));
return _Buf._Count();
if (_Fmt._Is_estimation_exact) {
// an exact estimate is the answer itself; note that _Loc is not consulted on this path
return _Fmt._Estimate_required_capacity(_Args...);
} else {
// otherwise format into a counting buffer and report its count
_Fmt_counting_buffer<char> _Buf;
_STD vformat_to(_Fmt_it{_Buf}, _Loc, _Fmt._Str, _STD make_format_args(_Args...));
return _Buf._Count();
}
}

// formatted_size() taking a locale and a wide format string: returns the number of characters the formatted
// output takes.
// NOTE(review): as listed, the first three statements always format into a counting buffer and return, so the
// _Is_estimation_exact branch after them is never reached. This looks like the pre-change body shown together
// with its replacement - confirm which one is intended.
template <class... _Types>
_NODISCARD size_t formatted_size(const locale& _Loc, const _Fmt_wstring<_Types...> _Fmt, _Types&&... _Args) {
_Fmt_counting_buffer<wchar_t> _Buf;
_STD vformat_to(_Fmt_wit{_Buf}, _Loc, _Fmt._Str, _STD make_wformat_args(_Args...));
return _Buf._Count();
if (_Fmt._Is_estimation_exact) {
// an exact estimate is the answer itself; note that _Loc is not consulted on this path
return _Fmt._Estimate_required_capacity(_Args...);
} else {
// otherwise format into a counting buffer and report its count
_Fmt_counting_buffer<wchar_t> _Buf;
_STD vformat_to(_Fmt_wit{_Buf}, _Loc, _Fmt._Str, _STD make_wformat_args(_Args...));
return _Buf._Count();
}
}

_STD_END
Expand Down
9 changes: 7 additions & 2 deletions stl/inc/xstring
Original file line number Diff line number Diff line change
Expand Up @@ -3959,9 +3959,8 @@ public:
}
}

#if _HAS_CXX23
template <class _Operation>
constexpr void resize_and_overwrite(_CRT_GUARDOVERFLOW const size_type _New_size, _Operation _Op) {
_CONSTEXPR20 void _Resize_and_overwrite(_CRT_GUARDOVERFLOW const size_type _New_size, _Operation _Op) {
if (_Mypair._Myval2._Myres < _New_size) {
_Reallocate_grow_by(_New_size - _Mypair._Myval2._Mysize,
[](_Elem* const _New_ptr, const _Elem* const _Old_ptr, const size_type _Old_size) {
Expand All @@ -3976,6 +3975,12 @@ public:
#endif // _CONTAINER_DEBUG_LEVEL > 0
_Eos(_Result_size);
}

#if _HAS_CXX23
template <class _Operation>
constexpr void resize_and_overwrite(_CRT_GUARDOVERFLOW const size_type _New_size, _Operation _Op) {
_Resize_and_overwrite(_New_size, _STD move(_Op));
}
#endif // _HAS_CXX23

_NODISCARD _CONSTEXPR20 size_type capacity() const noexcept {
Expand Down
3 changes: 3 additions & 0 deletions tests/std/tests/P0645R10_text_formatting_formatting/env.lst
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\concepts_20_matrix.lst
RUNALL_CROSSLIST
PM_CL=""
PM_CL="/utf-8"
Loading