diff --git a/pxr/base/tf/testenv/unicodeUtils.cpp b/pxr/base/tf/testenv/unicodeUtils.cpp index 1e821a45a7..1c8f77d71c 100644 --- a/pxr/base/tf/testenv/unicodeUtils.cpp +++ b/pxr/base/tf/testenv/unicodeUtils.cpp @@ -24,14 +24,60 @@ #include "pxr/pxr.h" #include "pxr/base/tf/diagnosticLite.h" #include "pxr/base/tf/regTest.h" +#include "pxr/base/tf/stringUtils.h" #include "pxr/base/tf/unicodeUtils.h" #include #include +#include #include PXR_NAMESPACE_USING_DIRECTIVE +static bool +TestUtf8CodePoint() +{ + { + // Test default behavior + TF_AXIOM(TfUtf8CodePoint{} == TfUtf8InvalidCodePoint); + } + { + // Test boundary conditions + TF_AXIOM(TfUtf8CodePoint{0}.AsUInt32() == 0); + TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::MaximumValue}.AsUInt32() == + TfUtf8CodePoint::MaximumValue); + TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::MaximumValue + 1} == + TfUtf8InvalidCodePoint); + TF_AXIOM( + TfUtf8CodePoint{std::numeric_limits::max()} == + TfUtf8InvalidCodePoint); + TF_AXIOM(TfUtf8CodePoint{ + TfUtf8CodePoint::SurrogateRange.first - 1}.AsUInt32() == + TfUtf8CodePoint::SurrogateRange.first - 1); + TF_AXIOM(TfUtf8CodePoint{ + TfUtf8CodePoint::SurrogateRange.second + 1}.AsUInt32() == + TfUtf8CodePoint::SurrogateRange.second + 1); + TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::SurrogateRange.first} == + TfUtf8InvalidCodePoint); + TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::SurrogateRange.second} == + TfUtf8InvalidCodePoint); + TF_AXIOM(TfUtf8CodePoint{ + (TfUtf8CodePoint::SurrogateRange.second + + TfUtf8CodePoint::SurrogateRange.first) / 2} == + TfUtf8InvalidCodePoint); + } + { + // Test TfStringify + TF_AXIOM(TfStringify(TfUtf8CodePoint(97)) == "a"); + TF_AXIOM(TfStringify(TfUtf8CodePoint(8747)) == "∫"); + TF_AXIOM(TfStringify(TfUtf8InvalidCodePoint) == "�"); + TF_AXIOM(TfStringify(TfUtf8CodePoint{}) == + TfStringify(TfUtf8InvalidCodePoint)); + + } + return true; +} + static bool TestUtf8CodePointView() { @@ -53,7 +99,7 @@ TestUtf8CodePointView() TF_AXIOM(i1 == std::cend(u1)); for 
(const uint32_t codePoint : u1) { - TF_AXIOM(codePoint != TfUtf8CodePointIterator::INVALID_CODE_POINT); + TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32()); } } @@ -67,7 +113,7 @@ TestUtf8CodePointView() TF_AXIOM(i2 == std::cend(u2)); for (const uint32_t codePoint : u2) { - TF_AXIOM(codePoint != TfUtf8CodePointIterator::INVALID_CODE_POINT); + TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32()); } } @@ -98,7 +144,7 @@ TestUtf8CodePointView() TF_AXIOM(i3b == std::cend(u3)); for (const uint32_t codePoint : u3) { - TF_AXIOM(codePoint != TfUtf8CodePointIterator::INVALID_CODE_POINT); + TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32()); } } @@ -112,8 +158,8 @@ TestUtf8CodePointView() std::array codePoints{0}; const std::array expectedCodePoints{{ - TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, 0x62, - TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63}}; + TfUtf8InvalidCodePoint.AsUInt32(), 0x61, 0x62, + TfUtf8InvalidCodePoint.AsUInt32(), 0x63}}; std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints)); TF_AXIOM(codePoints == expectedCodePoints); } @@ -127,16 +173,50 @@ TestUtf8CodePointView() std::array codePoints{0}; const std::array expectedCodePoints{{ - TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, - TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x62, - TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63, - TfUtf8CodePointIterator::INVALID_CODE_POINT}}; + TfUtf8InvalidCodePoint.AsUInt32(), 0x61, + TfUtf8InvalidCodePoint.AsUInt32(), 0x62, + TfUtf8InvalidCodePoint.AsUInt32(), 0x63, + TfUtf8InvalidCodePoint.AsUInt32()}}; std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints)); TF_AXIOM(codePoints == expectedCodePoints); } return true; } +/// Ensure that every code point can be serialized into a string and converted +/// back to a code point. 
+static bool +TestUtf8CodePointReflection() +{ + for (uint32_t value = 0; value <= TfUtf8CodePoint::MaximumValue; value++) { + if ((value < TfUtf8CodePoint::SurrogateRange.first) || + (value > TfUtf8CodePoint::SurrogateRange.second)) { + const TfUtf8CodePoint codePoint{value}; + TF_AXIOM(codePoint.AsUInt32() == value); + const std::string text{TfStringify(codePoint)}; + const auto view = TfUtf8CodePointView{text}; + TF_AXIOM(std::cbegin(view) != std::cend(view)); + TF_AXIOM(*std::cbegin(view) == codePoint.AsUInt32()); + TF_AXIOM(++std::cbegin(view) == std::cend(view)); + } + } + return true; +} + +/// Ensure that the surrogate range is replaced with the invalid character +static bool +TestUtf8CodePointSurrogateRange() +{ + for (uint32_t value = TfUtf8CodePoint::SurrogateRange.first; + value <= TfUtf8CodePoint::SurrogateRange.second; value++) { + const TfUtf8CodePoint surrogateCodePoint{value}; + TF_AXIOM(surrogateCodePoint == TfUtf8InvalidCodePoint); + TF_AXIOM(TfStringify(surrogateCodePoint) == + TfStringify(TfUtf8InvalidCodePoint)); + } + return true; +} + static bool TestCharacterClasses() { @@ -301,7 +381,11 @@ TestCharacterClasses() static bool Test_TfUnicodeUtils() { - return TestUtf8CodePointView() && TestCharacterClasses(); + return TestUtf8CodePoint() && + TestUtf8CodePointView() && + TestCharacterClasses() && + TestUtf8CodePointReflection() && + TestUtf8CodePointSurrogateRange(); } TF_ADD_REGTEST(TfUnicodeUtils); diff --git a/pxr/base/tf/unicodeUtils.cpp b/pxr/base/tf/unicodeUtils.cpp index 79339f5430..05dc282c6e 100644 --- a/pxr/base/tf/unicodeUtils.cpp +++ b/pxr/base/tf/unicodeUtils.cpp @@ -27,13 +27,50 @@ PXR_NAMESPACE_OPEN_SCOPE +std::ostream& +operator<<(std::ostream& stream, const TfUtf8CodePoint codePoint) +{ + const auto value = codePoint.AsUInt32(); + if (value < 0x80) + { + // 1-byte UTF-8 encoding + stream << static_cast(value); + } + else if (value < 0x800) + { + // 2-byte UTF-8 encoding + stream << (static_cast(static_cast((value >> 6) | 
0xc0))); + stream << (static_cast(static_cast((value & 0x3f) | 0x80))); + } + else if (value < 0x10000) + { + // 3-byte UTF-8 encoding + stream << (static_cast(static_cast((value >> 12) | 0xe0))); + stream << (static_cast(static_cast(((value >> 6) & 0x3f) | 0x80))); + stream << (static_cast(static_cast((value & 0x3f) | 0x80))); + } + else if (value < 0x110000) + { + // 4-byte UTF-8 encoding + stream << (static_cast(static_cast((value >> 18) | 0xf0))); + stream << (static_cast(static_cast(((value >> 12) & 0x3f) | 0x80))); + stream << (static_cast(static_cast(((value >> 6) & 0x3f) | 0x80))); + stream << (static_cast(static_cast((value & 0x3f) | 0x80))); + } + else + { + stream << TfUtf8InvalidCodePoint; + } + return stream; +} + uint32_t TfUtf8CodePointIterator::_GetCodePoint() const { // determine what encoding length the character is _EncodingLength encodingLength = this->_GetEncodingLength(); if (encodingLength > std::distance(_it, _end)) { // error condition, would read bytes past the end of the range - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } if (encodingLength == 1) { @@ -49,12 +86,12 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const if (byte1 < static_cast('\xc2') || byte1 > static_cast('\xdf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } if (byte2 < static_cast('\x80') || byte2 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } // the code point is constructed from the last 5 bits of byte1 @@ -77,7 +114,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const byte3 < static_cast('\x80') || byte3 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } } else if ((byte1 >= static_cast('\xe1') && @@ -92,7 +129,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const byte3 < static_cast('\x80') || byte3 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return 
TfUtf8InvalidCodePoint.AsUInt32(); } } else if (byte1 == static_cast('\xed')) @@ -104,13 +141,13 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const byte3 < static_cast('\x80') || byte3 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } } else { // byte 1 invalid - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } // code point is constructed from the last 4 bits of byte1 @@ -137,7 +174,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const byte4 < static_cast('\x80') || byte4 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } } else if (byte1 >= static_cast('\xf1') && @@ -153,7 +190,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const byte4 < static_cast('\x80') || byte4 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } } else if (byte1 == static_cast('\xf4')) @@ -168,13 +205,13 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const byte4 < static_cast('\x80') || byte4 > static_cast('\xbf')) { - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } } else { // byte 1 is invalid - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } // code point is constructed from the last 3 bits of byte 1 @@ -182,7 +219,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) + ((byte3 & 0x3f) << 6) + (byte4 & 0x3f); } - return INVALID_CODE_POINT; + return TfUtf8InvalidCodePoint.AsUInt32(); } PXR_NAMESPACE_CLOSE_SCOPE \ No newline at end of file diff --git a/pxr/base/tf/unicodeUtils.h b/pxr/base/tf/unicodeUtils.h index 8bce30c349..5dd1c7037f 100644 --- a/pxr/base/tf/unicodeUtils.h +++ b/pxr/base/tf/unicodeUtils.h @@ -32,11 +32,60 @@ #include "pxr/base/tf/diagnostic.h" #include "pxr/base/tf/unicodeCharacterClasses.h" +#include #include #include PXR_NAMESPACE_OPEN_SCOPE +/// Wrapper for a code 
point value that can be encoded as UTF-8 +class TfUtf8CodePoint { +public: + /// Code points that cannot be decoded or are outside of the valid range + /// may be replaced with this value. + static constexpr uint32_t ReplacementValue = 0xFFFD; + + /// Values higher than this will be replaced with the replacement + /// code point. + static constexpr uint32_t MaximumValue = 0x10FFFF; + + /// Values in this range (inclusive) cannot be constructed and will be + /// replaced by the replacement code point. + static constexpr std::pair + SurrogateRange = {0xD800, 0xDFFF}; + + /// Construct a code point initialized to the replacement value + constexpr TfUtf8CodePoint() = default; + + /// Construct a UTF-8 valued code point, constrained by the maximum value + /// and surrogate range. + constexpr explicit TfUtf8CodePoint(uint32_t value) : + _value(((value <= MaximumValue) && + ((value < SurrogateRange.first) || + (value > SurrogateRange.second))) ? + value : ReplacementValue) {} + + constexpr uint32_t AsUInt32() const { return _value; } + + friend constexpr bool operator==(const TfUtf8CodePoint left, + const TfUtf8CodePoint right) { + return left._value == right._value; + } + friend constexpr bool operator!=(const TfUtf8CodePoint left, + const TfUtf8CodePoint right) { + return left._value != right._value; + } + +private: + uint32_t _value{ReplacementValue}; +}; + +TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint); + +/// The replacement code point can be used to signal that a code point could +/// not be decoded and needed to be replaced. 
+constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{}; + class TfUtf8CodePointIterator; /// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over @@ -49,7 +98,7 @@ class TfUtf8CodePointIterator; /// std::string value{"∫dx"}; /// TfUtf8CodePointView view{value}; /// for (const uint32_t codePoint : view) { -/// if (codePoint == TfTfUtf8CodePointIterator::INVALID_CODE_POINT) { +/// if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) { /// TF_WARN("String cannot be decoded."); /// } /// } @@ -125,11 +174,9 @@ class TfUtf8CodePointIterator final { using pointer = void; using reference = uint32_t; - static constexpr uint32_t INVALID_CODE_POINT = 0xFFFD; - /// Retrieves the next UTF-8 character in the sequence as its Unicode - /// code point value. Returns INVALID_CODE_POINT when the byte sequence - /// pointed to by the iterator cannot be decoded. + /// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the + /// byte sequence pointed to by the iterator cannot be decoded. /// /// If during read of the UTF-8 character sequence the underlying /// string iterator would go beyond \a end defined at construction