Skip to content

Commit

Permalink
Add class to facilitate serialization and validation of code points
Browse files Browse the repository at this point in the history
  • Loading branch information
nvmkuruc committed Dec 18, 2023
1 parent 7673eda commit 9a337ca
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 27 deletions.
104 changes: 94 additions & 10 deletions pxr/base/tf/testenv/unicodeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,60 @@
#include "pxr/pxr.h"
#include "pxr/base/tf/diagnosticLite.h"
#include "pxr/base/tf/regTest.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/unicodeUtils.h"

#include <algorithm>
#include <array>
#include <limits>
#include <string_view>

PXR_NAMESPACE_USING_DIRECTIVE

/// Exercises TfUtf8CodePoint directly: default construction, clamping of
/// out-of-range and surrogate values to the invalid (replacement) code point,
/// and stringification through TfStringify.
///
/// Returns true when every axiom holds; TF_AXIOM reports any failure.
static bool
TestUtf8CodePoint()
{
    {
        // Test default behavior
        TF_AXIOM(TfUtf8CodePoint{} == TfUtf8InvalidCodePoint);
    }
    {
        // Test boundary conditions
        TF_AXIOM(TfUtf8CodePoint{0}.AsUInt32() == 0);
        TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::MaximumValue}.AsUInt32() ==
                 TfUtf8CodePoint::MaximumValue);
        TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::MaximumValue + 1} ==
                 TfUtf8InvalidCodePoint);
        TF_AXIOM(
            TfUtf8CodePoint{std::numeric_limits<uint32_t>::max()} ==
            TfUtf8InvalidCodePoint);

        // Values immediately outside the surrogate range are preserved.
        TF_AXIOM(TfUtf8CodePoint{
                     TfUtf8CodePoint::SurrogateRange.first - 1}.AsUInt32() ==
                 TfUtf8CodePoint::SurrogateRange.first - 1);
        TF_AXIOM(TfUtf8CodePoint{
                     TfUtf8CodePoint::SurrogateRange.second + 1}.AsUInt32() ==
                 TfUtf8CodePoint::SurrogateRange.second + 1);

        // Both ends and the midpoint of the surrogate range are replaced.
        TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::SurrogateRange.first} ==
                 TfUtf8InvalidCodePoint);
        TF_AXIOM(TfUtf8CodePoint{TfUtf8CodePoint::SurrogateRange.second} ==
                 TfUtf8InvalidCodePoint);
        TF_AXIOM(TfUtf8CodePoint{
                     (TfUtf8CodePoint::SurrogateRange.second +
                      TfUtf8CodePoint::SurrogateRange.first) / 2} ==
                 TfUtf8InvalidCodePoint);
    }
    {
        // Test TfStringify
        //
        // The multi-byte expectations are spelled as explicit UTF-8 byte
        // escapes so they do not depend on the encoding of this source file.
        TF_AXIOM(TfStringify(TfUtf8CodePoint(97)) == "a");
        // 8747 == U+222B (INTEGRAL), encoded as E2 88 AB.
        TF_AXIOM(TfStringify(TfUtf8CodePoint(8747)) == "\xe2\x88\xab");
        // The invalid code point is U+FFFD, encoded as EF BF BD.
        TF_AXIOM(TfStringify(TfUtf8InvalidCodePoint) == "\xef\xbf\xbd");
        TF_AXIOM(TfStringify(TfUtf8CodePoint{}) ==
                 TfStringify(TfUtf8InvalidCodePoint));
    }
    return true;
}

static bool
TestUtf8CodePointView()
{
Expand All @@ -53,7 +99,7 @@ TestUtf8CodePointView()
TF_AXIOM(i1 == std::cend(u1));

for (const uint32_t codePoint : u1) {
TF_AXIOM(codePoint != TfUtf8CodePointIterator::INVALID_CODE_POINT);
TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32());
}
}

Expand All @@ -67,7 +113,7 @@ TestUtf8CodePointView()
TF_AXIOM(i2 == std::cend(u2));

for (const uint32_t codePoint : u2) {
TF_AXIOM(codePoint != TfUtf8CodePointIterator::INVALID_CODE_POINT);
TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32());
}
}

Expand Down Expand Up @@ -98,7 +144,7 @@ TestUtf8CodePointView()
TF_AXIOM(i3b == std::cend(u3));

for (const uint32_t codePoint : u3) {
TF_AXIOM(codePoint != TfUtf8CodePointIterator::INVALID_CODE_POINT);
TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32());
}

}
Expand All @@ -112,8 +158,8 @@ TestUtf8CodePointView()

std::array<uint32_t, 5> codePoints{0};
const std::array<uint32_t, 5> expectedCodePoints{{
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, 0x62,
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63}};
TfUtf8InvalidCodePoint.AsUInt32(), 0x61, 0x62,
TfUtf8InvalidCodePoint.AsUInt32(), 0x63}};
std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
TF_AXIOM(codePoints == expectedCodePoints);
}
Expand All @@ -127,16 +173,50 @@ TestUtf8CodePointView()

std::array<uint32_t, 7> codePoints{0};
const std::array<uint32_t, 7> expectedCodePoints{{
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61,
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x62,
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63,
TfUtf8CodePointIterator::INVALID_CODE_POINT}};
TfUtf8InvalidCodePoint.AsUInt32(), 0x61,
TfUtf8InvalidCodePoint.AsUInt32(), 0x62,
TfUtf8InvalidCodePoint.AsUInt32(), 0x63,
TfUtf8InvalidCodePoint.AsUInt32()}};
std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
TF_AXIOM(codePoints == expectedCodePoints);
}
return true;
}

/// Round-trip check: each code point outside the surrogate range, from 0 up
/// to TfUtf8CodePoint::MaximumValue, is stringified and then decoded again
/// through TfUtf8CodePointView. The view must yield exactly one code point
/// equal to the starting value.
static bool
TestUtf8CodePointReflection()
{
    for (uint32_t raw = 0; raw <= TfUtf8CodePoint::MaximumValue; raw++) {
        // Surrogates are covered by TestUtf8CodePointSurrogateRange.
        const bool isSurrogate =
            (raw >= TfUtf8CodePoint::SurrogateRange.first) &&
            (raw <= TfUtf8CodePoint::SurrogateRange.second);
        if (isSurrogate) {
            continue;
        }

        const TfUtf8CodePoint codePoint{raw};
        TF_AXIOM(codePoint.AsUInt32() == raw);

        const std::string encoded{TfStringify(codePoint)};
        const auto view = TfUtf8CodePointView{encoded};

        // The view must hold one code point, matching the input, and
        // nothing after it.
        auto cursor = std::cbegin(view);
        TF_AXIOM(cursor != std::cend(view));
        TF_AXIOM(*cursor == codePoint.AsUInt32());
        ++cursor;
        TF_AXIOM(cursor == std::cend(view));
    }
    return true;
}

/// Ensure that the surrogate range is replaced with the invalid character
static bool
TestUtf8CodePointSurrogateRange()
{
for (uint32_t value = TfUtf8CodePoint::SurrogateRange.first;
value <= TfUtf8CodePoint::SurrogateRange.second; value++) {
const TfUtf8CodePoint surrogateCodePoint{value};
TF_AXIOM(surrogateCodePoint == TfUtf8InvalidCodePoint);
TF_AXIOM(TfStringify(surrogateCodePoint) ==
TfStringify(TfUtf8InvalidCodePoint));
}
return true;
}

static bool
TestCharacterClasses()
{
Expand Down Expand Up @@ -301,7 +381,11 @@ TestCharacterClasses()
/// Entry point for the TfUnicodeUtils regression test.
///
/// Runs every sub-test in order and stops at the first one that returns
/// false. A single return statement is used so that all sub-tests are
/// reachable.
static bool
Test_TfUnicodeUtils()
{
    return TestUtf8CodePoint() &&
           TestUtf8CodePointView() &&
           TestCharacterClasses() &&
           TestUtf8CodePointReflection() &&
           TestUtf8CodePointSurrogateRange();
}

TF_ADD_REGTEST(TfUnicodeUtils);
61 changes: 49 additions & 12 deletions pxr/base/tf/unicodeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,50 @@

PXR_NAMESPACE_OPEN_SCOPE

/// Writes \a codePoint to \a stream as its UTF-8 encoding (one to four
/// bytes, chosen by the magnitude of the value) and returns the stream.
/// A value of 0x110000 or above is written as TfUtf8InvalidCodePoint instead.
std::ostream&
operator<<(std::ostream& stream, const TfUtf8CodePoint codePoint)
{
    const uint32_t cp = codePoint.AsUInt32();

    // Leading byte: the length marker OR'd with the top bits of the value.
    const auto putLead = [&stream, cp](const uint32_t marker,
                                       const unsigned shift) {
        stream << static_cast<char>(
            static_cast<unsigned char>((cp >> shift) | marker));
    };
    // Continuation byte: 0b10xxxxxx carrying six bits of the value.
    const auto putTrail = [&stream, cp](const unsigned shift) {
        stream << static_cast<char>(
            static_cast<unsigned char>(((cp >> shift) & 0x3f) | 0x80));
    };

    if (cp >= 0x110000) {
        stream << TfUtf8InvalidCodePoint;
        return stream;
    }

    if (cp < 0x80) {
        // 1-byte UTF-8 encoding
        stream << static_cast<char>(cp);
    } else if (cp < 0x800) {
        // 2-byte UTF-8 encoding
        putLead(0xc0, 6);
        putTrail(0);
    } else if (cp < 0x10000) {
        // 3-byte UTF-8 encoding
        putLead(0xe0, 12);
        putTrail(6);
        putTrail(0);
    } else {
        // 4-byte UTF-8 encoding
        putLead(0xf0, 18);
        putTrail(12);
        putTrail(6);
        putTrail(0);
    }
    return stream;
}

uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
{
// determine what encoding length the character is
_EncodingLength encodingLength = this->_GetEncodingLength();
if (encodingLength > std::distance(_it, _end)) {
// error condition, would read bytes past the end of the range
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
if (encodingLength == 1)
{
Expand All @@ -49,12 +86,12 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
if (byte1 < static_cast<unsigned char>('\xc2') ||
byte1 > static_cast<unsigned char>('\xdf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}

// the code point is constructed from the last 5 bits of byte1
Expand All @@ -77,7 +114,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
}
else if ((byte1 >= static_cast<unsigned char>('\xe1') &&
Expand All @@ -92,7 +129,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
}
else if (byte1 == static_cast<unsigned char>('\xed'))
Expand All @@ -104,13 +141,13 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
}
else
{
// byte 1 invalid
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}

// code point is constructed from the last 4 bits of byte1
Expand All @@ -137,7 +174,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
}
else if (byte1 >= static_cast<unsigned char>('\xf1') &&
Expand All @@ -153,7 +190,7 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
}
else if (byte1 == static_cast<unsigned char>('\xf4'))
Expand All @@ -168,21 +205,21 @@ uint32_t TfUtf8CodePointIterator::_GetCodePoint() const
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}
}
else
{
// byte 1 is invalid
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}

// code point is constructed from the last 3 bits of byte 1
// and the last 6 bits of bytes 2, 3, and 4
return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) +
((byte3 & 0x3f) << 6) + (byte4 & 0x3f);
}
return INVALID_CODE_POINT;
return TfUtf8InvalidCodePoint.AsUInt32();
}

PXR_NAMESPACE_CLOSE_SCOPE
57 changes: 52 additions & 5 deletions pxr/base/tf/unicodeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,60 @@
#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/unicodeCharacterClasses.h"

#include <ostream>
#include <string>
#include <string_view>

PXR_NAMESPACE_OPEN_SCOPE

/// Value type holding a single code point that can be encoded as UTF-8.
///
/// Any value that is above MaximumValue or inside SurrogateRange is stored
/// as ReplacementValue instead, so AsUInt32() never returns such a value.
class TfUtf8CodePoint {
public:
    /// Code points that cannot be decoded, or that fall outside of the valid
    /// range, may be replaced with this value.
    static constexpr uint32_t ReplacementValue = 0xFFFD;

    /// Largest value that is stored unchanged; anything higher is replaced
    /// with the replacement code point.
    static constexpr uint32_t MaximumValue = 0x10FFFF;

    /// Inclusive range of values that cannot be constructed; they are
    /// replaced with the replacement code point.
    static constexpr std::pair<uint32_t, uint32_t>
    SurrogateRange = {0xD800, 0xDFFF};

    /// Construct a code point initialized to the replacement value
    constexpr TfUtf8CodePoint() = default;

    /// Construct a code point from \a value, substituting the replacement
    /// value when \a value exceeds the maximum or lies in the surrogate
    /// range.
    constexpr explicit TfUtf8CodePoint(uint32_t value) :
        _value(_IsStorable(value) ? value : ReplacementValue) {}

    /// Returns the stored code point value.
    constexpr uint32_t AsUInt32() const { return _value; }

    /// Two code points are equal when their stored values are equal.
    friend constexpr bool operator==(const TfUtf8CodePoint left,
                                     const TfUtf8CodePoint right) {
        return left._value == right._value;
    }
    friend constexpr bool operator!=(const TfUtf8CodePoint left,
                                     const TfUtf8CodePoint right) {
        return !(left == right);
    }

private:
    // True when \a value is at most MaximumValue and not a surrogate.
    static constexpr bool _IsStorable(const uint32_t value) {
        if (value > MaximumValue) {
            return false;
        }
        return !((value >= SurrogateRange.first) &&
                 (value <= SurrogateRange.second));
    }

    uint32_t _value{ReplacementValue};
};

/// Stream insertion for a code point: writes its UTF-8 encoded byte sequence
/// (one to four bytes, depending on the value) to the stream.
TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint);

/// The replacement code point can be used to signal that a code point could
/// not be decoded and needed to be replaced. It is a default-constructed
/// TfUtf8CodePoint, i.e. it holds TfUtf8CodePoint::ReplacementValue.
constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{};

class TfUtf8CodePointIterator;

/// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over
Expand All @@ -49,7 +98,7 @@ class TfUtf8CodePointIterator;
/// std::string value{"∫dx"};
/// TfUtf8CodePointView view{value};
/// for (const uint32_t codePoint : view) {
/// if (codePoint == TfTfUtf8CodePointIterator::INVALID_CODE_POINT) {
/// if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) {
/// TF_WARN("String cannot be decoded.");
/// }
/// }
Expand Down Expand Up @@ -125,11 +174,9 @@ class TfUtf8CodePointIterator final {
using pointer = void;
using reference = uint32_t;

static constexpr uint32_t INVALID_CODE_POINT = 0xFFFD;

/// Retrieves the next UTF-8 character in the sequence as its Unicode
/// code point value. Returns INVALID_CODE_POINT when the byte sequence
/// pointed to by the iterator cannot be decoded.
/// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the
/// byte sequence pointed to by the iterator cannot be decoded.
///
/// If during read of the UTF-8 character sequence the underlying
/// string iterator would go beyond \a end defined at construction
Expand Down

0 comments on commit 9a337ca

Please sign in to comment.