Skip to content

Commit

Permalink
refactor: move zero width replacement to a function
Browse files Browse the repository at this point in the history
  • Loading branch information
Nerixyz committed Sep 14, 2024
1 parent 3d06f86 commit d8d62ac
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 46 deletions.
17 changes: 2 additions & 15 deletions src/providers/recentmessages/Impl.cpp
Original file line number Diff line number Diff line change
@@ -1,22 +1,13 @@
#include "providers/recentmessages/Impl.hpp"

#include "common/Env.hpp"
#include "common/QLogging.hpp"
#include "messages/MessageBuilder.hpp"
#include "providers/twitch/IrcMessageHandler.hpp"
#include "providers/twitch/TwitchChannel.hpp"
#include "util/FormatTime.hpp"
#include "util/Helpers.hpp"

#include <QJsonArray>
#include <QUrlQuery>

namespace {

// File-local shorthand for `chatterinoRecentMessages` (presumably the logging
// category for this provider, declared in common/QLogging.hpp - TODO confirm).
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const auto &LOG = chatterinoRecentMessages;

} // namespace

namespace chatterino::recentmessages::detail {

// Parse the IRC messages returned in JSON form into Communi messages
Expand All @@ -33,11 +24,7 @@ std::vector<Communi::IrcMessage *> parseRecentMessages(

for (const auto &jsonMessage : jsonMessages)
{
auto content = jsonMessage.toString();

// For explanation of why this exists, see src/providers/twitch/TwitchChannel.hpp,
// where these constants are defined
content.replace(COMBINED_FIXER, ZERO_WIDTH_JOINER);
auto content = unescapeZeroWidthJoiner(jsonMessage.toString());

auto *message =
Communi::IrcMessage::fromData(content.toUtf8(), nullptr);
Expand Down
18 changes: 5 additions & 13 deletions src/providers/twitch/IrcMessageHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,15 +702,8 @@ void IrcMessageHandler::handlePrivMessage(Communi::IrcPrivateMessage *message,
}
}

// This is for compatibility with older Chatterino versions. Twitch didn't use
// to allow ZERO WIDTH JOINER unicode character, so Chatterino used ESCAPE_TAG
// instead.
// See https://github.com/Chatterino/chatterino2/issues/3384 and
// https://mm2pl.github.io/emoji_rfc.pdf for more details
this->addMessage(
message, chan,
message->content().replace(COMBINED_FIXER, ZERO_WIDTH_JOINER),
twitchServer, false, message->isAction());
this->addMessage(message, chan, unescapeZeroWidthJoiner(message->content()),
twitchServer, false, message->isAction());

if (message->tags().contains(u"pinned-chat-paid-amount"_s))
{
Expand Down Expand Up @@ -915,10 +908,9 @@ void IrcMessageHandler::handleWhisperMessage(Communi::IrcMessage *ircMessage)

auto *c = getApp()->getTwitch()->getWhispersChannel().get();

MessageBuilder builder(
c, ircMessage, args,
ircMessage->parameter(1).replace(COMBINED_FIXER, ZERO_WIDTH_JOINER),
false);
MessageBuilder builder(c, ircMessage, args,
unescapeZeroWidthJoiner(ircMessage->parameter(1)),
false);

if (builder.isIgnored())
{
Expand Down
18 changes: 0 additions & 18 deletions src/providers/twitch/TwitchChannel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,6 @@

namespace chatterino {

// This is for compatibility with older Chatterino versions. Twitch used to
// disallow the ZERO WIDTH JOINER Unicode character (U+200D), so Chatterino
// used ESCAPE_TAG (U+E0002) instead.
// See https://github.com/Chatterino/chatterino2/issues/3384 and
// https://mm2pl.github.io/emoji_rfc.pdf for more details
//
// NOTE(review): these are non-inline namespace-scope constants (plus an
// anonymous namespace) defined in a header, so presumably every translation
// unit that includes it gets its own copies - confirm.
const QString ZERO_WIDTH_JOINER = QString(QChar(0x200D));

// Here be MSVC: Do NOT replace with "\U" literal, it will fail silently.
// U+E0002 lies outside the BMP, so it is spelled out as its UTF-16 surrogate
// pair (high surrogate first, then low surrogate).
namespace {
const QChar ESCAPE_TAG_CHARS[2] = {QChar::highSurrogate(0xE0002),
QChar::lowSurrogate(0xE0002)};
}
// The two surrogates above combined into a single QString.
const QString ESCAPE_TAG = QString(ESCAPE_TAG_CHARS, 2);

// Matches an ESCAPE_TAG that is not immediately preceded by another
// ESCAPE_TAG (negative lookbehind), i.e. only the first tag of a run of
// consecutive tags.
const static QRegularExpression COMBINED_FIXER(
QString("(?<!%1)%1").arg(ESCAPE_TAG),
QRegularExpression::UseUnicodePropertiesOption);

enum class HighlightState;

struct Emote;
Expand Down
18 changes: 18 additions & 0 deletions src/util/Helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@
#include <QRegularExpression>
#include <QUuid>

namespace {

// Matches an ESCAPE TAG (U+E0002) that is not immediately preceded by another
// ESCAPE TAG, i.e. only the first tag in a run of consecutive tags.
// See https://mm2pl.github.io/emoji_rfc.pdf
// Note: MSVC requires /utf-8 for the \U escapes below.
const QRegularExpression ESCAPE_TAG_REGEX{
    QStringLiteral("(?<!\U000E0002)\U000E0002"),
    QRegularExpression::UseUnicodePropertiesOption};

// ZERO WIDTH JOINER (U+200D)
const QString ZERO_WIDTH_JOINER{QStringLiteral("\u200D")};

}  // namespace

namespace chatterino {

namespace helpers::detail {
Expand Down Expand Up @@ -283,4 +295,10 @@ bool compareEmoteStrings(const QString &a, const QString &b)
return k < 0;
}

// Replaces every match of ESCAPE_TAG_REGEX in `escaped` with
// ZERO_WIDTH_JOINER and returns the result.
//
// `escaped` is taken by value so the replacement can be done in place on the
// function's own copy before handing it back to the caller.
QString unescapeZeroWidthJoiner(QString escaped)
{
// QString::replace modifies the string it is called on.
escaped.replace(ESCAPE_TAG_REGEX, ZERO_WIDTH_JOINER);
return escaped;
}

} // namespace chatterino
7 changes: 7 additions & 0 deletions src/util/Helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,11 @@ constexpr std::optional<std::decay_t<T>> makeConditionedOptional(bool condition,
return std::nullopt;
}

/// @brief Unescapes zero width joiners (ZWJ; U+200D) in Twitch messages
///
/// Older Chatterino versions escape ZWJ with an ESCAPE TAG (U+E0002), following
/// https://mm2pl.github.io/emoji_rfc.pdf. This function replaces escape tags
/// with a ZWJ. In a run of consecutive escape tags only the first one is
/// replaced; the remaining tags and any ZWJ already present are left untouched.
/// See also: https://github.com/Chatterino/chatterino2/issues/3384.
///
/// @param escaped The message text, possibly containing escape tags
/// @return The text with escape tags replaced by ZWJ
QString unescapeZeroWidthJoiner(QString escaped);

} // namespace chatterino
56 changes: 56 additions & 0 deletions tests/src/Helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include "Test.hpp"

#include <span>

using namespace chatterino;
using namespace helpers::detail;

Expand Down Expand Up @@ -500,3 +502,57 @@ TEST(Helpers, parseDurationToSeconds)
<< c.output;
}
}

TEST(Helpers, unescapeZeroWidthJoiner)
{
    // Compile-time sanity check: the compiler has to encode \u and \U escapes
    // in UTF-16 literals as the expected code units, otherwise the table
    // below would not test what it claims to.
    static_assert(
        [] {
            constexpr std::span joiner = u"\u200D";
            static_assert(joiner.size() == 2);
            static_assert(joiner[0] == u'\x200D');
            static_assert(joiner[1] == u'\0');

            // U+E0002 is outside the BMP: surrogate pair + terminator
            constexpr std::span tag = u"\U000E0002";
            static_assert(tag.size() == 3);
            static_assert(tag[0] == u'\xDB40');
            static_assert(tag[1] == u'\xDC02');
            static_assert(tag[2] == u'\0');

            return true;
        }(),
        "The compiler must support Unicode string literals");

    struct Case {
        QStringView input;
        QStringView output;
    };

    const std::vector<Case> cases{
        // inputs without any escape tag stay as they are
        {u"foo bar", u"foo bar"},
        {u"", u""},
        {u"a", u"a"},
        // a lone escape tag becomes a ZWJ
        {u"\U000E0002", u"\u200D"},
        {u"foo\U000E0002bar", u"foo\u200Dbar"},
        {u"foo \U000E0002 bar", u"foo \u200D bar"},
        {u"\U0001F468\U000E0002\U0001F33E", u"\U0001F468\u200D\U0001F33E"},
        // an existing ZWJ is not touched
        {u"\U0001F468\u200D\U0001F33E", u"\U0001F468\u200D\U0001F33E"},
        // in a run of escape tags, only the first one is replaced
        {
            u"\U0001F468\U000E0002\U000E0002\U0001F33E",
            u"\U0001F468\u200D\U000E0002\U0001F33E",
        },
        {
            u"\U0001F468\U000E0002\U000E0002\U000E0002\U0001F33E",
            u"\U0001F468\u200D\U000E0002\U000E0002\U0001F33E",
        },
    };

    for (const auto &[input, expected] : cases)
    {
        const auto actual = unescapeZeroWidthJoiner(input.toString());

        EXPECT_EQ(actual, expected);
    }
}

0 comments on commit d8d62ac

Please sign in to comment.