Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

common: add Jamtis base32 encoding #6

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
include_directories(SYSTEM ${OPENSSL_INCLUDE_DIR})

set(common_sources
base32.cpp
base58.cpp
command_line.cpp
dns_utils.cpp
Expand Down
249 changes: 249 additions & 0 deletions src/common/base32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
// Copyright (c) 2023, The Monero Project
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are
// permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of
// conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list
// of conditions and the following disclaimer in the documentation and/or other
// materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors may be
// used to endorse or promote products derived from this software without specific
// prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "base32.h"

#include <cstring>
#include <limits>
#include <stdexcept>

// you might see a lot of the syntax a / y * x + a % y * x / y used in the code below.
// this is an equivalent way to write the expression a * x / y, but without overflowing.

namespace
{
static constexpr const size_t ENCODED_MAX = static_cast<size_t>(std::numeric_limits<ssize_t>::max());
static constexpr const size_t RAW_MAX = ENCODED_MAX / 8 * 5 + ENCODED_MAX % 8 * 5 / 8;

//--------------------------------------------------------------------------------------------------
template <bool ALLOW_PARTIAL> // ALLOW_PARTIAL=false is faster b/c branches are trimmed
static void encode_block(const char *binary, const size_t binary_len, char *encoded, const base32::Mode mode)
{
// this function looks complicated, but it's just the handwritten bit smashing operations for
// a block of 5 binary bytes / 8 base32 symbols with `if` branches inserted to exit when
// applicable. we encode bytes from left to right, from the MSB in each byte to the LSB. notice
// that when mode == binary_lossy, we don't encode parts of bytes at the tail, we return early.
// otherwise, when mode == encoded_lossy, we take the bits we can from the tail byte and use it
// as the MSB of the alphabet index to the last symbol.
using namespace base32;
if (ALLOW_PARTIAL && 0 == binary_len) return;
encoded[0] = JAMTIS_ALPHABET[(binary[0] & 0b11111000) >> 3];
if (ALLOW_PARTIAL && 1 == binary_len)
{
if (mode == base32::Mode::binary_lossy) { return; }
else { encoded[1] = JAMTIS_ALPHABET[(binary[0] & 0b00000111) << 2]; return; }
}
encoded[1] = JAMTIS_ALPHABET[((binary[0] & 0b00000111) << 2) | ((binary[1] & 0b11000000) >> 6)];
encoded[2] = JAMTIS_ALPHABET[(binary[1] & 0b00111110) >> 1];
if (ALLOW_PARTIAL && 2 == binary_len)
{
if (mode == base32::Mode::binary_lossy) { return; }
else { encoded[3] = JAMTIS_ALPHABET[(binary[1] & 0b00000001) << 4]; return; }
}
encoded[3] = JAMTIS_ALPHABET[((binary[1] & 0b00000001) << 4) | ((binary[2] & 0b11110000) >> 4)];
if (ALLOW_PARTIAL && 3 == binary_len)
{
if (mode == base32::Mode::binary_lossy) { return; }
else { encoded[4] = JAMTIS_ALPHABET[(binary[2] & 0b00001111) << 1]; return; }
}
encoded[4] = JAMTIS_ALPHABET[((binary[2] & 0b00001111) << 1) | ((binary[3] & 0b10000000) >> 7)];
encoded[5] = JAMTIS_ALPHABET[(binary[3] & 0b01111100) >> 2];
if (ALLOW_PARTIAL && 4 == binary_len)
{
if (mode == base32::Mode::binary_lossy) { return; }
else { encoded[6] = JAMTIS_ALPHABET[(binary[3] & 0b00000011) << 3]; return; }
}
encoded[6] = JAMTIS_ALPHABET[((binary[3] & 0b00000011) << 3) | ((binary[4] & 0b11100000) >> 5)];
encoded[7] = JAMTIS_ALPHABET[(binary[4] & 0b00011111)];
}
//--------------------------------------------------------------------------------------------------
[[noreturn]] void throw_by_err_code(const base32::Error err)
{
switch (err)
{
case base32::Error::invalid_char:
throw std::runtime_error("invalid base32 character encountered in encoded string");
case base32::Error::not_enough_space:
throw std::runtime_error("not enough buffer space provided for base32 operation");
default:
throw std::logic_error("unexpected base32 error code");
}
}
} // anonymous namespace
//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
namespace base32
{
//--------------------------------------------------------------------------------------------------
const char JAMTIS_ALPHABET[32] =
{
'x', 'm', 'r', 'b', 'a', 's', 'e', '3', '2', 'c', 'd', 'f', 'g', 'h', 'i', 'j',
'k', 'n', 'p', 'q', 't', 'u', 'w', 'y', '0', '1', '4', '5', '6', '7', '8', '9'
};
//--------------------------------------------------------------------------------------------------
const unsigned char JAMTIS_INVERTED_ALPHABET[256] =
{
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, IGNC, BADC, BADC,
24, 25, 8, 7, 26, 27, 28, 29, 30, 31, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, 4, 3, 9, 10, 6, 11, 12, 13, 14, 15, 16, 25, 1, 17, 24,
18, 19, 2, 5, 20, 21, 21, 22, 0, 23, 8, BADC, BADC, BADC, BADC, BADC,
BADC, 4, 3, 9, 10, 6, 11, 12, 13, 14, 15, 16, 25, 1, 17, 24,
18, 19, 2, 5, 20, 21, 21, 22, 0, 23, 8, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC
};
//--------------------------------------------------------------------------------------------------
ssize_t encoded_size(const size_t binary_len, const Mode mode) noexcept
{
if (binary_len > RAW_MAX)
return static_cast<ssize_t>(Error::not_enough_space);

const ssize_t rem5 = binary_len % 5;
const bool extra_tail = (mode == Mode::encoded_lossy) && rem5;
return binary_len / 5 * 8 + rem5 * 8 / 5 + extra_tail;
}
//--------------------------------------------------------------------------------------------------
ssize_t decoded_size_max(const size_t encoded_len, const Mode mode) noexcept
{
if (encoded_len > ENCODED_MAX)
return static_cast<ssize_t>(Error::not_enough_space);

const ssize_t rem8 = encoded_len % 8;
const bool extra_tail = (mode == Mode::binary_lossy) && rem8;
return encoded_len / 8 * 5 + rem8 * 5 / 8 + extra_tail;
}
//--------------------------------------------------------------------------------------------------
ssize_t encode(epee::span<const char> binary_buf,
epee::span<char> encoded_str_out,
const Mode mode)
{
const ssize_t actual_encoded_len = encoded_size(binary_buf.size(), mode);
if (actual_encoded_len < 0 || static_cast<size_t>(actual_encoded_len) > encoded_str_out.size())
return static_cast<ssize_t>(Error::not_enough_space);

while (binary_buf.size() >= 5)
{
// use encode_block<false> when we are encoding exactly 5 bytes
encode_block<false>(binary_buf.data(), binary_buf.size(), encoded_str_out.data(), mode);
binary_buf.remove_prefix(5);
encoded_str_out.remove_prefix(8);
}

// use encode_block<true> when encoding a partial block on the tail
encode_block<true>(binary_buf.data(), binary_buf.size(), encoded_str_out.data(), mode);

return actual_encoded_len;
}
//--------------------------------------------------------------------------------------------------
std::string encode(const std::string &binary_buf, const Mode mode)
{
ssize_t r = encoded_size(binary_buf.size(), mode);
if (r < 0)
throw_by_err_code(static_cast<Error>(r));
std::string enc(r, '\0');
if (0 > (r = encode(epee::to_span(binary_buf), {&enc[0], enc.size()}, mode)))
throw_by_err_code(static_cast<Error>(r));
if (r > (ssize_t) enc.size())
throw std::logic_error("base32::encode buffer overflow occurred. this should never happen");
enc.resize(r);
return enc;
}
//--------------------------------------------------------------------------------------------------
ssize_t decode(const epee::span<const char> encoded_str,
epee::span<char> decoded_buf_out,
const Mode mode)
{
size_t byte_offset = 0;
unsigned char bit_offset = 0;

if (encoded_str.size() > ENCODED_MAX)
return static_cast<ssize_t>(Error::not_enough_space);

// zero out resulting buffer since we only |= the buffer from here on out
memset(decoded_buf_out.data(), 0, decoded_buf_out.size());

for (size_t enc_i = 0; enc_i < encoded_str.size(); ++enc_i)
{
if (byte_offset >= decoded_buf_out.size())
return static_cast<ssize_t>(Error::not_enough_space);

// grab next alphabet index
const unsigned char v = JAMTIS_INVERTED_ALPHABET[static_cast<size_t>(encoded_str[enc_i])];
if (IGNC == v)
continue;
else if (v >= 32)
return static_cast<ssize_t>(Error::invalid_char);

// write symbol bits to current pointed-to byte
decoded_buf_out[byte_offset] |= v << 3 >> bit_offset;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why the << 3 here. This should shift the first value by 3 (left), and I don't see how that could be accurate.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since MSBs are encoded "before" LSBs, we shift the 5-bit alphabet index up 3 to align it with the first bit in the byte, the MSB, then we shift it according to the bit_offset.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a design choice where we could've encoded the LSBs before the MSBs, and not needed the << 3 but I like MSB->LSB type of encoding because it makes more sense for humans when you convert the raw data into a binary string.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's also what most base32 libraries do anyways.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, looked at the encoding algorithm to see what you did. I suppose it doesn't matter, as long as its consistent behavior.


// if we are in encoded lossy mode (default), then don't extend the binary buffer to write
// only part of a symbol, we can just end here
if (enc_i == encoded_str.size() - 1 && mode == Mode::encoded_lossy)
return byte_offset + 1;

// step byte & bit pointers, and determine if any symbol bits wrap to the next byte
byte_offset += bit_offset >= 3;
const bool write_next_byte = bit_offset > 3;
bit_offset = (bit_offset + 5) & 7;

if (!write_next_byte)
continue;
else if (byte_offset >= decoded_buf_out.size())
return static_cast<ssize_t>(Error::not_enough_space);

// write wrapped symbol bits to next byte
decoded_buf_out[byte_offset] |= v << (8 - bit_offset);
}

return byte_offset + (bit_offset != 0);
}
//--------------------------------------------------------------------------------------------------
std::string decode(const std::string &encoded_buf, const Mode mode)
{
ssize_t r = decoded_size_max(encoded_buf.size(), mode);
if (r < 0)
throw_by_err_code(static_cast<Error>(r));
std::string dec(r, '\0');
if (0 > (r = decode(epee::to_span(encoded_buf), {&dec[0], dec.size()}, mode)))
throw_by_err_code(static_cast<Error>(r));
if(r > (ssize_t) dec.size())
throw std::logic_error("base32::encode buffer overflow occurred. this should never happen");
dec.resize(r);
return dec;
}
//--------------------------------------------------------------------------------------------------
} // namespace base32
123 changes: 123 additions & 0 deletions src/common/base32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Copyright (c) 2023, The Monero Project
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are
// permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of
// conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list
// of conditions and the following disclaimer in the documentation and/or other
// materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors may be
// used to endorse or promote products derived from this software without specific
// prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/**
* @file Encode/Decode using Jamtis base32 encoding
*
* We use the alphabet "xmrbase32cdfghijknpqtuwy01456789"
*
* This alphabet was selected for the following reasons:
* 1. To have a unique prefix that distinguishes the encoding from other variants of "base32"
* 2. To contain all digits 0-9, allowing numeric values to be encoded in a human readable form
* 3. To normalize the symbols o->0, l->1, v->u and z->2 for human transcription correction
*
* Hypens can be used to space base32 encoded strings, and are ignored during the decoding process.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <string>

#include "span.h"

namespace base32
{
enum class Error: ssize_t
{
invalid_char = -1, // encountered invalid character when decoding
not_enough_space = -2 // not enough space in pre-allocated buffers
};

enum class Mode
{
encoded_lossy, // when decoding, discard odd encoded LSB bits left at end of tail (default).
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When is lossy useful? And how to select not lossy?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mode::binary_lossy in useful for encoding exact blocks of 5 bits so that the encoded base32 string isn't as long. For example, Jamtis address body sizes will be an odd number of bits long, not divisible by 8. Thus, we can make the encoded string one byte shorter since there's leftover bits in the binary that we aren't using.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You select binary_lossy by passing it as the mode parameter in each function.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it was expected when divisible by 5-bits that no additional values would be appended. I guess not. But then how does someone select lossless mode? There is no enum for it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although it isn't explicit, almost all base32 libraries take the "encoded lossy" approach which preserves every bit in the raw data and discards extraneous encoded string bits, since that's the expected behavior 90% of the time. That's the default behavior for this code too, but now you have the option.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But then how does someone select lossless mode?

You could make a lossless mode if and only if you forced the user to only encode raw data for which the byte length is divisible by 5, and decode encoded strings of which the length is divisible by 8.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could be added, although I don't know when that would be useful.

binary_lossy // when encoding, discard odd binary LSB bits left at end of tail.
};

// table of the base32 symbols, in Jamtis order
extern const char JAMTIS_ALPHABET[32];
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these tables really need to be exported? Can they be local to the cpp?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are exported so they can be used by the base32 checksum PR #7 as default tables.


// table that converts ascii character codes into base32 symbol indexes
extern const unsigned char JAMTIS_INVERTED_ALPHABET[256];

// constants in the inverted table that signal an ascii code is invalid or ignoreable, respectively
static constexpr const unsigned char BADC = 255;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same with these constants, why export them?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are exported so they can be used by the base32 checksum PR #7 as default tables.

static constexpr const unsigned char IGNC = 254;

/**
* @brief calculate size of encoded string, returns not_enough_space if binary_len too big
*/
ssize_t encoded_size(const size_t binary_len, const Mode mode = Mode::encoded_lossy) noexcept;

/**
* @brief calculate maximum size of decoded binary, returns not_enough_space if encoded_len too big
* ("maximum" size because hypens are skipped over)
*/
ssize_t decoded_size_max(const size_t encoded_len, const Mode mode = Mode::encoded_lossy) noexcept;

/**
* @brief encode a binary buffer into a base32 string
* @param binary_buf
* @param[out] encoded_str_out null terminator is not included
* @param mode
* @return the size of the encoded string, if successful, otherwise a negative Error enum value
*/
ssize_t encode(epee::span<const char> binary_buf,
epee::span<char> encoded_str_out,
const Mode mode = Mode::encoded_lossy);

/**
* @brief encode a binary buffer into a base32 string
* @param binary_buf
* @param mode
* @return the encoded string
*/
std::string encode(const std::string &binary_buf, const Mode mode = Mode::encoded_lossy);

/**
* @brief decode a base32 string into a binary buffer
* @param encoded_str
* @param[out] decoded_buf_out
* @param mode
* @return the size of the decoded buffer, if successful, otherwise a negative Error enum value
*/
ssize_t decode(epee::span<const char> encoded_str,
epee::span<char> decoded_buf_out,
const Mode mode = Mode::encoded_lossy);

/**
* @brief decode a base32 string into a binary buffer
* @param encoded_buf
* @param mode
* @return the decoded buffer
* @throw if an invalid character is encountered
*/
std::string decode(const std::string &encoded_buf, const Mode mode = Mode::encoded_lossy);
} // namespace base32
Loading