Merge pull request #6 from jeffro256/jamtis_base32_sm

common: add Jamtis base32 encoding
seraphis-migration · Sep 26, 2023 · d7e89f7 · d7e89f7
2 parents 0a14382 + ad1cb23
commit d7e89f7
Show file tree

Hide file tree

Showing 6 changed files with 997 additions and 0 deletions.
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
@@ -29,6 +29,7 @@
 include_directories(SYSTEM ${OPENSSL_INCLUDE_DIR})
 
 set(common_sources
+  base32.cpp
   base58.cpp
   command_line.cpp
   dns_utils.cpp

diff --git a/src/common/base32.cpp b/src/common/base32.cpp
@@ -0,0 +1,249 @@
+// Copyright (c) 2023, The Monero Project
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are
+// permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of
+//    conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list
+//    of conditions and the following disclaimer in the documentation and/or other
+//    materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors may be
+//    used to endorse or promote products derived from this software without specific
+//    prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base32.h"
+
+#include <cstring>
+#include <limits>
+#include <stdexcept>
+
+// you might see a lot of the syntax a / y * x + a % y * x / y used in the code below.
+// this is an equivalent way to write the expression a * x / y, but without overflowing.
+
+namespace
+{
+static constexpr const size_t ENCODED_MAX = static_cast<size_t>(std::numeric_limits<ssize_t>::max());
+static constexpr const size_t RAW_MAX = ENCODED_MAX / 8 * 5 + ENCODED_MAX % 8 * 5 / 8;
+
+//--------------------------------------------------------------------------------------------------
+template <bool ALLOW_PARTIAL> // ALLOW_PARTIAL=false is faster b/c branches are trimmed
+static void encode_block(const char *binary, const size_t binary_len, char *encoded, const base32::Mode mode)
+{
+    // this function looks complicated, but it's just the handwritten bit smashing operations for
+    // a block of 5 binary bytes / 8 base32 symbols with `if` branches inserted to exit when
+    // applicable. we encode bytes from left to right, from the MSB in each byte to the LSB. notice
+    // that when mode == binary_lossy, we don't encode parts of bytes at the tail, we return early.
+    // otherwise, when mode == encoded_lossy, we take the bits we can from the tail byte and use it
+    // as the MSB of the alphabet index to the last symbol.
+    using namespace base32;
+    if (ALLOW_PARTIAL && 0 == binary_len) return;
+    encoded[0] = JAMTIS_ALPHABET[(binary[0] & 0b11111000) >> 3];
+    if (ALLOW_PARTIAL && 1 == binary_len)
+    {
+        if (mode == base32::Mode::binary_lossy) { return; }
+        else { encoded[1] = JAMTIS_ALPHABET[(binary[0] & 0b00000111) << 2]; return; }
+    }
+    encoded[1] = JAMTIS_ALPHABET[((binary[0] & 0b00000111) << 2) | ((binary[1] & 0b11000000) >> 6)];
+    encoded[2] = JAMTIS_ALPHABET[(binary[1] & 0b00111110) >> 1];
+    if (ALLOW_PARTIAL && 2 == binary_len)
+    {
+        if (mode == base32::Mode::binary_lossy) { return; }
+        else { encoded[3] = JAMTIS_ALPHABET[(binary[1] & 0b00000001) << 4]; return; }
+    }
+    encoded[3] = JAMTIS_ALPHABET[((binary[1] & 0b00000001) << 4) | ((binary[2] & 0b11110000) >> 4)];
+    if (ALLOW_PARTIAL && 3 == binary_len)
+    {
+        if (mode == base32::Mode::binary_lossy) { return; }
+        else { encoded[4] = JAMTIS_ALPHABET[(binary[2] & 0b00001111) << 1]; return; }
+    }
+    encoded[4] = JAMTIS_ALPHABET[((binary[2] & 0b00001111) << 1) | ((binary[3] & 0b10000000) >> 7)];
+    encoded[5] = JAMTIS_ALPHABET[(binary[3] & 0b01111100) >> 2];
+    if (ALLOW_PARTIAL && 4 == binary_len)
+    {
+        if (mode == base32::Mode::binary_lossy) { return; }
+        else { encoded[6] = JAMTIS_ALPHABET[(binary[3] & 0b00000011) << 3]; return; }
+    }
+    encoded[6] = JAMTIS_ALPHABET[((binary[3] & 0b00000011) << 3) | ((binary[4] & 0b11100000) >> 5)];
+    encoded[7] = JAMTIS_ALPHABET[(binary[4] & 0b00011111)];
+}
+//--------------------------------------------------------------------------------------------------
+[[noreturn]] void throw_by_err_code(const base32::Error err)
+{
+    switch (err)
+    {
+    case base32::Error::invalid_char:
+        throw std::runtime_error("invalid base32 character encountered in encoded string");
+    case base32::Error::not_enough_space:
+        throw std::runtime_error("not enough buffer space provided for base32 operation");
+    default:
+        throw std::logic_error("unexpected base32 error code");
+    }
+}
+} // anonymous namespace
+//--------------------------------------------------------------------------------------------------
+//--------------------------------------------------------------------------------------------------
+namespace base32
+{
+//--------------------------------------------------------------------------------------------------
+const char JAMTIS_ALPHABET[32] =
+{
+    'x', 'm', 'r', 'b', 'a', 's', 'e', '3', '2', 'c', 'd', 'f', 'g', 'h', 'i', 'j',
+    'k', 'n', 'p', 'q', 't', 'u', 'w', 'y', '0', '1', '4', '5', '6', '7', '8', '9'
+};
+//--------------------------------------------------------------------------------------------------
+const unsigned char JAMTIS_INVERTED_ALPHABET[256] =
+{
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, IGNC, BADC, BADC,
+      24,   25,    8,    7,   26,   27,   28,   29,   30,   31, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC,    4,    3,    9,   10,    6,   11,   12,   13,   14,   15,   16,   25,    1,   17,   24,
+      18,   19,    2,    5,   20,   21,   21,   22,    0,   23,    8, BADC, BADC, BADC, BADC, BADC,
+    BADC,    4,    3,    9,   10,    6,   11,   12,   13,   14,   15,   16,   25,    1,   17,   24,
+      18,   19,    2,    5,   20,   21,   21,   22,    0,   23,    8, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC,
+    BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC, BADC
+};
+//--------------------------------------------------------------------------------------------------
+ssize_t encoded_size(const size_t binary_len, const Mode mode) noexcept
+{
+    if (binary_len > RAW_MAX)
+        return static_cast<ssize_t>(Error::not_enough_space);
+
+    const ssize_t rem5 = binary_len % 5;
+    const bool extra_tail = (mode == Mode::encoded_lossy) && rem5;
+    return binary_len / 5 * 8 + rem5 * 8 / 5 + extra_tail;
+}
+//--------------------------------------------------------------------------------------------------
+ssize_t decoded_size_max(const size_t encoded_len, const Mode mode) noexcept
+{
+    if (encoded_len > ENCODED_MAX)
+        return static_cast<ssize_t>(Error::not_enough_space);
+
+    const ssize_t rem8 = encoded_len % 8;
+    const bool extra_tail = (mode == Mode::binary_lossy) && rem8;
+    return encoded_len / 8 * 5 + rem8 * 5 / 8 + extra_tail;
+}
+//--------------------------------------------------------------------------------------------------
+ssize_t encode(epee::span<const char> binary_buf,
+    epee::span<char> encoded_str_out,
+    const Mode mode)
+{
+    const ssize_t actual_encoded_len = encoded_size(binary_buf.size(), mode);
+    if (actual_encoded_len < 0 || static_cast<size_t>(actual_encoded_len) > encoded_str_out.size())
+        return static_cast<ssize_t>(Error::not_enough_space);
+
+    while (binary_buf.size() >= 5)
+    {
+        // use encode_block<false> when we are encoding exactly 5 bytes
+        encode_block<false>(binary_buf.data(), binary_buf.size(), encoded_str_out.data(), mode);
+        binary_buf.remove_prefix(5);
+        encoded_str_out.remove_prefix(8);
+    }
+
+    // use encode_block<true> when encoding a partial block on the tail
+    encode_block<true>(binary_buf.data(), binary_buf.size(), encoded_str_out.data(), mode);
+
+    return actual_encoded_len;
+}
+//--------------------------------------------------------------------------------------------------
+std::string encode(const std::string &binary_buf, const Mode mode)
+{
+    ssize_t r = encoded_size(binary_buf.size(), mode);
+    if (r < 0)
+        throw_by_err_code(static_cast<Error>(r));
+    std::string enc(r, '\0');
+    if (0 > (r = encode(epee::to_span(binary_buf), {&enc[0], enc.size()}, mode)))
+        throw_by_err_code(static_cast<Error>(r));
+    if (r > (ssize_t) enc.size())
+        throw std::logic_error("base32::encode buffer overflow occurred. this should never happen");
+    enc.resize(r);
+    return enc;
+}
+//--------------------------------------------------------------------------------------------------
+ssize_t decode(const epee::span<const char> encoded_str,
+    epee::span<char> decoded_buf_out,
+    const Mode mode)
+{
+    size_t byte_offset = 0;
+    unsigned char bit_offset = 0;
+
+    if (encoded_str.size() > ENCODED_MAX)
+        return static_cast<ssize_t>(Error::not_enough_space);
+
+    // zero out resulting buffer since we only |= the buffer from here on out
+    memset(decoded_buf_out.data(), 0, decoded_buf_out.size());
+
+    for (size_t enc_i = 0; enc_i < encoded_str.size(); ++enc_i)
+    {
+        if (byte_offset >= decoded_buf_out.size())
+            return static_cast<ssize_t>(Error::not_enough_space);
+
+        // grab next alphabet index
+        const unsigned char v = JAMTIS_INVERTED_ALPHABET[static_cast<size_t>(encoded_str[enc_i])];
+        if (IGNC == v)
+            continue;
+        else if (v >= 32)
+            return static_cast<ssize_t>(Error::invalid_char);
+
+        // write symbol bits to current pointed-to byte
+        decoded_buf_out[byte_offset] |= v << 3 >> bit_offset;
+
+        // if we are in encoded lossy mode (default), then don't extend the binary buffer to write
+        // only part of a symbol, we can just end here
+        if (enc_i == encoded_str.size() - 1 && mode == Mode::encoded_lossy)
+            return byte_offset + 1;
+
+        // step byte & bit pointers, and determine if any symbol bits wrap to the next byte
+        byte_offset += bit_offset >= 3;
+        const bool write_next_byte = bit_offset > 3;
+        bit_offset = (bit_offset + 5) & 7;
+
+        if (!write_next_byte)
+            continue;
+        else if (byte_offset >= decoded_buf_out.size())
+            return static_cast<ssize_t>(Error::not_enough_space);
+
+        // write wrapped symbol bits to next byte
+        decoded_buf_out[byte_offset] |= v << (8 - bit_offset);
+    }
+
+    return byte_offset + (bit_offset != 0);
+}
+//--------------------------------------------------------------------------------------------------
+std::string decode(const std::string &encoded_buf, const Mode mode)
+{
+    ssize_t r = decoded_size_max(encoded_buf.size(), mode); 
+    if (r < 0)
+        throw_by_err_code(static_cast<Error>(r));
+    std::string dec(r, '\0');
+    if (0 > (r = decode(epee::to_span(encoded_buf), {&dec[0], dec.size()}, mode)))
+        throw_by_err_code(static_cast<Error>(r));
+    if(r > (ssize_t) dec.size())
+        throw std::logic_error("base32::encode buffer overflow occurred. this should never happen");
+    dec.resize(r);
+    return dec;
+}
+//--------------------------------------------------------------------------------------------------
+} // namespace base32
diff --git a/src/common/base32.h b/src/common/base32.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2023, The Monero Project
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are
+// permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of
+//    conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list
+//    of conditions and the following disclaimer in the documentation and/or other
+//    materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors may be
+//    used to endorse or promote products derived from this software without specific
+//    prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file Encode/Decode using Jamtis base32 encoding
+ * 
+ * We use the alphabet "xmrbase32cdfghijknpqtuwy01456789"
+ * 
+ * This alphabet was selected for the following reasons:
+ *     1. To have a unique prefix that distinguishes the encoding from other variants of "base32"
+ *     2. To contain all digits 0-9, allowing numeric values to be encoded in a human readable form
+ *     3. To normalize the symbols o->0, l->1, v->u and z->2 for human transcription correction
+ *
+ * Hypens can be used to space base32 encoded strings, and are ignored during the decoding process.
+*/
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "span.h"
+
+namespace base32
+{
+enum class Error: ssize_t
+{
+    invalid_char = -1,    // encountered invalid character when decoding
+    not_enough_space = -2 // not enough space in pre-allocated buffers
+};
+
+enum class Mode
+{
+    encoded_lossy, // when decoding, discard odd encoded LSB bits left at end of tail (default).
+    binary_lossy   // when encoding, discard odd binary LSB bits left at end of tail.
+};
+
+// table of the base32 symbols, in Jamtis order
+extern const char JAMTIS_ALPHABET[32];
+
+// table that converts ascii character codes into base32 symbol indexes
+extern const unsigned char JAMTIS_INVERTED_ALPHABET[256];
+
+// constants in the inverted table that signal an ascii code is invalid or ignoreable, respectively
+static constexpr const unsigned char BADC = 255;
+static constexpr const unsigned char IGNC = 254;
+
+/**
+ * @brief calculate size of encoded string, returns not_enough_space if binary_len too big
+*/
+ssize_t encoded_size(const size_t binary_len, const Mode mode = Mode::encoded_lossy) noexcept;
+
+/**
+ * @brief calculate maximum size of decoded binary, returns not_enough_space if encoded_len too big
+ *        ("maximum" size because hypens are skipped over)
+*/
+ssize_t decoded_size_max(const size_t encoded_len, const Mode mode = Mode::encoded_lossy) noexcept;
+
+/**
+ * @brief encode a binary buffer into a base32 string
+ * @param binary_buf
+ * @param[out] encoded_str_out null terminator is not included
+ * @param mode
+ * @return the size of the encoded string, if successful, otherwise a negative Error enum value
+*/
+ssize_t encode(epee::span<const char> binary_buf,
+    epee::span<char> encoded_str_out,
+    const Mode mode = Mode::encoded_lossy);
+
+/**
+ * @brief encode a binary buffer into a base32 string
+ * @param binary_buf
+ * @param mode
+ * @return the encoded string
+*/
+std::string encode(const std::string &binary_buf, const Mode mode = Mode::encoded_lossy);
+
+/**
+ * @brief decode a base32 string into a binary buffer
+ * @param encoded_str
+ * @param[out] decoded_buf_out
+ * @param mode
+ * @return the size of the decoded buffer, if successful, otherwise a negative Error enum value
+*/
+ssize_t decode(epee::span<const char> encoded_str,
+    epee::span<char> decoded_buf_out,
+    const Mode mode = Mode::encoded_lossy);
+
+/**
+ * @brief decode a base32 string into a binary buffer
+ * @param encoded_buf
+ * @param mode
+ * @return the decoded buffer
+ * @throw if an invalid character is encountered
+*/
+std::string decode(const std::string &encoded_buf, const Mode mode = Mode::encoded_lossy);
+} // namespace base32