Skip to content

Commit

Permalink
[DAPHNE-#629] Efficient processing of string data in DenseMatrix (#797)
Browse files Browse the repository at this point in the history
This commit addresses issue #629 by enhancing the string support in the DAPHNE runtime, making it practical to process string data sets. The main addition is generalizing or specializing the existing template structures for the FixedStr16 and std::string classes. Generalizations include, e.g., using std::fill()/std::copy() instead of memset()/memcpy() and using the newly introduced ValueTypeUtils::defaultValue<VT> instead of 0.

Key Features Implemented
- Focus on the runtime part, i.e., data structures and kernels; no DaphneDSL/compiler integration yet.
- FixedStr16 Class: A fixed-size string class with a 16-character buffer.
- DenseMatrix for string value type: Generalize the current DenseMatrix class to support string data.
- I/O Operations: Reading CSV files containing string columns.
- Kernels on string-valued matrices.
  - Convert string matrices to numeric:
    - oneHot: Applies one-hot-encoding to the given (n x m) matrix of strings.
    - recode: Applies dictionary encoding to the given (n x 1) matrix.
    - cast: String value and matrix objects can be cast to a particular numeric type.
  - Comparison operations:
    - Element-wise binary operators for comparing DenseMatrix<std::string> and/or DenseMatrix<FixedStr16>.
  - Elementwise unary/binary string operations:
    - Element-wise unary operators for converting all strings in a matrix to lower/upper case.
    - Element-wise binary operators for concatenating all corresponding strings in two matrices.
  - Source operations:
    - fill: Creates a matrix and sets all elements to a particular value.
  - Reorganization:
    - transpose: Transposes a given matrix.

Testing
- Initial unit tests for the DenseMatrix<std::string> and DenseMatrix<FixedStr16> have been implemented, verifying functionality for newly added features and data types.
  • Loading branch information
saminbassiri authored Oct 17, 2024
1 parent d9c75ef commit 6720539
Show file tree
Hide file tree
Showing 30 changed files with 1,184 additions and 33 deletions.
2 changes: 1 addition & 1 deletion src/runtime/local/datagen/GenGivenVals.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ template <typename VT> struct GenGivenVals<DenseMatrix<VT>> {
"divisible by given number of rows");
const size_t numCols = numCells / numRows;
auto res = DataObjectFactory::create<DenseMatrix<VT>>(numRows, numCols, false);
memcpy(res->getValues(), elements.data(), numCells * sizeof(VT));
std::copy(elements.begin(), elements.end(), res->getValues());
return res;
}
};
Expand Down
5 changes: 4 additions & 1 deletion src/runtime/local/datastructures/DenseMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,9 @@ DenseMatrix<ValueType>::DenseMatrix(size_t maxNumRows, size_t numCols, bool zero
} else {
AllocationDescriptorHost myHostAllocInfo;
alloc_shared_values();

if (zero)
memset(values.get(), 0, maxNumRows * numCols * sizeof(ValueType));
std::fill(values.get(), values.get() + maxNumRows * numCols, ValueTypeUtils::defaultValue<ValueType>);
new_data_placement = this->mdo->addDataPlacement(&myHostAllocInfo);
}
this->mdo->addLatest(new_data_placement->dp_id);
Expand Down Expand Up @@ -341,3 +342,5 @@ template class DenseMatrix<unsigned char>;
template class DenseMatrix<unsigned int>;
template class DenseMatrix<unsigned long>;
template class DenseMatrix<bool>;
template class DenseMatrix<std::string>;
template class DenseMatrix<FixedStr16>;
30 changes: 16 additions & 14 deletions src/runtime/local/datastructures/DenseMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,22 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
if (rowSkip == numCols || lastAppendedRowIdx == rowIdx) {
const size_t startPosIncl = pos(lastAppendedRowIdx, lastAppendedColIdx) + 1;
const size_t endPosExcl = pos(rowIdx, colIdx);

if (startPosIncl < endPosExcl)
memset(values.get() + startPosIncl, 0, (endPosExcl - startPosIncl) * sizeof(ValueType));
std::fill(values.get() + startPosIncl, values.get() + endPosExcl,
ValueTypeUtils::defaultValue<ValueType>);
} else {
auto v = values.get() + lastAppendedRowIdx * rowSkip;
memset(v + lastAppendedColIdx + 1, 0, (numCols - lastAppendedColIdx - 1) * sizeof(ValueType));
std::fill(v + lastAppendedColIdx + 1, v + numCols, ValueTypeUtils::defaultValue<ValueType>);

v += rowSkip;

for (size_t r = lastAppendedRowIdx + 1; r < rowIdx; r++) {
memset(v, 0, numCols * sizeof(ValueType));
std::fill(v, v + numCols, ValueTypeUtils::defaultValue<ValueType>);
v += rowSkip;
}
if (colIdx)
memset(v, 0, (colIdx - 1) * sizeof(ValueType));
std::fill(v, v + colIdx - 1, ValueTypeUtils::defaultValue<ValueType>);
}
}

Expand Down Expand Up @@ -258,7 +262,7 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
void prepareAppend() override {
// The matrix might be empty.
if (numRows != 0 && numCols != 0)
values.get()[0] = ValueType(0);
values.get()[0] = ValueType(ValueTypeUtils::defaultValue<ValueType>);
lastAppendedRowIdx = 0;
lastAppendedColIdx = 0;
}
Expand All @@ -277,7 +281,7 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
// The matrix might be empty.
if ((numRows != 0 && numCols != 0) &&
((lastAppendedRowIdx + 1 < numRows) || (lastAppendedColIdx + 1 < numCols)))
append(numRows - 1, numCols - 1, ValueType(0));
append(numRows - 1, numCols - 1, ValueType(ValueTypeUtils::defaultValue<ValueType>));
}

void print(std::ostream &os) const override {
Expand Down Expand Up @@ -327,17 +331,15 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
if (valuesLhs == valuesRhs && rowSkipLhs == rowSkipRhs)
return true;

if (rowSkipLhs == numCols && rowSkipRhs == numCols)
return !memcmp(valuesLhs, valuesRhs, numRows * numCols * sizeof(ValueType));
else {
for (size_t r = 0; r < numRows; r++) {
if (memcmp(valuesLhs, valuesRhs, numCols * sizeof(ValueType)))
for (size_t r = 0; r < numRows; ++r) {
for (size_t c = 0; c < numCols; ++c) {
if (*(valuesLhs + c) != *(valuesRhs + c))
return false;
valuesLhs += rowSkipLhs;
valuesRhs += rowSkipRhs;
}
return true;
valuesLhs += rowSkipLhs;
valuesRhs += rowSkipRhs;
}
return true;
}

size_t serialize(std::vector<char> &buf) const override;
Expand Down
146 changes: 146 additions & 0 deletions src/runtime/local/datastructures/FixedSizeStringValueType.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/*
* Copyright 2024 The DAPHNE Consortium
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include <cctype>
#include <cstddef>
#include <cstring>

/**
* @brief A string value type with a maximum length of 15 characters.
*
* Each instance is backed by a 16-character buffer, whereby at least the last character must always be a null
* character. The null-termination is required for some operations to work correctly (e.g., casting to a number).
*/
struct FixedStr16 {
    // Size of the backing buffer in bytes. The longest storable string has N - 1 characters, since the
    // last byte is reserved for the terminating null character.
    static constexpr std::size_t N = 16;

    // Character storage. All constructors and set() leave every byte after the string's content set to
    // '\0'; operator== (all N bytes are compared) and std::hash rely on this zero padding.
    char buffer[N];

    // Creates an empty string (all bytes are '\0').
    FixedStr16() : buffer{} {}

    // Creates a string from a null-terminated C-string.
    // `str` must not be a null pointer.
    // Throws std::length_error if `str` has more than N - 1 characters.
    FixedStr16(const char *str) { assign(str, std::strlen(str)); }

    // Creates a string from a std::string.
    // Throws std::length_error if `other` has more than N - 1 characters.
    FixedStr16(const std::string &other) { assign(other.data(), other.size()); }

    // Copy construction and copy assignment are intentionally left to the compiler: a member-wise copy
    // of the buffer is exactly what is needed, and not declaring them keeps the type trivially copyable.

    // Equality: two fixed strings are equal if all N bytes of their buffers match.
    bool operator==(const FixedStr16 &other) const { return std::equal(buffer, buffer + N, other.buffer); }

    // Equality with a null-terminated C-string (compares at most N characters).
    bool operator==(const char *str) const { return std::strncmp(buffer, str, N) == 0; }

    // Inequality, defined as the negation of the corresponding equality operator.
    bool operator!=(const FixedStr16 &other) const { return !(*this == other); }

    bool operator!=(const char *str) const { return !(*this == str); }

    // Ordering by the characters up to the first null character (see compare()).
    bool operator<(const FixedStr16 &other) const { return compare(other) < 0; }

    bool operator>(const FixedStr16 &other) const { return compare(other) > 0; }

    // Concatenation. The result is a std::string, since it may exceed the fixed capacity.
    friend std::string operator+(const FixedStr16 &lhs, const FixedStr16 &rhs) {
        const std::size_t lhsLen = lhs.size();
        const std::size_t rhsLen = rhs.size();
        std::string result;
        result.reserve(lhsLen + rhsLen);
        result.append(lhs.buffer, lhsLen);
        result.append(rhs.buffer, rhsLen);
        return result;
    }

    // Appends the raw buffer (always exactly N bytes, including the zero padding) to `outBuffer`.
    void serialize(std::vector<char> &outBuffer) const { outBuffer.insert(outBuffer.end(), buffer, buffer + N); }

    // Writes the string's characters to the stream. Only the characters before the first null character
    // are written, so that the zero padding does not end up as NUL bytes in textual output.
    friend std::ostream &operator<<(std::ostream &os, const FixedStr16 &fs) {
        os.write(fs.buffer, static_cast<std::streamsize>(fs.size()));
        return os;
    }

    // Number of characters before the first null character. The search is bounded by N, so it never
    // reads past the buffer even if the (public) buffer was modified to contain no null character.
    size_t size() const { return static_cast<size_t>(std::find(buffer, buffer + N, '\0') - buffer); }

    // Replaces the content with the given null-terminated C-string.
    // `str` must not be a null pointer.
    // Throws std::length_error if `str` has more than N - 1 characters; the content is unchanged then.
    void set(const char *str) { assign(str, std::strlen(str)); }

    // Returns the content as a std::string (without the zero padding).
    std::string to_string() const { return std::string(buffer, size()); }

    // Three-way comparison similar to std::string::compare: negative, zero, or positive if this string
    // is less than, equal to, or greater than `other`.
    int compare(const FixedStr16 &other) const { return std::strncmp(buffer, other.buffer, N); }

    // Returns a copy with all characters converted to lowercase (as by std::tolower).
    FixedStr16 lower() const {
        FixedStr16 result;
        // The lambda takes unsigned char because std::tolower is undefined for negative char values.
        std::transform(buffer, buffer + N, result.buffer,
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        return result;
    }

    // Returns a copy with all characters converted to uppercase (as by std::toupper).
    FixedStr16 upper() const {
        FixedStr16 result;
        // The lambda takes unsigned char because std::toupper is undefined for negative char values.
        std::transform(buffer, buffer + N, result.buffer,
                       [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
        return result;
    }

  private:
    // Copies `len` characters from `src` into the buffer and zero-fills the remainder.
    // Throws std::length_error (before touching the buffer) if `len` leaves no room for the terminator.
    void assign(const char *src, std::size_t len) {
        if (len >= N) {
            throw std::length_error("string exceeds fixed buffer size");
        }
        std::copy(src, src + len, buffer);
        std::fill(buffer + len, buffer + N, '\0');
    }
};

// Specialize std::hash for FixedStr16; this is necessary to use FixedStr16 as a key in std::unordered_map.
namespace std {
// Hash support for FixedStr16, so that it can be used as a key in unordered containers.
template <> struct hash<FixedStr16> {
    // Hashes all N bytes of the fixed-size buffer (content plus padding), matching
    // FixedStr16::operator==, which also compares the whole buffer.
    //
    // A std::string_view over the buffer is hashed instead of a temporary std::string: the standard
    // guarantees both yield the same hash value, and the view avoids constructing (and possibly
    // heap-allocating) a string on every call.
    std::size_t operator()(const FixedStr16 &key) const noexcept {
        return std::hash<std::string_view>{}(std::string_view(key.buffer, FixedStr16::N));
    }
};
} // namespace std
6 changes: 4 additions & 2 deletions src/runtime/local/datastructures/ValueTypeCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ enum class ValueTypeCode : uint8_t {
UI32,
UI64, // unsigned integers (uintx_t)
F32,
F64, // floating point (float, double)
INVALID, // only for JSON enum conversion
F64, // floating point (float, double)
STR, // std::string
FIXEDSTR16, // fixed-size string (length 16)
INVALID, // only for JSON enum conversion
// TODO Support bool as well, but poses some challenges (e.g. sizeof).
// UI1 // boolean (bool)
};
Expand Down
16 changes: 16 additions & 0 deletions src/runtime/local/datastructures/ValueTypeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ template <> const ValueTypeCode ValueTypeUtils::codeFor<uint32_t> = ValueTypeCod
template <> const ValueTypeCode ValueTypeUtils::codeFor<uint64_t> = ValueTypeCode::UI64;
template <> const ValueTypeCode ValueTypeUtils::codeFor<float> = ValueTypeCode::F32;
template <> const ValueTypeCode ValueTypeUtils::codeFor<double> = ValueTypeCode::F64;
template <> const ValueTypeCode ValueTypeUtils::codeFor<std::string> = ValueTypeCode::STR;
template <> const ValueTypeCode ValueTypeUtils::codeFor<FixedStr16> = ValueTypeCode::FIXEDSTR16;

template <> const std::string ValueTypeUtils::cppNameFor<int8_t> = "int8_t";
template <> const std::string ValueTypeUtils::cppNameFor<int32_t> = "int32_t";
Expand All @@ -99,6 +101,8 @@ template <> const std::string ValueTypeUtils::cppNameFor<float> = "float";
template <> const std::string ValueTypeUtils::cppNameFor<double> = "double";
template <> const std::string ValueTypeUtils::cppNameFor<bool> = "bool";
template <> const std::string ValueTypeUtils::cppNameFor<const char *> = "const char*";
template <> const std::string ValueTypeUtils::cppNameFor<std::string> = "std::string";
template <> const std::string ValueTypeUtils::cppNameFor<FixedStr16> = "FixedStr";

template <> const std::string ValueTypeUtils::irNameFor<int8_t> = "si8";
template <> const std::string ValueTypeUtils::irNameFor<int32_t> = "si32";
Expand All @@ -109,6 +113,18 @@ template <> const std::string ValueTypeUtils::irNameFor<uint64_t> = "ui64";
template <> const std::string ValueTypeUtils::irNameFor<float> = "f32";
template <> const std::string ValueTypeUtils::irNameFor<double> = "f64";

template <> const int8_t ValueTypeUtils::defaultValue<int8_t> = 0;
template <> const int32_t ValueTypeUtils::defaultValue<int32_t> = 0;
template <> const int64_t ValueTypeUtils::defaultValue<int64_t> = 0;
template <> const uint8_t ValueTypeUtils::defaultValue<uint8_t> = 0;
template <> const uint32_t ValueTypeUtils::defaultValue<uint32_t> = 0;
template <> const uint64_t ValueTypeUtils::defaultValue<uint64_t> = 0;
template <> const float ValueTypeUtils::defaultValue<float> = 0;
template <> const double ValueTypeUtils::defaultValue<double> = 0;
template <> const bool ValueTypeUtils::defaultValue<bool> = false;
template <> const std::string ValueTypeUtils::defaultValue<std::string> = std::string("");
template <> const FixedStr16 ValueTypeUtils::defaultValue<FixedStr16> = FixedStr16();

const std::string ValueTypeUtils::cppNameForCode(ValueTypeCode type) {
switch (type) {
case ValueTypeCode::SI8:
Expand Down
19 changes: 19 additions & 0 deletions src/runtime/local/datastructures/ValueTypeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <runtime/local/datastructures/FixedSizeStringValueType.h>
#include <runtime/local/datastructures/ValueTypeCode.h>

#include <iostream>
Expand All @@ -29,6 +30,8 @@
// changes to the list of supported data types local.
#define ALL_VALUE_TYPES int8_t, int32_t, int64_t, uint8_t, uint32_t, uint64_t, float, double

#define ALL_STRING_VALUE_TYPES std::string, FixedStr16

struct ValueTypeUtils {

static size_t sizeOf(ValueTypeCode type);
Expand All @@ -37,6 +40,8 @@ struct ValueTypeUtils {

template <typename ValueType> static const ValueTypeCode codeFor;

template <typename ValueType> static const ValueType defaultValue;

template <typename ValueType> static const std::string cppNameFor;

template <typename ValueType> static const std::string irNameFor;
Expand All @@ -54,6 +59,8 @@ template <> const ValueTypeCode ValueTypeUtils::codeFor<uint32_t>;
template <> const ValueTypeCode ValueTypeUtils::codeFor<uint64_t>;
template <> const ValueTypeCode ValueTypeUtils::codeFor<float>;
template <> const ValueTypeCode ValueTypeUtils::codeFor<double>;
template <> const ValueTypeCode ValueTypeUtils::codeFor<std::string>;
template <> const ValueTypeCode ValueTypeUtils::codeFor<FixedStr16>;

template <> const std::string ValueTypeUtils::cppNameFor<int8_t>;
template <> const std::string ValueTypeUtils::cppNameFor<int32_t>;
Expand All @@ -74,3 +81,15 @@ template <> const std::string ValueTypeUtils::irNameFor<uint32_t>;
template <> const std::string ValueTypeUtils::irNameFor<uint64_t>;
template <> const std::string ValueTypeUtils::irNameFor<float>;
template <> const std::string ValueTypeUtils::irNameFor<double>;

template <> const int8_t ValueTypeUtils::defaultValue<int8_t>;
template <> const int32_t ValueTypeUtils::defaultValue<int32_t>;
template <> const int64_t ValueTypeUtils::defaultValue<int64_t>;
template <> const uint8_t ValueTypeUtils::defaultValue<uint8_t>;
template <> const uint32_t ValueTypeUtils::defaultValue<uint32_t>;
template <> const uint64_t ValueTypeUtils::defaultValue<uint64_t>;
template <> const float ValueTypeUtils::defaultValue<float>;
template <> const double ValueTypeUtils::defaultValue<double>;
template <> const std::string ValueTypeUtils::defaultValue<std::string>;
template <> const FixedStr16 ValueTypeUtils::defaultValue<FixedStr16>;
template <> const char *ValueTypeUtils::defaultValue<const char *>;
Loading

0 comments on commit 6720539

Please sign in to comment.