[DAPHNE-#629] Efficient processing of string data in DenseMatrix (#797)

This commit addresses issue #629 by enhancing the string support in the DAPHNE runtime, making it practical to process string data sets. The main addition is generalizing or specializing the current template structures for the FixedStr16 class and the std::string class. Generalizations include, e.g., using std::fill()/std::copy() instead of memset()/memcpy() and using the newly introduced ValueTypeUtils::defaultValue<VT> instead of 0.

Key Features Implemented
- Focus on the runtime part, i.e., data structures and kernels; no DaphneDSL/compiler integration yet.
- FixedStr16 Class: A fixed-size string class with a 16-character buffer.
- DenseMatrix for string value type: Generalize the current DenseMatrix class to support string data.
- I/O Operations: Reading CSV files containing string columns.
- Kernels on string-valued matrices:
  - Convert string matrices to numeric:
    - oneHot: Applies one-hot-encoding to the given (n x m) matrix of strings.
    - recode: Applies dictionary encoding to the given (n x 1) matrix.
    - cast: String value and matrix objects can be cast to a particular numeric type.
  - Comparison operations:
    - Element-wise binary operators for comparing DenseMatrix<std::string> and/or DenseMatrix<FixedStr16>.
  - Element-wise unary/binary string operations:
    - Element-wise unary operators for converting all strings in a matrix to lower/upper case.
    - Element-wise binary operators for concatenating all corresponding strings in two matrices.
  - Source operations:
    - fill: Creates a matrix and sets all elements to a particular value.
  - Reorganization:
    - transpose: Transposes a given matrix.

Testing
- Initial unit tests for DenseMatrix<std::string> and DenseMatrix<FixedStr16> have been implemented, verifying functionality for newly added features and data types.
daphne-eu · Oct 17, 2024 · 6720539 · 6720539
1 parent d9c75ef
commit 6720539
Show file tree

Hide file tree

Showing 30 changed files with 1,184 additions and 33 deletions.
diff --git a/src/runtime/local/datagen/GenGivenVals.h b/src/runtime/local/datagen/GenGivenVals.h
@@ -97,7 +97,7 @@ template <typename VT> struct GenGivenVals<DenseMatrix<VT>> {
                                      "divisible by given number of rows");
         const size_t numCols = numCells / numRows;
         auto res = DataObjectFactory::create<DenseMatrix<VT>>(numRows, numCols, false);
-        memcpy(res->getValues(), elements.data(), numCells * sizeof(VT));
+        std::copy(elements.begin(), elements.end(), res->getValues());
         return res;
     }
 };

diff --git a/src/runtime/local/datastructures/DenseMatrix.cpp b/src/runtime/local/datastructures/DenseMatrix.cpp
@@ -78,8 +78,9 @@ DenseMatrix<ValueType>::DenseMatrix(size_t maxNumRows, size_t numCols, bool zero
     } else {
         AllocationDescriptorHost myHostAllocInfo;
         alloc_shared_values();
+
         if (zero)
-            memset(values.get(), 0, maxNumRows * numCols * sizeof(ValueType));
+            std::fill(values.get(), values.get() + maxNumRows * numCols, ValueTypeUtils::defaultValue<ValueType>);
         new_data_placement = this->mdo->addDataPlacement(&myHostAllocInfo);
     }
     this->mdo->addLatest(new_data_placement->dp_id);
@@ -341,3 +342,5 @@ template class DenseMatrix<unsigned char>;
 template class DenseMatrix<unsigned int>;
 template class DenseMatrix<unsigned long>;
 template class DenseMatrix<bool>;
+template class DenseMatrix<std::string>;
+template class DenseMatrix<FixedStr16>;
diff --git a/src/runtime/local/datastructures/DenseMatrix.h b/src/runtime/local/datastructures/DenseMatrix.h
@@ -124,18 +124,22 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
         if (rowSkip == numCols || lastAppendedRowIdx == rowIdx) {
             const size_t startPosIncl = pos(lastAppendedRowIdx, lastAppendedColIdx) + 1;
             const size_t endPosExcl = pos(rowIdx, colIdx);
+
             if (startPosIncl < endPosExcl)
-                memset(values.get() + startPosIncl, 0, (endPosExcl - startPosIncl) * sizeof(ValueType));
+                std::fill(values.get() + startPosIncl, values.get() + endPosExcl,
+                          ValueTypeUtils::defaultValue<ValueType>);
         } else {
             auto v = values.get() + lastAppendedRowIdx * rowSkip;
-            memset(v + lastAppendedColIdx + 1, 0, (numCols - lastAppendedColIdx - 1) * sizeof(ValueType));
+            std::fill(v + lastAppendedColIdx + 1, v + numCols, ValueTypeUtils::defaultValue<ValueType>);
+
             v += rowSkip;
+
             for (size_t r = lastAppendedRowIdx + 1; r < rowIdx; r++) {
-                memset(v, 0, numCols * sizeof(ValueType));
+                std::fill(v, v + numCols, ValueTypeUtils::defaultValue<ValueType>);
                 v += rowSkip;
             }
             if (colIdx)
-                memset(v, 0, (colIdx - 1) * sizeof(ValueType));
+                std::fill(v, v + colIdx - 1, ValueTypeUtils::defaultValue<ValueType>);
         }
     }
 
@@ -258,7 +262,7 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
     void prepareAppend() override {
         // The matrix might be empty.
         if (numRows != 0 && numCols != 0)
-            values.get()[0] = ValueType(0);
+            values.get()[0] = ValueType(ValueTypeUtils::defaultValue<ValueType>);
         lastAppendedRowIdx = 0;
         lastAppendedColIdx = 0;
     }
@@ -277,7 +281,7 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
         // The matrix might be empty.
         if ((numRows != 0 && numCols != 0) &&
             ((lastAppendedRowIdx + 1 < numRows) || (lastAppendedColIdx + 1 < numCols)))
-            append(numRows - 1, numCols - 1, ValueType(0));
+            append(numRows - 1, numCols - 1, ValueType(ValueTypeUtils::defaultValue<ValueType>));
     }
 
     void print(std::ostream &os) const override {
@@ -327,17 +331,15 @@ template <typename ValueType> class DenseMatrix : public Matrix<ValueType> {
         if (valuesLhs == valuesRhs && rowSkipLhs == rowSkipRhs)
             return true;
 
-        if (rowSkipLhs == numCols && rowSkipRhs == numCols)
-            return !memcmp(valuesLhs, valuesRhs, numRows * numCols * sizeof(ValueType));
-        else {
-            for (size_t r = 0; r < numRows; r++) {
-                if (memcmp(valuesLhs, valuesRhs, numCols * sizeof(ValueType)))
+        for (size_t r = 0; r < numRows; ++r) {
+            for (size_t c = 0; c < numCols; ++c) {
+                if (*(valuesLhs + c) != *(valuesRhs + c))
                     return false;
-                valuesLhs += rowSkipLhs;
-                valuesRhs += rowSkipRhs;
             }
-            return true;
+            valuesLhs += rowSkipLhs;
+            valuesRhs += rowSkipRhs;
         }
+        return true;
     }
 
     size_t serialize(std::vector<char> &buf) const override;

diff --git a/src/runtime/local/datastructures/FixedSizeStringValueType.h b/src/runtime/local/datastructures/FixedSizeStringValueType.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2024 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <cctype>
+#include <cstddef>
+#include <cstring>
+
+/**
+ * @brief A string value type with a maximum length of 15 characters.
+ *
+ * Each instance is backed by a 16-character buffer, whereby at least the last character must always be a null
+ * character. The null-termination is required for some operations to work correctly (e.g., casting to a number).
+ */
+struct FixedStr16 {
+    // Size of the backing buffer in bytes: at most N - 1 characters plus the terminating null character.
+    static constexpr std::size_t N = 16;
+    char buffer[N];
+
+    // Default constructor: the empty string (the whole buffer is zero-filled).
+    FixedStr16() { std::fill(buffer, buffer + N, '\0'); }
+
+    // Constructor from a null-terminated C-style string (intentionally implicit).
+    // Throws std::invalid_argument if str is a null pointer (note that a literal 0 converts to one) and
+    // std::length_error if str has N or more characters.
+    FixedStr16(const char *str) { set(str); }
+
+    // Copy constructor: copies all N bytes.
+    FixedStr16(const FixedStr16 &other) = default;
+
+    // Constructor from a std::string (intentionally implicit).
+    // Throws std::length_error if other has N or more characters.
+    FixedStr16(const std::string &other) { assign(other.data(), other.size()); }
+
+    // Assignment operator: copies all N bytes.
+    FixedStr16 &operator=(const FixedStr16 &other) = default;
+
+    // Equality: two instances are equal iff all N bytes of their buffers are equal.
+    bool operator==(const FixedStr16 &other) const { return std::equal(buffer, buffer + N, other.buffer); }
+
+    // Equality with a C-style string; a null pointer never compares equal.
+    bool operator==(const char *str) const { return str != nullptr && std::strncmp(buffer, str, N) == 0; }
+
+    // Inequality operators, defined as the negation of the respective equality operator.
+    bool operator!=(const FixedStr16 &other) const { return !(*this == other); }
+
+    bool operator!=(const char *str) const { return !(*this == str); }
+
+    // Less-than operator (ordering as defined by std::strncmp)
+    bool operator<(const FixedStr16 &other) const { return std::strncmp(buffer, other.buffer, N) < 0; }
+
+    // Greater-than operator (ordering as defined by std::strncmp)
+    bool operator>(const FixedStr16 &other) const { return std::strncmp(buffer, other.buffer, N) > 0; }
+
+    // Concatenation operator; the result is a std::string because it may exceed the fixed capacity.
+    friend std::string operator+(const FixedStr16 &lhs, const FixedStr16 &rhs) {
+        std::string result(lhs.buffer);
+        result.append(rhs.buffer);
+        return result;
+    }
+
+    // Serialization function: appends the raw N-byte buffer (including the null padding) to outBuffer.
+    void serialize(std::vector<char> &outBuffer) const { outBuffer.insert(outBuffer.end(), buffer, buffer + N); }
+
+    // Output stream operator: writes only the characters of the string. The null padding of the buffer
+    // is not written, since it would end up as stray '\0' bytes in textual output.
+    friend std::ostream &operator<<(std::ostream &os, const FixedStr16 &fs) {
+        os.write(fs.buffer, static_cast<std::streamsize>(fs.size()));
+        return os;
+    }
+
+    // Number of characters before the first null character. The search is bounded by N, so it never
+    // reads past the buffer, even if the publicly accessible buffer was filled without a terminator.
+    size_t size() const { return static_cast<size_t>(std::find(buffer, buffer + N, '\0') - buffer); }
+
+    // Method to set the string from a null-terminated C-style string.
+    // Throws std::invalid_argument if str is a null pointer and std::length_error if str has N or more
+    // characters.
+    void set(const char *str) {
+        if (str == nullptr) {
+            throw std::invalid_argument("string must not be null");
+        }
+        assign(str, std::strlen(str));
+    }
+
+    // Conversion to a std::string containing exactly size() characters.
+    std::string to_string() const { return std::string(buffer, size()); }
+
+    // Compare method similar to std::string::compare
+    int compare(const FixedStr16 &other) const { return std::strncmp(buffer, other.buffer, N); }
+
+    // Convert to lowercase
+    FixedStr16 lower() const {
+        FixedStr16 result;
+        // std::tolower() must be called with a value representable as unsigned char.
+        std::transform(buffer, buffer + N, result.buffer,
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        return result;
+    }
+
+    // Convert to uppercase
+    FixedStr16 upper() const {
+        FixedStr16 result;
+        // std::toupper() must be called with a value representable as unsigned char.
+        std::transform(buffer, buffer + N, result.buffer,
+                       [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
+        return result;
+    }
+
+  private:
+    // Stores the len characters starting at src and pads the rest of the buffer with null characters.
+    // Throws std::length_error if there would be no room left for the terminating null character.
+    void assign(const char *src, std::size_t len) {
+        if (len >= N) {
+            throw std::length_error("string exceeds fixed buffer size");
+        }
+        // memmove() tolerates src aliasing our own buffer, e.g., set(buffer).
+        std::memmove(buffer, src, len);
+        std::fill(buffer + len, buffer + N, '\0');
+    }
+};
+
+// Specialize std::hash for FixedStr16. This is necessary to use FixedStr16 as a key in std::unordered_map.
+namespace std {
+template <> struct hash<FixedStr16> {
+    // Hashes all N bytes of the buffer (including the null padding) by delegating to the std::string hash.
+    std::size_t operator()(const FixedStr16 &key) const {
+        const std::string rawBytes(key.buffer, FixedStr16::N);
+        std::hash<std::string> stringHasher{};
+        return stringHasher(rawBytes);
+    }
+};
+} // namespace std
diff --git a/src/runtime/local/datastructures/ValueTypeCode.h b/src/runtime/local/datastructures/ValueTypeCode.h
@@ -34,8 +34,10 @@ enum class ValueTypeCode : uint8_t {
     UI32,
     UI64, // unsigned integers (uintx_t)
     F32,
-    F64,     // floating point (float, double)
-    INVALID, // only for JSON enum conversion
+    F64,        // floating point (float, double)
+    STR,        // std::string
+    FIXEDSTR16, // fixed-size string (length 16)
+    INVALID,    // only for JSON enum conversion
     // TODO Support bool as well, but poses some challenges (e.g. sizeof).
     //    UI1 // boolean (bool)
 };

diff --git a/src/runtime/local/datastructures/ValueTypeUtils.cpp b/src/runtime/local/datastructures/ValueTypeUtils.cpp
@@ -88,6 +88,8 @@ template <> const ValueTypeCode ValueTypeUtils::codeFor<uint32_t> = ValueTypeCod
 template <> const ValueTypeCode ValueTypeUtils::codeFor<uint64_t> = ValueTypeCode::UI64;
 template <> const ValueTypeCode ValueTypeUtils::codeFor<float> = ValueTypeCode::F32;
 template <> const ValueTypeCode ValueTypeUtils::codeFor<double> = ValueTypeCode::F64;
+template <> const ValueTypeCode ValueTypeUtils::codeFor<std::string> = ValueTypeCode::STR;
+template <> const ValueTypeCode ValueTypeUtils::codeFor<FixedStr16> = ValueTypeCode::FIXEDSTR16;
 
 template <> const std::string ValueTypeUtils::cppNameFor<int8_t> = "int8_t";
 template <> const std::string ValueTypeUtils::cppNameFor<int32_t> = "int32_t";
@@ -99,6 +101,8 @@ template <> const std::string ValueTypeUtils::cppNameFor<float> = "float";
 template <> const std::string ValueTypeUtils::cppNameFor<double> = "double";
 template <> const std::string ValueTypeUtils::cppNameFor<bool> = "bool";
 template <> const std::string ValueTypeUtils::cppNameFor<const char *> = "const char*";
+template <> const std::string ValueTypeUtils::cppNameFor<std::string> = "std::string";
+template <> const std::string ValueTypeUtils::cppNameFor<FixedStr16> = "FixedStr";
 
 template <> const std::string ValueTypeUtils::irNameFor<int8_t> = "si8";
 template <> const std::string ValueTypeUtils::irNameFor<int32_t> = "si32";
@@ -109,6 +113,18 @@ template <> const std::string ValueTypeUtils::irNameFor<uint64_t> = "ui64";
 template <> const std::string ValueTypeUtils::irNameFor<float> = "f32";
 template <> const std::string ValueTypeUtils::irNameFor<double> = "f64";
 
+// Definitions of the default values of the supported value types; used, e.g., instead of 0 when
+// initializing the cells of a DenseMatrix. Every specialization declared in ValueTypeUtils.h needs a
+// definition here, otherwise its first use ends in an undefined reference at link time.
+template <> const int8_t ValueTypeUtils::defaultValue<int8_t> = 0;
+template <> const int32_t ValueTypeUtils::defaultValue<int32_t> = 0;
+template <> const int64_t ValueTypeUtils::defaultValue<int64_t> = 0;
+template <> const uint8_t ValueTypeUtils::defaultValue<uint8_t> = 0;
+template <> const uint32_t ValueTypeUtils::defaultValue<uint32_t> = 0;
+template <> const uint64_t ValueTypeUtils::defaultValue<uint64_t> = 0;
+template <> const float ValueTypeUtils::defaultValue<float> = 0;
+template <> const double ValueTypeUtils::defaultValue<double> = 0;
+template <> const bool ValueTypeUtils::defaultValue<bool> = false;
+template <> const std::string ValueTypeUtils::defaultValue<std::string> = std::string("");
+template <> const FixedStr16 ValueTypeUtils::defaultValue<FixedStr16> = FixedStr16();
+// The header declares this specialization, too; like for the other string types, the default is the empty string.
+template <> const char *ValueTypeUtils::defaultValue<const char *> = "";
+
 const std::string ValueTypeUtils::cppNameForCode(ValueTypeCode type) {
     switch (type) {
     case ValueTypeCode::SI8:

diff --git a/src/runtime/local/datastructures/ValueTypeUtils.h b/src/runtime/local/datastructures/ValueTypeUtils.h
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <runtime/local/datastructures/FixedSizeStringValueType.h>
 #include <runtime/local/datastructures/ValueTypeCode.h>
 
 #include <iostream>
@@ -29,6 +30,8 @@
 // changes to the list of supported data types local.
 #define ALL_VALUE_TYPES int8_t, int32_t, int64_t, uint8_t, uint32_t, uint64_t, float, double
 
+#define ALL_STRING_VALUE_TYPES std::string, FixedStr16
+
 struct ValueTypeUtils {
 
     static size_t sizeOf(ValueTypeCode type);
@@ -37,6 +40,8 @@ struct ValueTypeUtils {
 
     template <typename ValueType> static const ValueTypeCode codeFor;
 
+    template <typename ValueType> static const ValueType defaultValue;
+
     template <typename ValueType> static const std::string cppNameFor;
 
     template <typename ValueType> static const std::string irNameFor;
@@ -54,6 +59,8 @@ template <> const ValueTypeCode ValueTypeUtils::codeFor<uint32_t>;
 template <> const ValueTypeCode ValueTypeUtils::codeFor<uint64_t>;
 template <> const ValueTypeCode ValueTypeUtils::codeFor<float>;
 template <> const ValueTypeCode ValueTypeUtils::codeFor<double>;
+template <> const ValueTypeCode ValueTypeUtils::codeFor<std::string>;
+template <> const ValueTypeCode ValueTypeUtils::codeFor<FixedStr16>;
 
 template <> const std::string ValueTypeUtils::cppNameFor<int8_t>;
 template <> const std::string ValueTypeUtils::cppNameFor<int32_t>;
@@ -74,3 +81,15 @@ template <> const std::string ValueTypeUtils::irNameFor<uint32_t>;
 template <> const std::string ValueTypeUtils::irNameFor<uint64_t>;
 template <> const std::string ValueTypeUtils::irNameFor<float>;
 template <> const std::string ValueTypeUtils::irNameFor<double>;
+
+// Declarations of the explicit specializations of defaultValue (defined in ValueTypeUtils.cpp). Every
+// specialization must be declared here, before its first use in any translation unit; otherwise, the
+// use would implicitly instantiate the primary template (ill-formed, no diagnostic required).
+template <> const int8_t ValueTypeUtils::defaultValue<int8_t>;
+template <> const int32_t ValueTypeUtils::defaultValue<int32_t>;
+template <> const int64_t ValueTypeUtils::defaultValue<int64_t>;
+template <> const uint8_t ValueTypeUtils::defaultValue<uint8_t>;
+template <> const uint32_t ValueTypeUtils::defaultValue<uint32_t>;
+template <> const uint64_t ValueTypeUtils::defaultValue<uint64_t>;
+template <> const float ValueTypeUtils::defaultValue<float>;
+template <> const double ValueTypeUtils::defaultValue<double>;
+template <> const bool ValueTypeUtils::defaultValue<bool>;
+template <> const std::string ValueTypeUtils::defaultValue<std::string>;
+template <> const FixedStr16 ValueTypeUtils::defaultValue<FixedStr16>;
+template <> const char *ValueTypeUtils::defaultValue<const char *>;