chore: Remove USE_ALIGNED_ACCESS and enhance BYTE_ORDER handling #2456

Open
wants to merge 9 commits into base: unstable
4 changes: 0 additions & 4 deletions src/common/port.h
@@ -20,10 +20,6 @@

#pragma once

#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif

#if defined(__s390__)
#if defined(__GNUC__) && __GNUC__ < 7
constexpr size_t CACHE_LINE_SIZE = 64U;
4 changes: 0 additions & 4 deletions src/storage/storage.h
@@ -42,10 +42,6 @@
#include "observer_or_unique.h"
#include "status.h"

#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif

enum class StorageEngineType : uint16_t {
RocksDB,
Speedb,
50 changes: 29 additions & 21 deletions src/types/redis_bitmap.cc
@@ -549,33 +549,40 @@ rocksdb::Status Bitmap::BitOp(BitOpFlags op_flag, const std::string &op_name, co
} else {
memset(frag_res.get(), 0, frag_maxlen);
}

/* Fast path: as far as we have data for all the input bitmaps we
* can take a fast path that performs much better than the
* vanilla algorithm. On ARM we skip the fast path since it will
* result in GCC compiling the code using multiple-words load/store
* operations that are not supported even in ARM >= v6. */
#ifndef USE_ALIGNED_ACCESS
* vanilla algorithm.
 * We hope the compiler will generate good code for the memcpy form
 * rather than keeping this fast path limited to non-ARM machines.
*/
if (frag_minlen >= sizeof(uint64_t) * 4 && frag_numkeys <= 16) {
auto *lres = reinterpret_cast<uint64_t *>(frag_res.get());
const uint64_t *lp[16];
uint8_t *lres = frag_res.get();
// lp points to the start of each fragment; each pointer is advanced
// as the fast path consumes up to frag_minlen bytes.
const uint8_t *lp[16];
for (uint64_t i = 0; i < frag_numkeys; i++) {
lp[i] = reinterpret_cast<const uint64_t *>(fragments[i].data());
lp[i] = reinterpret_cast<const uint8_t *>(fragments[i].data());
}
memcpy(frag_res.get(), fragments[0].data(), frag_minlen);
auto apply_fast_path_op = [&](auto op) {
// Note: kBitOpNot cannot use this helper; it only applies
// to kBitOpAnd, kBitOpOr and kBitOpXor.
DCHECK(op_flag != kBitOpNot);
while (frag_minlen >= sizeof(uint64_t) * 4) {
uint64_t lres_u64[4];
memcpy(lres_u64, lres, sizeof(lres_u64));
for (uint64_t i = 1; i < frag_numkeys; i++) {
op(lres[0], lp[i][0]);
op(lres[1], lp[i][1]);
op(lres[2], lp[i][2]);
op(lres[3], lp[i][3]);
lp[i] += 4;
uint64_t lp_data[4];
memcpy(lp_data, lp[i], sizeof(lp_data));
op(lres_u64[0], lp_data[0]);
op(lres_u64[1], lp_data[1]);
op(lres_u64[2], lp_data[2]);
op(lres_u64[3], lp_data[3]);
lp[i] += 4 * sizeof(uint64_t);
}
lres += 4;
// memcpy back to lres
memcpy(lres, &lres_u64, sizeof(lres_u64));
lres += 4 * sizeof(uint64_t);
j += sizeof(uint64_t) * 4;
frag_minlen -= sizeof(uint64_t) * 4;
}
@@ -589,18 +596,19 @@ rocksdb::Status Bitmap::BitOp(BitOpFlags op_flag, const std::string &op_name, co
apply_fast_path_op([](uint64_t &a, uint64_t b) { a ^= b; });
} else if (op_flag == kBitOpNot) {
while (frag_minlen >= sizeof(uint64_t) * 4) {
lres[0] = ~lres[0];
lres[1] = ~lres[1];
lres[2] = ~lres[2];
lres[3] = ~lres[3];
lres += 4;
uint64_t lres_u64[4];
memcpy(lres_u64, lres, sizeof(lres_u64));
lres_u64[0] = ~lres_u64[0];
lres_u64[1] = ~lres_u64[1];
lres_u64[2] = ~lres_u64[2];
lres_u64[3] = ~lres_u64[3];
memcpy(lres, &lres_u64, sizeof(lres_u64));
lres += 4 * sizeof(uint64_t);
j += sizeof(uint64_t) * 4;
frag_minlen -= sizeof(uint64_t) * 4;
}
}
}
#endif
Member
I'm wondering if the performance difference between this fast path and the vanilla algorithm is large.

Member Author
Personally I think https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/bitmap_ops.h#L160-L242 would be better; I'll do a benchmark there.

Member Author
I've checked the current impl, and I guess the fast path would be faster😅

Member
You can use https://quick-bench.com/ which can generate a chart.

Member Author
I reverted that part because the underlying code is too slow 😅

Besides, I changed it to use memcpy to generalize it. I'll try a benchmark later.

Member Author @mapleFU Aug 3, 2024
apache/arrow@76cebfa
Luckily, Arrow has similar benchmarks here. On my macOS M1 Pro with Release -O2:

BenchmarkBitmapVisitBitsetAnd/32768/0      753392 ns       749634 ns          937 bytes_per_second=41.687M/s
BenchmarkBitmapVisitBitsetAnd/131072/0    2986097 ns      2985449 ns          234 bytes_per_second=41.8698M/s
BenchmarkBitmapVisitBitsetAnd/32768/1      746267 ns       746040 ns          939 bytes_per_second=41.8878M/s
BenchmarkBitmapVisitBitsetAnd/131072/1    2991597 ns      2990679 ns          234 bytes_per_second=41.7965M/s
BenchmarkBitmapVisitBitsetAnd/32768/2      747519 ns       747314 ns          940 bytes_per_second=41.8164M/s
BenchmarkBitmapVisitBitsetAnd/131072/2    2985102 ns      2984500 ns          234 bytes_per_second=41.8831M/s

The code is no different from bit-hacking and


uint8_t output = 0, byte = 0;
for (; j < frag_maxlen; j++) {
output = (fragments[0].size() <= j) ? 0 : fragments[0][j];
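For context, here is a minimal standalone sketch (not the kvrocks code; the function name and the fixed AND operation are illustrative) of the memcpy-based word-at-a-time pattern the new fast path relies on. Every load and store goes through a fixed-size memcpy, so the code stays well-defined on strict-alignment targets, and optimizing compilers normally lower those memcpy calls to plain register loads and stores.

#include <cstddef>
#include <cstdint>
#include <cstring>

// AND `src` into `dst`, 8 bytes at a time, without ever forming a
// misaligned uint64_t* -- each load/store is a fixed-size memcpy.
inline void AndBytes(uint8_t *dst, const uint8_t *src, size_t len) {
  size_t i = 0;
  for (; i + sizeof(uint64_t) <= len; i += sizeof(uint64_t)) {
    uint64_t a = 0;
    uint64_t b = 0;
    std::memcpy(&a, dst + i, sizeof(a));
    std::memcpy(&b, src + i, sizeof(b));
    a &= b;
    std::memcpy(dst + i, &a, sizeof(a));
  }
  for (; i < len; i++) dst[i] &= src[i];  // leftover tail bytes
}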
4 changes: 2 additions & 2 deletions src/vendor/endianconv.h
@@ -43,7 +43,7 @@ uint64_t intrev64(uint64_t v);

/* variants of the function doing the actual conversion only if the target
* host is big endian */
#if (BYTE_ORDER == LITTLE_ENDIAN)
#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define memrev16ifbe(p) ((void)(0))
#define memrev32ifbe(p) ((void)(0))
#define memrev64ifbe(p) ((void)(0))
@@ -61,7 +61,7 @@ uint64_t intrev64(uint64_t v);

/* The functions htonu64() and ntohu64() convert the specified value to
* network byte ordering and back. In big endian systems they are no-ops. */
#if (BYTE_ORDER == BIG_ENDIAN)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define htonu64(v) (v)
#define ntohu64(v) (v)
#else
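One detail worth noting on this change: BYTE_ORDER and LITTLE_ENDIAN come from <endian.h> or <sys/endian.h>, and if that header is not included the preprocessor silently treats both identifiers as 0, so the #if becomes always true. The __BYTE_ORDER__ and __ORDER_*_ENDIAN__ macros are predefined by GCC and Clang, so no header is needed. A hedged sketch of the usual guarded form (the SKETCH_ name is illustrative and a GCC/Clang-compatible compiler is assumed):

/* Sketch only: the __BYTE_ORDER__ family needs no header on GCC/Clang;
 * other compilers may require their own fallback branch. */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define SKETCH_IS_LITTLE_ENDIAN 1
#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define SKETCH_IS_LITTLE_ENDIAN 0
#else
#error "unknown byte order: add a compiler-specific fallback"
#endif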
10 changes: 0 additions & 10 deletions src/vendor/murmurhash2.h
@@ -33,12 +33,6 @@

#include <stdint.h>

#ifndef USE_ALIGNED_ACCESS
#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif
#endif

// NOLINTBEGIN

/* MurmurHash2, 64 bit version.
@@ -55,11 +49,7 @@ inline uint64_t HllMurMurHash64A(const void *key, int len, uint32_t seed) {
uint64_t k = 0;

#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#ifdef USE_ALIGNED_ACCESS
memcpy(&k, data, sizeof(uint64_t));
#else
k = *((uint64_t *)data);
#endif
#else
k = (uint64_t)data[0];
k |= (uint64_t)data[1] << 8;
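With USE_ALIGNED_ACCESS gone, the little-endian path of HllMurMurHash64A always loads through memcpy. A rough sketch of the trade-off (load_u64 is an illustrative helper, not part of the patch): the cast form is undefined behaviour when data is not 8-byte aligned and can fault on strict-alignment targets such as SPARC, while the memcpy form is always well-defined and typically compiles to a single unaligned load on x86-64 and AArch64.

#include <cstdint>
#include <cstring>

// Well-defined for any alignment; usually lowered to one load instruction.
inline uint64_t load_u64(const unsigned char *data) {
  uint64_t k = 0;
  std::memcpy(&k, data, sizeof(k));
  return k;
}

// The old non-USE_ALIGNED_ACCESS path, shown only for comparison:
// undefined behaviour if `data` is misaligned.
// uint64_t k = *reinterpret_cast<const uint64_t *>(data);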
4 changes: 2 additions & 2 deletions src/vendor/sha1.cc
@@ -26,10 +26,10 @@ A million repetitions of "a"

/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// NOLINTNEXTLINE
#define blk0(i) (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | (rol(block->l[i], 8) & 0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// NOLINTNEXTLINE
#define blk0(i) block->l[i]
#else
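For reference, the little-endian blk0() is just a 32-bit byte swap spelled with rotates. A small self-contained check (helper names are illustrative) that the rotate-and-mask form agrees with a plain byte swap:

#include <cassert>
#include <cstdint>

inline uint32_t rol32(uint32_t v, unsigned bits) { return (v << bits) | (v >> (32 - bits)); }

// Same expression blk0() uses on little-endian hosts, applied to one word.
inline uint32_t swap_via_rol(uint32_t v) {
  return (rol32(v, 24) & 0xFF00FF00u) | (rol32(v, 8) & 0x00FF00FFu);
}

int main() {
  assert(swap_via_rol(0x11223344u) == 0x44332211u);  // bytes reversed
  return 0;
}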