Skip to content

Commit

Permalink
opt code
Browse files Browse the repository at this point in the history
  • Loading branch information
skadilover committed Aug 14, 2024
1 parent ee87495 commit 1ac4f0f
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 189 deletions.
102 changes: 0 additions & 102 deletions velox/common/base/FixedMemCompare.h

This file was deleted.

136 changes: 50 additions & 86 deletions velox/common/base/SimdUtil-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1437,109 +1437,80 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) {
}

namespace detail {
/// Returns `value` with its lowest-order set bit cleared (the classic
/// `x & (x - 1)` step used to walk over the set bits of a mask).
/// `value` must be non-zero; this is checked with assert in debug builds.
template <typename T>
T clearLeftmostSet(const T value) {
  assert(value != 0);

  // Subtracting one flips the lowest set bit and every bit below it, so the
  // AND keeps only the bits above the lowest set bit.
  const auto decremented = value - 1;
  return value & decremented;
}

/// Returns the zero-based index of the lowest-order set bit of `value`.
/// `value` must be non-zero (checked with assert in debug builds); the
/// underlying count-trailing-zeros builtins are undefined for zero.
///
/// Types wider than `unsigned int` are routed to the `long long` builtin so
/// that their upper bits are not silently truncated by `__builtin_ctz`, which
/// only accepts an `unsigned int`.
template <typename T>
unsigned FOLLY_ALWAYS_INLINE getFirstBitSet(const T value) {
  assert(value != 0);

  if constexpr (sizeof(T) > sizeof(unsigned int)) {
    return static_cast<unsigned>(
        __builtin_ctzll(static_cast<unsigned long long>(value)));
  } else {
    return static_cast<unsigned>(
        __builtin_ctz(static_cast<unsigned int>(value)));
  }
}

/// 64-bit specialization: returns the zero-based index of the lowest-order
/// set bit of `value`. `value` must be non-zero (checked with assert in debug
/// builds).
template <>
unsigned FOLLY_ALWAYS_INLINE getFirstBitSet<uint64_t>(const uint64_t value) {
  assert(value != 0);

  // Use the `long long` builtin: `unsigned long` (the argument type of
  // __builtin_ctzl) is only 32 bits wide on LLP64 targets, which would drop
  // the upper half of a 64-bit mask.
  return static_cast<unsigned>(__builtin_ctzll(value));
}

// Pick the byte-lane xsimd batch type used by the SIMD substring search.
// The branches form a single #if/#elif chain so that exactly one CharVector
// alias is declared and the conditional is closed by the single #endif below.
#if XSIMD_WITH_AVX2
// AVX2 is faster than sse2
#define SIMD_STRSTR
using CharVector = xsimd::batch<uint8_t, xsimd::avx2>;
#elif XSIMD_WITH_SSE4_2
// NOTE(review): SIMD_STRSTR is not defined on this branch, so code guarded by
// `#ifdef SIMD_STRSTR` stays disabled for SSE4.2-only builds; confirm whether
// that is intended.
using CharVector = xsimd::batch<uint8_t, xsimd::sse4_2>;
#elif XSIMD_WITH_NEON
#define SIMD_STRSTR
using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
#endif

#ifdef SIMD_STRSTR
/// Searches for `needle` (length `k`) inside `s` (length `n`) and returns the
/// index of the first occurrence, or std::string::npos when there is none.
///
/// Strategy: broadcast the first and last needle characters, compare them
/// against two vector loads that are `k - 1` bytes apart, and run memcmp on
/// the interior bytes only for positions where both characters match.
///
/// The vectorized loop only runs while both loads lie entirely inside
/// `[s, s + n)`; the remaining start positions are checked one by one, so the
/// function never reads outside the haystack and never reports a match that
/// would extend past its end.
size_t FOLLY_ALWAYS_INLINE
smidStrstrAnysize(const char* s, size_t n, const char* needle, size_t k) {
  if (n < k) {
    return std::string::npos;
  }
  // Last index at which a full needle still fits into the haystack.
  const size_t lastStart = n - k;

  size_t i = 0;
  // The first/last-character filter needs k >= 2 (it calls memcmp with a
  // length of k - 2); shorter needles are handled by the scalar loop below.
  if (k >= 2) {
    const auto first = CharVector::broadcast(needle[0]);
    const auto last = CharVector::broadcast(needle[k - 1]);

    // The second load covers bytes [i + k - 1, i + k - 1 + CharVector::size),
    // so it must end at or before s + n.
    for (; i + k - 1 + CharVector::size <= n; i += CharVector::size) {
      const auto block_first = CharVector::load_unaligned(s + i);
      const auto block_last = CharVector::load_unaligned(s + i + k - 1);

      const auto eq_first = (first == block_first);
      const auto eq_last = (last == block_last);

      auto mask = toBitMask(eq_first && eq_last);

      // Visit candidate positions in ascending order.
      while (mask != 0) {
        const auto bitpos = detail::getFirstBitSet(mask);

        // First and last characters already match; compare the interior.
        if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) {
          return i + bitpos;
        }

        mask = detail::clearLeftmostSet(mask);
      }
    }
  }

  // Scalar tail: start positions the vector loop could not cover safely.
  for (; i <= lastStart; ++i) {
    if (memcmp(s + i, needle, k) == 0) {
      return i;
    }
  }

  return std::string::npos;
}
// Page size as reported by sysconf(_SC_PAGESIZE), queried once during
// initialization.
const int kPageSize = sysconf(_SC_PAGESIZE);

/// Returns true when `ptr` lies far enough from the end of its page that
/// CharVector::size bytes starting at `ptr` stay within that same page.
FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
  const auto address = reinterpret_cast<std::uintptr_t>(ptr);
  // Masking with (kPageSize - 1) yields the offset of `ptr` inside its page.
  const auto offsetInPage = (kPageSize - 1) & address;
  return offsetInPage <= kPageSize - CharVector::size;
}

template <size_t k, typename MEMCMP>
template <bool compiled, size_t compiledNeedleSize>
size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
const char* s,
size_t n,
const char* needle,
MEMCMP memcmp_fun) {
assert(k > 0);
assert(n > 0);
size_t needleSize) {
VELOX_DCHECK(k > 0);
VELOX_DCHECK(n > 0);

auto first = CharVector::broadcast(needle[0]);
auto last = CharVector::broadcast(needle[k - 1]);
for (size_t i = 0; i < n; i += CharVector::size) {
auto last = CharVector::broadcast(needle[needleSize - 1]);
size_t i = 0;
for (; i < n - needleSize && pageSafe(s + i + needleSize - 1) &&
pageSafe(s + i);
i += CharVector::size) {
auto block_first = CharVector::load_unaligned(s + i);
auto block_last = CharVector::load_unaligned(s + i + k - 1);
auto block_last = CharVector::load_unaligned(s + i + needleSize - 1);

const auto eq_first = (first == block_first);
const auto eq_last = (last == block_last);

auto mask = toBitMask(eq_first && eq_last);

while (mask != 0) {
const auto bitpos = detail::getFirstBitSet(mask);
const auto bitpos = __builtin_ctzl(mask);

if (memcmp_fun(s + i + bitpos + 1, needle + 1)) {
return i + bitpos;
if constexpr (compiled) {
if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2)) {
return i + bitpos;
}
} else {
if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2)) {
return i + bitpos;
}
}
mask = mask & (mask - 1);
}
}
for (; i < n - needleSize; ++i) {
if constexpr (compiled) {
if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
return i;
}
} else {
if (memcmp(s + i, needle, needleSize)) {
return i;
}

mask = detail::clearLeftmostSet(mask);
}
}

return std::string::npos;
};

} // namespace detail
#endif

/// A faster implementation of strstr(), about 2x faster than
/// std::string_view's find() as measured by TpchLikeBenchmark. Uses xsimd
/// batches to compare the first and last characters of the needle first, then
/// a fixed-size memcmp to compare the remaining characters. Defining it inline
/// in the header file is slightly faster.
size_t FOLLY_ALWAYS_INLINE
simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
#ifdef SIMD_STRSTR
size_t result = std::string::npos;

if (n < k) {
Expand All @@ -1557,54 +1528,51 @@ simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
}

case 2:
result = detail::smidStrstrMemcmp<2>(s, n, needle, alwaysTrue);
result = detail::smidStrstrMemcmp<true, 2>(s, n, needle, 2);
break;

case 3:
result = detail::smidStrstrMemcmp<3>(s, n, needle, memcmp1);
result = detail::smidStrstrMemcmp<true, 3>(s, n, needle, 3);
break;

case 4:
result = detail::smidStrstrMemcmp<4>(s, n, needle, memcmp2);
result = detail::smidStrstrMemcmp<true, 4>(s, n, needle, 4);
break;

case 5:
// Note: use memcmp4 rather memcmp3 for align, as the last character
// of needle is already proven to be equal
result = detail::smidStrstrMemcmp<5>(s, n, needle, memcmp4);
result = detail::smidStrstrMemcmp<true, 5>(s, n, needle, 5);
break;

case 6:
result = detail::smidStrstrMemcmp<6>(s, n, needle, memcmp4);
result = detail::smidStrstrMemcmp<true, 6>(s, n, needle, 6);
break;

case 7:
result = detail::smidStrstrMemcmp<7>(s, n, needle, memcmp5);
result = detail::smidStrstrMemcmp<true, 7>(s, n, needle, 7);
break;

case 8:
result = detail::smidStrstrMemcmp<8>(s, n, needle, memcmp6);
result = detail::smidStrstrMemcmp<true, 8>(s, n, needle, 8);
break;

case 9:
// Note: use memcmp8 rather memcmp7 for the same reason as above.
result = detail::smidStrstrMemcmp<9>(s, n, needle, memcmp8);
result = detail::smidStrstrMemcmp<true, 9>(s, n, needle, 9);
break;

case 10:
result = detail::smidStrstrMemcmp<10>(s, n, needle, memcmp8);
result = detail::smidStrstrMemcmp<true, 10>(s, n, needle, 10);
break;

case 11:
result = detail::smidStrstrMemcmp<11>(s, n, needle, memcmp9);
result = detail::smidStrstrMemcmp<true, 11>(s, n, needle, 11);
break;

case 12:
result = detail::smidStrstrMemcmp<12>(s, n, needle, memcmp10);
result = detail::smidStrstrMemcmp<true, 12>(s, n, needle, 12);
break;

default:
result = detail::smidStrstrAnysize(s, n, needle, k);
result = detail::smidStrstrMemcmp<false, 0>(s, n, needle, k);
break;
}

Expand All @@ -1615,10 +1583,6 @@ simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
} else {
return std::string::npos;
}
#else
// Generic path for string search.
return std::string_view(s, n).find(std::string_view(needle, k));
#endif
}

} // namespace facebook::velox::simd
1 change: 0 additions & 1 deletion velox/common/base/SimdUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <cstdint>
#include "velox/common/base/BitUtil.h"
#include "velox/common/base/Exceptions.h"
#include "velox/common/base/FixedMemCompare.h"

#include <folly/Likely.h>
#include <xsimd/xsimd.hpp>
Expand Down

0 comments on commit 1ac4f0f

Please sign in to comment.