From 1ac4f0fc1eefaa5a099f4dcf38249a58e6f942c8 Mon Sep 17 00:00:00 2001 From: "hengjiang.ly" Date: Wed, 14 Aug 2024 20:50:04 +0800 Subject: [PATCH] opt code --- velox/common/base/FixedMemCompare.h | 102 --------------------- velox/common/base/SimdUtil-inl.h | 136 ++++++++++------------------ velox/common/base/SimdUtil.h | 1 - 3 files changed, 50 insertions(+), 189 deletions(-) delete mode 100644 velox/common/base/FixedMemCompare.h diff --git a/velox/common/base/FixedMemCompare.h b/velox/common/base/FixedMemCompare.h deleted file mode 100644 index 6fb5f05dc9c3..000000000000 --- a/velox/common/base/FixedMemCompare.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include "folly/CPortability.h" - -namespace facebook::velox::simd { - -bool FOLLY_ALWAYS_INLINE alwaysTrue(const char*, const char*) { - return true; -} - -bool FOLLY_ALWAYS_INLINE memcmp1(const char* a, const char* b) { - return a[0] == b[0]; -} - -bool FOLLY_ALWAYS_INLINE memcmp2(const char* a, const char* b) { - const uint16_t A = *reinterpret_cast(a); - const uint16_t B = *reinterpret_cast(b); - return A == B; -} - -bool FOLLY_ALWAYS_INLINE memcmp3(const char* a, const char* b) { - const uint32_t A = *reinterpret_cast(a); - const uint32_t B = *reinterpret_cast(b); - return (A & 0x00ffffff) == (B & 0x00ffffff); -} - -bool FOLLY_ALWAYS_INLINE memcmp4(const char* a, const char* b) { - const uint32_t A = *reinterpret_cast(a); - const uint32_t B = *reinterpret_cast(b); - return A == B; -} - -bool FOLLY_ALWAYS_INLINE memcmp5(const char* a, const char* b) { - const uint64_t A = *reinterpret_cast(a); - const uint64_t B = *reinterpret_cast(b); - return ((A ^ B) & 0x000000fffffffffflu) == 0; -} - -bool FOLLY_ALWAYS_INLINE memcmp6(const char* a, const char* b) { - const uint64_t A = *reinterpret_cast(a); - const uint64_t B = *reinterpret_cast(b); - return ((A ^ B) & 0x0000fffffffffffflu) == 0; -} - -bool FOLLY_ALWAYS_INLINE memcmp7(const char* a, const char* b) { - const uint64_t A = *reinterpret_cast(a); - const uint64_t B = *reinterpret_cast(b); - return ((A ^ B) & 0x00fffffffffffffflu) == 0; -} - -bool FOLLY_ALWAYS_INLINE memcmp8(const char* a, const char* b) { - const uint64_t A = *reinterpret_cast(a); - const uint64_t B = *reinterpret_cast(b); - return A == B; -} - -bool FOLLY_ALWAYS_INLINE memcmp9(const char* a, const char* b) { - const uint64_t A = *reinterpret_cast(a); - const uint64_t B = *reinterpret_cast(b); - return (A == B) & (a[8] == b[8]); -} - -bool FOLLY_ALWAYS_INLINE memcmp10(const char* a, const char* b) { - const uint64_t Aq = *reinterpret_cast(a); - const uint64_t Bq = *reinterpret_cast(b); - const uint16_t Aw = *reinterpret_cast(a + 8); - const uint16_t Bw = *reinterpret_cast(b + 8); - return (Aq == Bq) & (Aw == Bw); -} - -bool FOLLY_ALWAYS_INLINE memcmp11(const char* a, const char* b) { - const uint64_t Aq = *reinterpret_cast(a); - const uint64_t Bq = *reinterpret_cast(b); - const uint32_t Ad = *reinterpret_cast(a + 8); - const uint32_t Bd = *reinterpret_cast(b + 8); - return (Aq == Bq) & ((Ad & 0x00ffffff) == (Bd & 0x00ffffff)); -} - -bool FOLLY_ALWAYS_INLINE memcmp12(const char* a, const char* b) { - const uint64_t Aq = *reinterpret_cast(a); - const uint64_t Bq = *reinterpret_cast(b); - const uint32_t Ad = *reinterpret_cast(a + 8); - const uint32_t Bd = *reinterpret_cast(b + 8); - return (Aq == Bq) & (Ad == Bd); -} - -} // namespace facebook::velox::simd diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h index 8739d3e1005c..b16a2a57daf2 100644 --- a/velox/common/base/SimdUtil-inl.h +++ b/velox/common/base/SimdUtil-inl.h @@ -1437,80 +1437,36 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) { } namespace detail { -template -T clearLeftmostSet(const T value) { - assert(value != 0); - - return value & (value - 1); -} - -template -unsigned FOLLY_ALWAYS_INLINE getFirstBitSet(const T value) { - assert(value != 0); - - return __builtin_ctz(value); -} - -template <> -unsigned FOLLY_ALWAYS_INLINE getFirstBitSet(const uint64_t value) { - assert(value != 0); - - return __builtin_ctzl(value); -} -#if XSIMD_WITH_AVX2 -// AVX2 is faster than sse2 -#define SIMD_STRSTR -using CharVector = xsimd::batch; +#if XSIMD_WITH_SSE4_2 +using CharVector = xsimd::batch; #elif XSIMD_WITH_NEON -#define SIMD_STRSTR using CharVector = xsimd::batch; #endif -#ifdef SIMD_STRSTR -size_t FOLLY_ALWAYS_INLINE -smidStrstrAnysize(const char* s, size_t n, const char* needle, size_t k) { - const auto first = CharVector::broadcast(needle[0]); - const auto last = CharVector::broadcast(needle[k - 1]); - - for (size_t i = 0; i < n; i += CharVector::size) { - const auto block_first = CharVector::load_unaligned(s + i); - const auto block_last = CharVector::load_unaligned(s + i + k - 1); - - const auto eq_first = (first == block_first); - const auto eq_last = (last == block_last); - - auto mask = toBitMask(eq_first && eq_last); - ; - - while (mask != 0) { - const auto bitpos = detail::getFirstBitSet(mask); - - if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) { - return i + bitpos; - } - - mask = detail::clearLeftmostSet(mask); - } - } - - return std::string::npos; +const int kPageSize = sysconf(_SC_PAGESIZE); +FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) { + return ((kPageSize - 1) & reinterpret_cast(ptr)) <= + kPageSize - CharVector::size; } -template +template size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( const char* s, size_t n, const char* needle, - MEMCMP memcmp_fun) { - assert(k > 0); - assert(n > 0); + size_t needleSize) { + VELOX_DCHECK(k > 0); + VELOX_DCHECK(n > 0); auto first = CharVector::broadcast(needle[0]); - auto last = CharVector::broadcast(needle[k - 1]); - for (size_t i = 0; i < n; i += CharVector::size) { + auto last = CharVector::broadcast(needle[needleSize - 1]); + size_t i = 0; + for (; i < n - needleSize && pageSafe(s + i + needleSize - 1) && + pageSafe(s + i); + i += CharVector::size) { auto block_first = CharVector::load_unaligned(s + i); - auto block_last = CharVector::load_unaligned(s + i + k - 1); + auto block_last = CharVector::load_unaligned(s + i + needleSize - 1); const auto eq_first = (first == block_first); const auto eq_last = (last == block_last); @@ -1518,20 +1474,36 @@ size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( auto mask = toBitMask(eq_first && eq_last); while (mask != 0) { - const auto bitpos = detail::getFirstBitSet(mask); + const auto bitpos = __builtin_ctzl(mask); - if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { - return i + bitpos; + if constexpr (compiled) { + if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2)) { + return i + bitpos; + } + } else { + if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2)) { + return i + bitpos; + } + } + mask = mask & (mask - 1); + } + } + for (; i < n - needleSize; ++i) { + if constexpr (compiled) { + if (memcmp(s + i, needle, compiledNeedleSize) == 0) { + return i; + } + } else { + if (memcmp(s + i, needle, needleSize)) { + return i; } - - mask = detail::clearLeftmostSet(mask); } } return std::string::npos; }; + } // namespace detail -#endif /// A faster implementation for c_strstr(), about 2x faster than string_view`s /// find(), proved by TpchLikeBenchmark. Use xsmid-batch to compare first&&last @@ -1539,7 +1511,6 @@ size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( /// will be a little faster. size_t FOLLY_ALWAYS_INLINE simdStrstr(const char* s, size_t n, const char* needle, size_t k) { -#ifdef SIMD_STRSTR size_t result = std::string::npos; if (n < k) { @@ -1557,54 +1528,51 @@ simdStrstr(const char* s, size_t n, const char* needle, size_t k) { } case 2: - result = detail::smidStrstrMemcmp<2>(s, n, needle, alwaysTrue); + result = detail::smidStrstrMemcmp(s, n, needle, 2); break; case 3: - result = detail::smidStrstrMemcmp<3>(s, n, needle, memcmp1); + result = detail::smidStrstrMemcmp(s, n, needle, 3); break; case 4: - result = detail::smidStrstrMemcmp<4>(s, n, needle, memcmp2); + result = detail::smidStrstrMemcmp(s, n, needle, 4); break; case 5: - // Note: use memcmp4 rather memcmp3 for align, as the last character - // of needle is already proven to be equal - result = detail::smidStrstrMemcmp<5>(s, n, needle, memcmp4); + result = detail::smidStrstrMemcmp(s, n, needle, 5); break; case 6: - result = detail::smidStrstrMemcmp<6>(s, n, needle, memcmp4); + result = detail::smidStrstrMemcmp(s, n, needle, 6); break; case 7: - result = detail::smidStrstrMemcmp<7>(s, n, needle, memcmp5); + result = detail::smidStrstrMemcmp(s, n, needle, 7); break; case 8: - result = detail::smidStrstrMemcmp<8>(s, n, needle, memcmp6); + result = detail::smidStrstrMemcmp(s, n, needle, 8); break; case 9: - // Note: use memcmp8 rather memcmp7 for the same reason as above. - result = detail::smidStrstrMemcmp<9>(s, n, needle, memcmp8); + result = detail::smidStrstrMemcmp(s, n, needle, 9); break; case 10: - result = detail::smidStrstrMemcmp<10>(s, n, needle, memcmp8); + result = detail::smidStrstrMemcmp(s, n, needle, 10); break; case 11: - result = detail::smidStrstrMemcmp<11>(s, n, needle, memcmp9); + result = detail::smidStrstrMemcmp(s, n, needle, 11); break; case 12: - result = detail::smidStrstrMemcmp<12>(s, n, needle, memcmp10); + result = detail::smidStrstrMemcmp(s, n, needle, 12); break; default: - result = detail::smidStrstrAnysize(s, n, needle, k); + result = detail::smidStrstrMemcmp(s, n, needle, k); break; } @@ -1615,10 +1583,6 @@ simdStrstr(const char* s, size_t n, const char* needle, size_t k) { } else { return std::string::npos; } -#else - // Generic path for string search. - return std::string_view(s, n).find(std::string_view(needle, k)); -#endif } } // namespace facebook::velox::simd diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h index c26732dfc3a7..71a37ced6131 100644 --- a/velox/common/base/SimdUtil.h +++ b/velox/common/base/SimdUtil.h @@ -19,7 +19,6 @@ #include #include "velox/common/base/BitUtil.h" #include "velox/common/base/Exceptions.h" -#include "velox/common/base/FixedMemCompare.h" #include #include