From 609ae19565ff6a3499168b76a0be5652762e24f6 Mon Sep 17 00:00:00 2001 From: Wayne Franz Date: Thu, 4 Apr 2024 16:30:25 -0400 Subject: [PATCH] Update compile-time shared memory usage check for device_partition (#543) * Update compile-time shared memory usage check for device_partition The device partition algorithm uses a default_select_config struct to detect which architecture we're running on. The default_select_config struct eventually causes the creation of a struct of type limit_block_size. This struct is used to detect if the launch configuration that's being used (block size and amount of shared memory per thread) will cause on the selected device to use more than 32 KiB of shared memory. If so, then limit_block_size attempts to reduce the block size (divides it by 2) and checks the shared memory usage again. If the element type is large enough, it is possible to get into a situation where, even if we use the minimum block size (a single wavefront of threads) and give the threads the minimum possible number of elements to work on (1 each), we will still use more than 32 KiB of shared memory. The limit_block_size struct assumes that the amount of shared memory that will be used is equal to the block size multiplied by the amount of memory required per thread. However, the device partition algorithm actually requires slightly more shared memory than this, because it does an extra allocation to store the lookback scan's state. It's not really feasible to move this lookback scan state out of shared memory because all threads in the block need access to it. This change modifies the limit_block_size struct so that it accepts an "ExtraSharedMemory" template parameter, and updates the shared memory check it performs so that it takes this value into account. It also updates the device partition's config-creating code so that it passes in the size of the lookback scan state. * Add device partition unit test to check behaviour around shared memory limit Test the edge case where the data passed to the device partition algorithm will consume the maximum allowable amount of shared memory. Since the algorithm itself also requires some shared memory to store state, this should push us over the max limit. In this case, the block size should be reduced to compensate. --------- Co-authored-by: root --- .../include/rocprim/device/config_types.hpp | 13 +- .../rocprim/device/device_partition.hpp | 11 +- .../rocprim/device/device_select_config.hpp | 28 ++-- test/rocprim/test_device_partition.cpp | 157 ++++++++++++++++++ 4 files changed, 187 insertions(+), 22 deletions(-) diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp index 32bdb5cd6..484db8348 100644 --- a/rocprim/include/rocprim/device/config_types.hpp +++ b/rocprim/include/rocprim/device/config_types.hpp @@ -100,10 +100,13 @@ template< unsigned int SharedMemoryPerThread, // Most kernels require block sizes not smaller than warp unsigned int MinBlockSize, + // If kernels require more than MaxBlockSize * SharedMemoryPerThread bytes + // (eg. to store some kind of block-wide state), that size can be specified here + unsigned int ExtraSharedMemory = 0, // Can fit in shared memory? // Although GPUs have 64KiB, 32KiB is used here as a "soft" limit, // because some additional memory may be required in kernels - bool = (MaxBlockSize * SharedMemoryPerThread <= (1u << 15)) + bool = (MaxBlockSize * SharedMemoryPerThread + ExtraSharedMemory <= (1u << 15)) > struct limit_block_size { @@ -112,16 +115,18 @@ struct limit_block_size limit_block_size< detail::next_power_of_two(MaxBlockSize) / 2, SharedMemoryPerThread, - MinBlockSize + MinBlockSize, + ExtraSharedMemory >::value; }; template< unsigned int MaxBlockSize, unsigned int SharedMemoryPerThread, - unsigned int MinBlockSize + unsigned int MinBlockSize, + unsigned int ExtraSharedMemory > -struct limit_block_size +struct limit_block_size { static_assert(MaxBlockSize >= MinBlockSize, "Data is too large, it cannot fit in shared memory"); diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp index 3b89a3ed1..56394c631 100644 --- a/rocprim/include/rocprim/device/device_partition.hpp +++ b/rocprim/include/rocprim/device/device_partition.hpp @@ -149,15 +149,18 @@ hipError_t partition_impl(void * temporary_storage, using key_type = typename std::iterator_traits::value_type; using value_type = typename std::iterator_traits::value_type; + using offset_scan_state_type = detail::lookback_scan_state; + using offset_scan_state_with_sleep_type = detail::lookback_scan_state; + // Get default config if Config is default_config using config = default_or_custom_config< Config, - default_select_config + // Note: the partition algorithm requires some extra shared memory space for an instance of + // offset_scan_state_type. Pass it's size to default_select_config here so that it can select + // an appropriate block size (one that ensures that we don't run out of shared memory). + default_select_config >; - using offset_scan_state_type = detail::lookback_scan_state; - using offset_scan_state_with_sleep_type = detail::lookback_scan_state; - static constexpr unsigned int block_size = config::block_size; static constexpr unsigned int items_per_thread = config::items_per_thread; static constexpr auto items_per_block = block_size * items_per_thread; diff --git a/rocprim/include/rocprim/device/device_select_config.hpp b/rocprim/include/rocprim/device/device_select_config.hpp index b7c89eb72..7f90abc81 100644 --- a/rocprim/include/rocprim/device/device_select_config.hpp +++ b/rocprim/include/rocprim/device/device_select_config.hpp @@ -76,14 +76,14 @@ struct select_config namespace detail { -template +template struct select_config_803 { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value, + limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value, ::rocprim::max(1u, 13u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -92,14 +92,14 @@ struct select_config_803 >; }; -template +template struct select_config_900 { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value, + limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value, ::rocprim::max(1u, 15u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -108,14 +108,14 @@ struct select_config_900 >; }; -template +template struct select_config_90a { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value, + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value, ::rocprim::max(1u, 15u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -124,14 +124,14 @@ struct select_config_90a >; }; -template +template struct select_config_1030 { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value, + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32, ExtraSharedMemory>::value, ::rocprim::max(1u, 15u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -141,15 +141,15 @@ struct select_config_1030 }; -template +template struct default_select_config : select_arch< TargetArch, - select_arch_case<803, select_config_803>, - select_arch_case<900, select_config_900>, - select_arch_case>, - select_arch_case<1030, select_config_1030>, - select_config_803 + select_arch_case<803, select_config_803>, + select_arch_case<900, select_config_900>, + select_arch_case>, + select_arch_case<1030, select_config_1030>, + select_config_803 > { }; } // end namespace detail diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp index 77f2d5629..627d50e72 100644 --- a/test/rocprim/test_device_partition.cpp +++ b/test/rocprim/test_device_partition.cpp @@ -1464,3 +1464,160 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionThreeWay) HIP_CHECK(hipStreamDestroy(stream)); } } + +// This test checks to make sure that the block size is reduced correctly +// when our data size and type are set in a way that we will exceed the shared +// memory limit. Since the block size calculation is done at compile time, +// if the block size is not correctly reduced, this test will fail to compile. +TEST(RocprimDevicePartitionBlockSizeTests, BlockSize) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + // Create a large struct to test with. It must be big enough that when + // we the use default block size (defined in rocprim::default_select_config + // struct as 256), giving one instance to each thread will cause us to hit + // 32 KiB of shared memory (the limit enforced by the rocprim::limit_block_size + // struct's boolean template parameter). Since the device_partition algorithm also + // uses some shared memory to store state, this will cause the total usage to exceed + // the 32 KiB limit. If everything's working correctly, this should be detected in + // the limit_block_size's template logic, and it should reduce the block size. + const size_t test_obj_size = 128; // Choose 128, since 256 * 128 = 2^15 bytes (32 KiB). + struct TestObject + { + unsigned char data[test_obj_size]; + + bool operator==(const TestObject& other) const + { + bool equal = true; + for (size_t i = 0; equal && i < test_obj_size; i++) + equal = data[i] == other.data[i]; + + return equal; + } + }; + + using T = TestObject; // input data type + using U = TestObject; // output data type + const bool debug_synchronous = false; + const hipStream_t stream = 0; // default stream + + auto select_op = [] __host__ __device__ (const T& value) -> bool + { + // The data values are in [0, 255]. Partition on the midpoint. + if(value.data[0] == 128) return true; + return false; + }; + + // Use some power of two and off-by-one-from-power-of-two data sizes. + const std::vector sizes = {256, 257, 511, 512, 1024, 1025}; + + for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + for(auto size : sizes) + { + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector input_data = test_utils::get_random_data(size * test_obj_size, 0, 255, seed_value); + std::vector input(size); + for (size_t i = 0; i < size; i++) + memcpy(input[i].data, input_data.data() + i * test_obj_size, test_obj_size); + + T * d_input; + U * d_output; + unsigned int * d_selected_count_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK( + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + + // Calculate expected_selected and expected_rejected results on host + std::vector expected_selected; + std::vector expected_rejected; + expected_selected.reserve(input.size()/2); + expected_rejected.reserve(input.size()/2); + for(size_t i = 0; i < input.size(); i++) + { + if(select_op(input[i])) + { + expected_selected.push_back(input[i]); + } + else + { + expected_rejected.push_back(input[i]); + } + } + std::reverse(expected_rejected.begin(), expected_rejected.end()); + + // temp storage + size_t temp_storage_size_bytes; + // Get size of d_temp_storage + HIP_CHECK(rocprim::partition( + nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run + HIP_CHECK(rocprim::partition( + d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); + + HIP_CHECK(hipDeviceSynchronize()); + + // Check if number of selected value is as expected_selected + unsigned int selected_count_output = 0; + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); + ASSERT_EQ(selected_count_output, expected_selected.size()); + + // Check if output values are as expected_selected + std::vector output(input.size()); + HIP_CHECK(hipMemcpy(output.data(), + d_output, + output.size() * sizeof(U), + hipMemcpyDeviceToHost)); + + std::vector output_rejected; + for(size_t i = 0; i < expected_rejected.size(); i++) + { + auto j = i + expected_selected.size(); + output_rejected.push_back(output[j]); + } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected_selected, expected_selected.size())); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_rejected, expected_rejected, expected_rejected.size())); + + hipFree(d_input); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); + } + } +} \ No newline at end of file