diff --git a/include/cuco/bucket_storage.cuh b/include/cuco/bucket_storage.cuh index c735a3092..9d99e3d7b 100644 --- a/include/cuco/bucket_storage.cuh +++ b/include/cuco/bucket_storage.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,15 +21,16 @@ #include #include +#include #include #include #include #include -#include #include namespace cuco { + /** * @brief Non-owning array of slots storage reference type. * @@ -40,14 +41,18 @@ namespace cuco { template > class bucket_storage_ref { public: - static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket + static constexpr int32_t bucket_size = BucketSize; ///< Number of elements per bucket + static constexpr std::size_t max_vector_load_bytes = 16; ///< Maximum vector load width in bytes + + using bucket_type = cuda::std::array; ///< Slot bucket type + static constexpr std::size_t alignment = - cuda::std::min(sizeof(T) * bucket_size, std::size_t{16}); ///< Required alignment + cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + max_vector_load_bytes); ///< Required alignment in bytes using extent_type = Extent; ///< Storage extent type using size_type = typename extent_type::value_type; ///< Storage size type using value_type = T; ///< Slot type - using bucket_type = cuda::std::array; ///< Slot bucket type /** * @brief Constructor of slot storage ref. @@ -150,7 +155,7 @@ class bucket_storage { using value_type = T; ///< Slot type using bucket_type = cuda::std::array; ///< Slot bucket type - /// Type of the allocator to (de)allocate buckets + /// Type of the allocator to (de)allocate slots using allocator_type = typename std::allocator_traits::template rebind_alloc; using ref_type = bucket_storage_ref; ///< Storage ref type @@ -244,13 +249,19 @@ class bucket_storage { [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept; private: - using slot_deleter_type = - detail::custom_deleter; ///< Type of slot deleter + struct aligned_deleter { + value_type* raw_ptr_; + std::size_t size_; + allocator_type& allocator_; + cuda::stream_ref stream_; + + void operator()(value_type*) const { allocator_.deallocate(raw_ptr_, size_, stream_); } + }; extent_type extent_; ///< Storage extent allocator_type allocator_; ///< Allocator used to (de)allocate slots - /// Pointer to the slot storage - std::unique_ptr slots_; + /// Pointer to the aligned slot storage + std::unique_ptr slots_; }; } // namespace cuco diff --git a/include/cuco/detail/storage/bucket_storage.inl b/include/cuco/detail/storage/bucket_storage.inl index 40cb9f84b..9cd91d95d 100644 --- a/include/cuco/detail/storage/bucket_storage.inl +++ b/include/cuco/detail/storage/bucket_storage.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,9 +23,11 @@ #include #include #include +#include #include #include +#include namespace cuco { @@ -98,10 +100,16 @@ bucket_storage_ref::extent() const noexcept template constexpr bucket_storage::bucket_storage( Extent size, Allocator const& allocator, cuda::stream_ref stream) - : extent_{size}, - allocator_{allocator}, - slots_{allocator_.allocate(capacity(), stream), - slot_deleter_type{capacity(), allocator_, stream}} + : extent_{size}, allocator_{allocator}, slots_{[this, &stream]() { + constexpr std::size_t align = ref_type::alignment; + constexpr std::size_t extra = (align - 1) / sizeof(value_type) + 1; + std::size_t const alloc_size = static_cast(capacity()) + extra; + auto* const raw_ptr = allocator_.allocate(alloc_size, stream); + auto* const aligned_ptr = reinterpret_cast( + (reinterpret_cast(raw_ptr) + align - 1) & ~(align - 1)); + return std::unique_ptr{ + aligned_ptr, aligned_deleter{raw_ptr, alloc_size, allocator_, stream}}; + }()} { } diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index 87120c44a..cddfe9ba3 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,12 @@ #include #include +#include + #include +#include + TEMPLATE_TEST_CASE_SIG("utility storage tests", "", ((typename Key, typename Value), Key, Value), @@ -96,4 +100,88 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests", STATIC_REQUIRE(num_buckets == size / bucket_size); STATIC_REQUIRE(capacity == gold_capacity); } + + SECTION("Storage alignment constant is correct for pairs.") + { + using storage_ref_type = + cuco::bucket_storage_ref, bucket_size, cuco::extent>; + using bucket_type = typename storage_ref_type::bucket_type; + + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected_align = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + storage_ref_type::max_vector_load_bytes); + + STATIC_REQUIRE(alignment == expected_align); + STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); + } + + SECTION("Storage alignment constant is correct for keys.") + { + using storage_ref_type = cuco::bucket_storage_ref>; + using bucket_type = typename storage_ref_type::bucket_type; + + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected_align = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + storage_ref_type::max_vector_load_bytes); + + STATIC_REQUIRE(alignment == expected_align); + STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); + } + + SECTION("Storage data pointer is aligned to bucket boundary for pairs.") + { + auto s = cuco::bucket_storage, + bucket_size, + cuco::extent, + allocator_type>( + cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}}); + + auto const ptr = reinterpret_cast(s.data()); + auto const alignment = decltype(s)::ref_type::alignment; + + REQUIRE((ptr % alignment) == 0); + } + + SECTION("Storage data pointer is aligned to bucket boundary for keys.") + { + auto s = cuco::bucket_storage, allocator_type>( + cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}}); + + auto const ptr = reinterpret_cast(s.data()); + auto const alignment = decltype(s)::ref_type::alignment; + + REQUIRE((ptr % alignment) == 0); + } +} + +TEMPLATE_TEST_CASE_SIG("bucket storage alignment with different bucket sizes", + "", + ((typename T, int BucketSize), T, BucketSize), + (int32_t, 1), + (int32_t, 2), + (int32_t, 4), + (int64_t, 1), + (int64_t, 2), + (cuco::pair, 1), + (cuco::pair, 2), + (cuco::pair, 1)) +{ + constexpr std::size_t size{1'000}; + + using allocator_type = cuco::cuda_allocator; + using storage_type = + cuco::bucket_storage, allocator_type>; + using storage_ref_type = typename storage_type::ref_type; + + auto allocator = allocator_type{}; + + SECTION("Data pointer is aligned to bucket boundary.") + { + auto s = storage_type(cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}}); + + auto const ptr = reinterpret_cast(s.data()); + auto const alignment = storage_ref_type::alignment; + + REQUIRE((ptr % alignment) == 0); + } }