From c40de5a5e524d8a16c05495151107a2ada215656 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 3 Feb 2026 12:46:44 -0800 Subject: [PATCH 1/2] Fix storage alignment --- include/cuco/bucket_storage.cuh | 28 +++-- .../cuco/detail/storage/bucket_storage.inl | 18 ++- tests/utility/storage_test.cu | 109 +++++++++++++++++- 3 files changed, 138 insertions(+), 17 deletions(-) diff --git a/include/cuco/bucket_storage.cuh b/include/cuco/bucket_storage.cuh index c735a3092..46a507cdb 100644 --- a/include/cuco/bucket_storage.cuh +++ b/include/cuco/bucket_storage.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,12 +21,12 @@ #include #include +#include #include #include #include #include -#include #include namespace cuco { @@ -40,14 +40,14 @@ namespace cuco { template > class bucket_storage_ref { public: - static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket - static constexpr std::size_t alignment = - cuda::std::min(sizeof(T) * bucket_size, std::size_t{16}); ///< Required alignment + static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket + using bucket_type = cuda::std::array; ///< Slot bucket type + static constexpr std::size_t alignment = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + std::size_t{16}); ///< Required alignment using extent_type = Extent; ///< Storage extent type using size_type = typename extent_type::value_type; ///< Storage size type using value_type = T; ///< Slot type - using bucket_type = cuda::std::array; ///< Slot bucket type /** * @brief Constructor of slot storage ref. @@ -150,7 +150,7 @@ class bucket_storage { using value_type = T; ///< Slot type using bucket_type = cuda::std::array; ///< Slot bucket type - /// Type of the allocator to (de)allocate buckets + /// Type of the allocator to (de)allocate slots using allocator_type = typename std::allocator_traits::template rebind_alloc; using ref_type = bucket_storage_ref; ///< Storage ref type @@ -244,13 +244,19 @@ class bucket_storage { [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept; private: - using slot_deleter_type = - detail::custom_deleter; ///< Type of slot deleter + struct aligned_deleter { + value_type* raw_ptr_; + std::size_t size_; + allocator_type& allocator_; + cuda::stream_ref stream_; + + void operator()(value_type*) const { allocator_.deallocate(raw_ptr_, size_, stream_); } + }; extent_type extent_; ///< Storage extent allocator_type allocator_; ///< Allocator used to (de)allocate slots - /// Pointer to the slot storage - std::unique_ptr slots_; + /// Pointer to the aligned slot storage + std::unique_ptr slots_; }; } // namespace cuco diff --git a/include/cuco/detail/storage/bucket_storage.inl b/include/cuco/detail/storage/bucket_storage.inl index 40cb9f84b..9cd91d95d 100644 --- a/include/cuco/detail/storage/bucket_storage.inl +++ b/include/cuco/detail/storage/bucket_storage.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,9 +23,11 @@ #include #include #include +#include #include #include +#include namespace cuco { @@ -98,10 +100,16 @@ bucket_storage_ref::extent() const noexcept template constexpr bucket_storage::bucket_storage( Extent size, Allocator const& allocator, cuda::stream_ref stream) - : extent_{size}, - allocator_{allocator}, - slots_{allocator_.allocate(capacity(), stream), - slot_deleter_type{capacity(), allocator_, stream}} + : extent_{size}, allocator_{allocator}, slots_{[this, &stream]() { + constexpr std::size_t align = ref_type::alignment; + constexpr std::size_t extra = (align - 1) / sizeof(value_type) + 1; + std::size_t const alloc_size = static_cast(capacity()) + extra; + auto* const raw_ptr = allocator_.allocate(alloc_size, stream); + auto* const aligned_ptr = reinterpret_cast( + (reinterpret_cast(raw_ptr) + align - 1) & ~(align - 1)); + return std::unique_ptr{ + aligned_ptr, aligned_deleter{raw_ptr, alloc_size, allocator_, stream}}; + }()} { } diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index 87120c44a..5b15d7e44 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,12 @@ #include #include +#include + #include +#include + TEMPLATE_TEST_CASE_SIG("utility storage tests", "", ((typename Key, typename Value), Key, Value), @@ -96,4 +100,107 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests", STATIC_REQUIRE(num_buckets == size / bucket_size); STATIC_REQUIRE(capacity == gold_capacity); } + + SECTION("Storage alignment constant is correct for pairs.") + { + using storage_ref_type = + cuco::bucket_storage_ref, bucket_size, cuco::extent>; + using bucket_type = typename storage_ref_type::bucket_type; + + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected_align = + cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16}); + + STATIC_REQUIRE(alignment == expected_align); + STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); + } + + SECTION("Storage alignment constant is correct for keys.") + { + using storage_ref_type = cuco::bucket_storage_ref>; + using bucket_type = typename storage_ref_type::bucket_type; + + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected_align = + cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16}); + + STATIC_REQUIRE(alignment == expected_align); + STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); + } + + SECTION("Storage data pointer is aligned to bucket boundary for pairs.") + { + auto s = cuco::bucket_storage, + bucket_size, + cuco::extent, + allocator_type>( + cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}}); + + auto const ptr = reinterpret_cast(s.data()); + auto const alignment = decltype(s)::ref_type::alignment; + + REQUIRE((ptr % alignment) == 0); + } + + SECTION("Storage data pointer is aligned to bucket boundary for keys.") + { + auto s = cuco::bucket_storage, allocator_type>( + cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}}); + + auto const ptr = reinterpret_cast(s.data()); + auto const alignment = decltype(s)::ref_type::alignment; + + REQUIRE((ptr % alignment) == 0); + } +} + +TEMPLATE_TEST_CASE_SIG("bucket storage alignment with different bucket sizes", + "", + ((typename T, int BucketSize), T, BucketSize), + (int32_t, 1), + (int32_t, 2), + (int32_t, 4), + (int64_t, 1), + (int64_t, 2), + (cuco::pair, 1), + (cuco::pair, 2), + (cuco::pair, 1)) +{ + constexpr std::size_t size{1'000}; + + using allocator_type = cuco::cuda_allocator; + using storage_type = + cuco::bucket_storage, allocator_type>; + using storage_ref_type = typename storage_type::ref_type; + using bucket_type = typename storage_ref_type::bucket_type; + + auto allocator = allocator_type{}; + + SECTION("Alignment constant is power of 2 and capped at 16.") + { + constexpr auto alignment = storage_ref_type::alignment; + + STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); + STATIC_REQUIRE(alignment <= 16); + STATIC_REQUIRE(alignment >= sizeof(T)); + } + + SECTION("Alignment matches expected value.") + { + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected = + cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16}); + + STATIC_REQUIRE(alignment == expected); + } + + SECTION("Data pointer is aligned to bucket boundary.") + { + auto s = storage_type(cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}}); + + auto const ptr = reinterpret_cast(s.data()); + auto const alignment = storage_ref_type::alignment; + + REQUIRE((ptr % alignment) == 0); + } } From 22028e6032af136975d17e6f01dc1485636606cc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 5 Feb 2026 11:57:34 -0800 Subject: [PATCH 2/2] Minor fixes --- include/cuco/bucket_storage.cuh | 13 +++++++++---- tests/utility/storage_test.cu | 31 ++++++------------------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/include/cuco/bucket_storage.cuh b/include/cuco/bucket_storage.cuh index 46a507cdb..9d99e3d7b 100644 --- a/include/cuco/bucket_storage.cuh +++ b/include/cuco/bucket_storage.cuh @@ -30,6 +30,7 @@ #include namespace cuco { + /** * @brief Non-owning array of slots storage reference type. * @@ -40,10 +41,14 @@ namespace cuco { template > class bucket_storage_ref { public: - static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket - using bucket_type = cuda::std::array; ///< Slot bucket type - static constexpr std::size_t alignment = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), - std::size_t{16}); ///< Required alignment + static constexpr int32_t bucket_size = BucketSize; ///< Number of elements per bucket + static constexpr std::size_t max_vector_load_bytes = 16; ///< Maximum vector load width in bytes + + using bucket_type = cuda::std::array; ///< Slot bucket type + + static constexpr std::size_t alignment = + cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + max_vector_load_bytes); ///< Required alignment in bytes using extent_type = Extent; ///< Storage extent type using size_type = typename extent_type::value_type; ///< Storage size type diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index 5b15d7e44..cddfe9ba3 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -107,9 +107,9 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests", cuco::bucket_storage_ref, bucket_size, cuco::extent>; using bucket_type = typename storage_ref_type::bucket_type; - constexpr auto alignment = storage_ref_type::alignment; - constexpr auto expected_align = - cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16}); + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected_align = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + storage_ref_type::max_vector_load_bytes); STATIC_REQUIRE(alignment == expected_align); STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); @@ -120,9 +120,9 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests", using storage_ref_type = cuco::bucket_storage_ref>; using bucket_type = typename storage_ref_type::bucket_type; - constexpr auto alignment = storage_ref_type::alignment; - constexpr auto expected_align = - cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16}); + constexpr auto alignment = storage_ref_type::alignment; + constexpr auto expected_align = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), + storage_ref_type::max_vector_load_bytes); STATIC_REQUIRE(alignment == expected_align); STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); @@ -172,28 +172,9 @@ TEMPLATE_TEST_CASE_SIG("bucket storage alignment with different bucket sizes", using storage_type = cuco::bucket_storage, allocator_type>; using storage_ref_type = typename storage_type::ref_type; - using bucket_type = typename storage_ref_type::bucket_type; auto allocator = allocator_type{}; - SECTION("Alignment constant is power of 2 and capped at 16.") - { - constexpr auto alignment = storage_ref_type::alignment; - - STATIC_REQUIRE(cuda::std::has_single_bit(alignment)); - STATIC_REQUIRE(alignment <= 16); - STATIC_REQUIRE(alignment >= sizeof(T)); - } - - SECTION("Alignment matches expected value.") - { - constexpr auto alignment = storage_ref_type::alignment; - constexpr auto expected = - cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16}); - - STATIC_REQUIRE(alignment == expected); - } - SECTION("Data pointer is aligned to bucket boundary.") { auto s = storage_type(cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});