diff --git a/include/cuco/bucket_storage.cuh b/include/cuco/bucket_storage.cuh
index c735a3092..9d99e3d7b 100644
--- a/include/cuco/bucket_storage.cuh
+++ b/include/cuco/bucket_storage.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,15 +21,16 @@
 #include <cuco/utility/allocator.hpp>
 
 #include <cuda/std/array>
+#include <cuda/std/bit>
 #include <cuda/std/functional>
 #include <cuda/stream_ref>
 
 #include <cstddef>
 #include <cstdint>
-#include <iterator>
 #include <memory>
 
 namespace cuco {
+
 /**
  * @brief Non-owning array of slots storage reference type.
  *
@@ -40,14 +41,18 @@ namespace cuco {
 template <typename T, int32_t BucketSize, typename Extent = cuco::extent<std::size_t>>
 class bucket_storage_ref {
  public:
-  static constexpr int32_t bucket_size = BucketSize;  ///< Number of elements processed per bucket
+  static constexpr int32_t bucket_size = BucketSize;        ///< Number of elements per bucket
+  static constexpr std::size_t max_vector_load_bytes = 16;  ///< Maximum vector load width in bytes
+
+  using bucket_type = cuda::std::array<T, BucketSize>;  ///< Slot bucket type
+
   static constexpr std::size_t alignment =
-    cuda::std::min(sizeof(T) * bucket_size, std::size_t{16});  ///< Required alignment
+    cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)),
+                   max_vector_load_bytes);  ///< Required alignment in bytes
 
   using extent_type = Extent;                            ///< Storage extent type
   using size_type   = typename extent_type::value_type;  ///< Storage size type
   using value_type  = T;                                 ///< Slot type
-  using bucket_type = cuda::std::array<T, BucketSize>;   ///< Slot bucket type
 
   /**
    * @brief Constructor of slot storage ref.
@@ -150,7 +155,7 @@ class bucket_storage {
   using value_type  = T;                                 ///< Slot type
   using bucket_type = cuda::std::array<T, BucketSize>;   ///< Slot bucket type
 
-  /// Type of the allocator to (de)allocate buckets
+  /// Type of the allocator to (de)allocate slots
   using allocator_type =
     typename std::allocator_traits<Allocator>::template rebind_alloc<value_type>;
   using ref_type = bucket_storage_ref<value_type, bucket_size, extent_type>;  ///< Storage ref type
@@ -244,13 +249,19 @@ class bucket_storage {
   [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept;
 
  private:
-  using slot_deleter_type =
-    detail::custom_deleter<size_type, allocator_type>;  ///< Type of slot deleter
+  struct aligned_deleter {
+    value_type* raw_ptr_;
+    std::size_t size_;
+    allocator_type& allocator_;
+    cuda::stream_ref stream_;
+
+    void operator()(value_type*) const { allocator_.deallocate(raw_ptr_, size_, stream_); }
+  };
 
   extent_type extent_;        ///< Storage extent
   allocator_type allocator_;  ///< Allocator used to (de)allocate slots
-  /// Pointer to the slot storage
-  std::unique_ptr<value_type, slot_deleter_type> slots_;
+  /// Pointer to the aligned slot storage
+  std::unique_ptr<value_type, aligned_deleter> slots_;
 };
 }  // namespace cuco
 
diff --git a/include/cuco/detail/storage/bucket_storage.inl b/include/cuco/detail/storage/bucket_storage.inl
index 40cb9f84b..9cd91d95d 100644
--- a/include/cuco/detail/storage/bucket_storage.inl
+++ b/include/cuco/detail/storage/bucket_storage.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,9 +23,11 @@
 #include <cub/device/device_for.cuh>
 #include <cuda/std/array>
 #include <cuda/std/bit>
+#include <cuda/std/cstdint>
 #include <cuda/stream_ref>
 
 #include <cassert>
+#include <memory>
 
 namespace cuco {
 
@@ -98,10 +100,16 @@ bucket_storage_ref<T, BucketSize, Extent>::extent() const noexcept
 template <typename T, int BucketSize, typename Extent, typename Allocator>
 constexpr bucket_storage<T, BucketSize, Extent, Allocator>::bucket_storage(
   Extent size, Allocator const& allocator, cuda::stream_ref stream)
-  : extent_{size},
-    allocator_{allocator},
-    slots_{allocator_.allocate(capacity(), stream),
-           slot_deleter_type{capacity(), allocator_, stream}}
+  : extent_{size}, allocator_{allocator}, slots_{[this, &stream]() {
+      constexpr std::size_t align  = ref_type::alignment;
+      constexpr std::size_t extra  = (align - 1) / sizeof(value_type) + 1;
+      std::size_t const alloc_size = static_cast<std::size_t>(capacity()) + extra;
+      auto* const raw_ptr          = allocator_.allocate(alloc_size, stream);
+      auto* const aligned_ptr      = reinterpret_cast<value_type*>(
+        (reinterpret_cast<cuda::std::uintptr_t>(raw_ptr) + align - 1) & ~(align - 1));
+      return std::unique_ptr<value_type, aligned_deleter>{
+        aligned_ptr, aligned_deleter{raw_ptr, alloc_size, allocator_, stream}};
+    }()}
 {
 }
 
diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu
index 87120c44a..cddfe9ba3 100644
--- a/tests/utility/storage_test.cu
+++ b/tests/utility/storage_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,8 +21,12 @@
 #include <cuco/pair.cuh>
 #include <cuco/utility/allocator.hpp>
 
+#include <cuda/std/bit>
+
 #include <catch2/catch_template_test_macros.hpp>
 
+#include <cstdint>
+
 TEMPLATE_TEST_CASE_SIG("utility storage tests",
                        "",
                        ((typename Key, typename Value), Key, Value),
@@ -96,4 +100,88 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests",
     STATIC_REQUIRE(num_buckets == size / bucket_size);
     STATIC_REQUIRE(capacity == gold_capacity);
   }
+
+  SECTION("Storage alignment constant is correct for pairs.")
+  {
+    using storage_ref_type =
+      cuco::bucket_storage_ref<cuco::pair<Key, Value>, bucket_size, cuco::extent<std::size_t>>;
+    using bucket_type = typename storage_ref_type::bucket_type;
+
+    constexpr auto alignment      = storage_ref_type::alignment;
+    constexpr auto expected_align = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)),
+                                                   storage_ref_type::max_vector_load_bytes);
+
+    STATIC_REQUIRE(alignment == expected_align);
+    STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
+  }
+
+  SECTION("Storage alignment constant is correct for keys.")
+  {
+    using storage_ref_type = cuco::bucket_storage_ref<Key, bucket_size, cuco::extent<std::size_t>>;
+    using bucket_type      = typename storage_ref_type::bucket_type;
+
+    constexpr auto alignment      = storage_ref_type::alignment;
+    constexpr auto expected_align = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)),
+                                                   storage_ref_type::max_vector_load_bytes);
+
+    STATIC_REQUIRE(alignment == expected_align);
+    STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
+  }
+
+  SECTION("Storage data pointer is aligned to bucket boundary for pairs.")
+  {
+    auto s = cuco::bucket_storage<cuco::pair<Key, Value>,
+                                  bucket_size,
+                                  cuco::extent<std::size_t>,
+                                  allocator_type>(
+      cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});
+
+    auto const ptr       = reinterpret_cast<std::uintptr_t>(s.data());
+    auto const alignment = decltype(s)::ref_type::alignment;
+
+    REQUIRE((ptr % alignment) == 0);
+  }
+
+  SECTION("Storage data pointer is aligned to bucket boundary for keys.")
+  {
+    auto s = cuco::bucket_storage<Key, bucket_size, cuco::extent<std::size_t>, allocator_type>(
+      cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});
+
+    auto const ptr       = reinterpret_cast<std::uintptr_t>(s.data());
+    auto const alignment = decltype(s)::ref_type::alignment;
+
+    REQUIRE((ptr % alignment) == 0);
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG("bucket storage alignment with different bucket sizes",
+                       "",
+                       ((typename T, int BucketSize), T, BucketSize),
+                       (int32_t, 1),
+                       (int32_t, 2),
+                       (int32_t, 4),
+                       (int64_t, 1),
+                       (int64_t, 2),
+                       (cuco::pair<int32_t, int32_t>, 1),
+                       (cuco::pair<int32_t, int32_t>, 2),
+                       (cuco::pair<int64_t, int64_t>, 1))
+{
+  constexpr std::size_t size{1'000};
+
+  using allocator_type = cuco::cuda_allocator<char>;
+  using storage_type =
+    cuco::bucket_storage<T, BucketSize, cuco::extent<std::size_t>, allocator_type>;
+  using storage_ref_type = typename storage_type::ref_type;
+
+  auto allocator = allocator_type{};
+
+  SECTION("Data pointer is aligned to bucket boundary.")
+  {
+    auto s = storage_type(cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});
+
+    auto const ptr       = reinterpret_cast<std::uintptr_t>(s.data());
+    auto const alignment = storage_ref_type::alignment;
+
+    REQUIRE((ptr % alignment) == 0);
+  }
 }