From c513a27416f8a22312e2991b80751b98a53fb47c Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 9 Mar 2026 15:40:05 -0700
Subject: [PATCH 1/3] Revert "Revert "Use native NVSHMEM synchronization APIs
 in NVSHMEM backends (#107)" (#110)"

This reverts commit e4012426d9a968fd3898ac5da151489d3a496343.

Signed-off-by: Josh Romero <joshr@nvidia.com>
---
 include/internal/comm_routines.h | 115 +++++++++++++++----------------
 include/internal/common.h        |   1 +
 include/internal/transpose.h     |  30 +++++---
 src/autotune.cc                  |  20 ++++++
 src/cudecomp.cc                  |  10 +++
 5 files changed, 105 insertions(+), 71 deletions(-)
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 9d7ab45..321be0e 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -95,10 +95,13 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
                  const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
                  T* recv_buff, const std::vector<comm_count_t>& recv_counts,
                  const std::vector<comm_count_t>& recv_offsets, cudecompCommAxis comm_axis, cudaStream_t stream) {
-  auto comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
-  auto comm = comm_info.mpi_comm;
+  auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
   auto team = comm_info.nvshmem_team;
   int self_rank = comm_info.rank;
+  auto aux_stream = handle->streams[handle->device_p2p_ce_count];
+
+  // Enforce sync dependency between transpose operations
+  CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->nvshmem_sync_event));
 
   // Event dependency on external stream for intra-group transfers
   CHECK_CUDA(cudaEventRecord(grid_desc->events[0], stream));
@@ -106,15 +109,9 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
     CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
   }
 
-  // Using cudaEventSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
-  CHECK_CUDA(cudaEventSynchronize(grid_desc->nvshmem_sync_event));
-  CHECK_MPI(MPI_Barrier(comm));
-  // nvshmemx_team_sync_on_stream(team, stream);
-
   cudecompNvshmemA2AParams<T> params;
 
   // Inter-group transfers (non-blocking)
-  bool need_quiet = false;
   params.send_buff = send_buff;
   params.recv_buff = recv_buff;
   int count = 0;
@@ -134,13 +131,11 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
       params.ntransfers = count;
       cudecomp_nvshmem_alltoallv(params, stream);
       count = 0;
-      need_quiet = true;
     }
   }
   if (count != 0) {
     params.ntransfers = count;
     cudecomp_nvshmem_alltoallv(params, stream);
-    need_quiet = true;
   }
 
   // Intra-group transfers (blocking, scheduled after non-blocking inter-group transfers for concurrency)
@@ -151,19 +146,19 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
     int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) {
 
-      if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 &&
+      if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 && count != 0 &&
           count % CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ == 0) {
         // For single group, single P2P CE (e.g. NVSwitch), synchronize NVSHMEM team every
         // CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ transfers This helps reduce CE contention due to accumulation of
         // jitter.
         for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
           CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i]));
-          CHECK_CUDA(cudaStreamWaitEvent(handle->streams[handle->device_p2p_ce_count], grid_desc->events[0], 0));
+          CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->events[0], 0));
         }
 
-        nvshmemx_team_sync_on_stream(team, handle->streams[handle->device_p2p_ce_count]);
+        nvshmemx_team_sync_on_stream(team, aux_stream);
 
-        CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[handle->device_p2p_ce_count]));
+        CHECK_CUDA(cudaEventRecord(grid_desc->events[0], aux_stream));
         for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
           CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
         }
@@ -186,12 +181,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
     CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[0], 0));
   }
 
-  if (need_quiet) { nvshmemx_quiet_on_stream(stream); }
-
-  // Using cudaStreamSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
-  CHECK_CUDA(cudaStreamSynchronize(stream));
-  CHECK_MPI(MPI_Barrier(comm));
-  // nvshmemx_team_sync_on_stream(team, stream);
+  nvshmemx_barrier_on_stream(team, stream);
 }
 #endif
 
@@ -237,7 +227,7 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
 #endif
   }
   case CUDECOMP_TRANSPOSE_COMM_NCCL: {
-    auto comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
+    auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
     // For fully intra-group alltoall, use distinct NCCL local comm instead of global comm as it is faster.
     auto comm = (comm_info.ngroups == 1) ? *grid_desc->nccl_local_comm : *grid_desc->nccl_comm;
 
@@ -367,7 +357,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
                           const std::vector<comm_count_t>& recv_offsets,
                           const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
                           const std::vector<int>& src_ranks, const std::vector<int>& dst_ranks, cudaStream_t stream,
-                          bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) {
+                          cudecompTransposePerformanceSample* current_sample = nullptr) {
 
   // If there are no transfers to complete, quick return
   if (send_counts.size() == 0 && recv_counts.size() == 0) { return; }
@@ -404,14 +394,17 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: {
 #ifdef ENABLE_NVSHMEM
     if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
-      auto comm =
-          (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.mpi_comm : grid_desc->col_comm_info.mpi_comm;
-      // auto team = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.nvshmem_team
-      //                                             : grid_desc->col_comm_info.nvshmem_team;
+      auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
       auto pl_stream = handle->streams[0];
+      auto aux_stream = handle->streams[handle->device_p2p_ce_count];
       int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank;
 
-      bool barrier = false;
+      // Enforce sync dependency between transpose operations
+      CHECK_CUDA(cudaStreamWaitEvent(pl_stream, grid_desc->nvshmem_sync_event));
+
+      bool need_quiet = false;
+
+      // Inter-group transfers and self-copy (non-blocking)
       for (int i = 0; i < src_ranks.size(); ++i) {
         int src_rank = src_ranks[i];
         int dst_rank = dst_ranks[i];
@@ -421,39 +414,44 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
           CHECK_CUDA(cudaMemcpyAsync(recv_buff + recv_offsets_nvshmem[self_rank], send_buff + send_offsets[self_rank],
                                      send_counts[self_rank] * sizeof(T), cudaMemcpyDeviceToDevice, stream));
         } else {
+          int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
+          if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; }
+
           CHECK_CUDA(cudaStreamWaitEvent(pl_stream, grid_desc->events[dst_rank], 0));
-          if (!synced) {
-            // Using cudaEventSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
-            CHECK_CUDA(cudaEventSynchronize(grid_desc->nvshmem_sync_event));
-            CHECK_MPI(MPI_Barrier(comm));
-            // Only need to sync on the first remote operation of an alltoall sequence to ensure reads on other ranks
-            // from previous communication have completed.
-            synced = true;
-          }
 
-          int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
-          nvshmemx_putmem_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
-                                        send_counts[dst_rank] * sizeof(T), dst_rank_global, pl_stream);
+          nvshmemx_putmem_signal_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank],
+                                               send_buff + send_offsets[dst_rank], send_counts[dst_rank] * sizeof(T),
+                                               &comm_info.nvshmem_signals[comm_info.rank], 1, NVSHMEM_SIGNAL_SET,
+                                               dst_rank_global, pl_stream);
 
-          barrier = true;
+          need_quiet = true;
         }
       }
 
-      if (barrier) {
-        nvshmemx_quiet_on_stream(pl_stream);
-        // Using cudaStreamSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
-        CHECK_CUDA(cudaStreamSynchronize(pl_stream));
-        CHECK_MPI(MPI_Barrier(comm));
-
-        // nvshmemx_team_sync_on_stream(team, pl_stream);
-        // for (int i = 0; i < src_ranks.size(); ++i) {
-        //  int src_rank = src_ranks[i];
-        //  int dst_rank = dst_ranks[i];
-        //  if (src_rank != self_rank) {
-        //    CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], pl_stream));
-        //    CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[dst_rank], 0));
-        //  }
-        //}
+      // Intra-group transfers (blocking, scheduled after non-blocking inter-group transfers for concurrency)
+      for (int i = 0; i < src_ranks.size(); ++i) {
+        int src_rank = src_ranks[i];
+        int dst_rank = dst_ranks[i];
+
+        int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
+        if (!nvshmem_ptr(recv_buff, dst_rank_global) || src_rank == self_rank) { continue; }
+
+        CHECK_CUDA(cudaStreamWaitEvent(pl_stream, grid_desc->events[dst_rank], 0));
+
+        nvshmemx_putmem_signal_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
+                                         send_counts[dst_rank] * sizeof(T), &comm_info.nvshmem_signals[comm_info.rank],
+                                         1, NVSHMEM_SIGNAL_SET, dst_rank_global, pl_stream);
+      }
+
+      if (need_quiet) { nvshmemx_quiet_on_stream(pl_stream); }
+      for (int i = 0; i < src_ranks.size(); ++i) {
+        int src_rank = src_ranks[i];
+        int dst_rank = dst_ranks[i];
+        if (src_rank != self_rank) {
+          nvshmemx_signal_wait_until_on_stream(&comm_info.nvshmem_signals[src_rank], NVSHMEM_CMP_EQ, 1, pl_stream);
+          CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], pl_stream));
+          CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[dst_rank], 0));
+        }
       }
       break;
     } else {
@@ -594,8 +592,7 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
   case CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING: {
 #ifdef ENABLE_NVSHMEM
     if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
-      nvshmemx_quiet_on_stream(stream);
-      nvshmemx_sync_all_on_stream(stream);
+      nvshmemx_barrier_all_on_stream(stream);
       for (int i = 0; i < send_counts.size(); ++i) {
         if (peer_ranks[i] == handle->rank) {
           // Self-copy with cudaMemcpy
@@ -608,14 +605,12 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
           }
         }
         if (grid_desc->config.halo_comm_backend == CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING) {
-          nvshmemx_quiet_on_stream(stream);
-          nvshmemx_sync_all_on_stream(stream);
+          nvshmemx_barrier_all_on_stream(stream);
         }
       }
 
       if (grid_desc->config.halo_comm_backend == CUDECOMP_HALO_COMM_NVSHMEM) {
-        nvshmemx_quiet_on_stream(stream);
-        nvshmemx_sync_all_on_stream(stream);
+        nvshmemx_barrier_all_on_stream(stream);
       };
       break;
     } else {
diff --git a/include/internal/common.h b/include/internal/common.h
index d03de7a..f11e9ca 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -127,6 +127,7 @@ struct cudecompCommInfo {
 
 #ifdef ENABLE_NVSHMEM
   nvshmem_team_t nvshmem_team = NVSHMEM_TEAM_INVALID;
+  uint64_t* nvshmem_signals = nullptr;
 #endif
 
   bool mnnvl_active = false; // flag to indicate whether communicator has MNNVL connections
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 2b24a5a..84cbe77 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -236,12 +236,26 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   T* o2 = work + pinfo_a.size;
   T* o3 = output;
 
+#ifdef ENABLE_NVSHMEM
   if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend)) {
     auto max_pencil_size_a = getGlobalMaxPencilSize(handle, grid_desc, ax_a);
     o2 = work + max_pencil_size_a;
-    // Record event at start of transpose op for NVSHMEM team synchronization
-    CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream));
+
+    // NVSHMEM team synchronization between transpose operations
+    if (splits_a.size() != 1) {
+      auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
+      auto team = comm_info.nvshmem_team;
+      auto aux_stream = handle->streams[handle->device_p2p_ce_count];
+      CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream));
+      CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->nvshmem_sync_event));
+      // Zero out signal buffer for this team here.
+      CHECK_CUDA(cudaMemsetAsync(comm_info.nvshmem_signals, 0, comm_info.nranks * sizeof(uint64_t), aux_stream));
+      nvshmemx_team_sync_on_stream(team, aux_stream);
+      CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, aux_stream));
+      // Delay final stream wait dependency to alltoall to ensure sync runs concurrently with initial transpose/pack
+    }
   }
+#endif
 
   cudecompTransposePerformanceSample* current_sample = nullptr;
   if (handle->performance_report_enable) {
@@ -609,7 +623,6 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
       }
 
       if (pipelined) {
-        bool nvshmem_synced = false;
         for (int j = 0; j < splits_b.size(); ++j) {
           int src_rank, dst_rank;
           getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
@@ -639,8 +652,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
           if (o2 != o1) {
             cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
-                                      recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
-                                      current_sample);
+                                      recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, current_sample);
           }
 
           if (o2 != o3) {
@@ -704,7 +716,6 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
         if (i > 0) { strides_out[i] = strides_out[i - 1] * extents_h[i - 1]; }
       }
 
-      bool nvshmem_synced = false;
       for (int j = 0; j < splits_b.size(); ++j) {
         int src_rank, dst_rank;
         getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
@@ -735,8 +746,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
           if (o2 != o1) {
             cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
-                                      recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
-                                      current_sample);
+                                      recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, current_sample);
           }
         }
 
@@ -760,7 +770,6 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     }
   } else {
     // Unpack
-    bool nvshmem_synced = false;
     int memcpy_count = 0;
     cudecompBatchedD2DMemcpy3DParams<T> memcpy_params;
     for (int j = 0; j < splits_a.size(); ++j) {
@@ -793,8 +802,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
         if (o2 != o1) {
           cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
-                                    recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
-                                    current_sample);
+                                    recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, current_sample);
         }
       }
 
diff --git a/src/autotune.cc b/src/autotune.cc
index f970b55..d4872c8 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -275,6 +275,14 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       nvshmem_team_config_t tmp;
       nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
                             &grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
+      grid_desc->row_comm_info.nvshmem_signals =
+          (uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
+      CHECK_CUDA(
+          cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
+      grid_desc->col_comm_info.nvshmem_signals =
+          (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
+      CHECK_CUDA(
+          cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
 #endif
     }
 
@@ -451,6 +459,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
 #ifdef ENABLE_NVSHMEM
       nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
       nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
+      nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
+      nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
       grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
@@ -691,6 +701,14 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
       nvshmem_team_config_t tmp;
       nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
                             &grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
+      grid_desc->row_comm_info.nvshmem_signals =
+          (uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
+      CHECK_CUDA(
+          cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
+      grid_desc->col_comm_info.nvshmem_signals =
+          (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
+      CHECK_CUDA(
+          cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
 #endif
     }
 
@@ -804,6 +822,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
 #ifdef ENABLE_NVSHMEM
       nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
       nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
+      nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
+      nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
       grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 5df3435..9daa234 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -764,6 +764,14 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
       nvshmem_team_config_t tmp;
       nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
                             &grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
+      grid_desc->row_comm_info.nvshmem_signals =
+          (uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
+      CHECK_CUDA(
+          cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
+      grid_desc->col_comm_info.nvshmem_signals =
+          (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
+      CHECK_CUDA(
+          cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
       handle->n_grid_descs_using_nvshmem++;
     } else {
       // Finalize nvshmem to reclaim symmetric heap memory if not used
@@ -891,9 +899,11 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
         haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
       if (grid_desc->row_comm_info.nvshmem_team != NVSHMEM_TEAM_INVALID) {
         nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
+        nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
       }
       if (grid_desc->col_comm_info.nvshmem_team != NVSHMEM_TEAM_INVALID) {
         nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
+        nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
       }
       handle->n_grid_descs_using_nvshmem--;
 

From cb4a3c147df392b4da68c29a3ee5e10638414683 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Tue, 10 Mar 2026 10:15:42 -0700
Subject: [PATCH 2/3] Apply NVSHMEM_CUMEM_GRANULARITY workaround for older
 NVSHMEM versions.

Signed-off-by: Josh Romero <joshr@nvidia.com>
---
 include/internal/comm_routines.h |  1 -
 src/cudecomp.cc                  | 18 ++++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 321be0e..138de8f 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -396,7 +396,6 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
     if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
       auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
       auto pl_stream = handle->streams[0];
-      auto aux_stream = handle->streams[handle->device_p2p_ce_count];
       int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank;
 
       // Enforce sync dependency between transpose operations
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 9daa234..f0ac511 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -400,7 +400,7 @@ static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
   }
 }
 
-static void checkNvshmemVersion() {
+static void checkNvshmemVersion(cudecompHandle_t& handle) {
   int major, minor, patch;
   char name[NVSHMEM_MAX_NAME_LEN];
   nvshmem_info_get_name(name);
@@ -413,6 +413,20 @@ static void checkNvshmemVersion() {
   if ((major == 2 && minor < 6) || major < 2) {
     THROW_NOT_SUPPORTED("NVSHMEM versions earlier than 2.6.0 are not supported.");
   }
+
+  // NVSHMEM <= 3.2.5 has a bug where putmem_signal APIs can trigger segfaults for inter-group
+  // transfers. Setting NVSHMEM_CUMEM_GRANULARITY to 1 GiB before initialization as a workaround
+  // for this issue.
+  bool has_signal_bug = (major < 3) || (major == 3 && minor < 2) || (major == 3 && minor == 2 && patch <= 5);
+  if (has_signal_bug) {
+    const char* granularity_str = std::getenv("NVSHMEM_CUMEM_GRANULARITY");
+    if (granularity_str && handle->rank == 0) {
+      printf("CUDECOMP:WARN: Overriding NVSHMEM_CUMEM_GRANULARITY (was %s) to 1 GiB to work around "
+             "a known bug in NVSHMEM %d.%d.%d affecting putmem_signal for inter-group transfers.\n",
+             granularity_str, major, minor, patch);
+    }
+    setenv("NVSHMEM_CUMEM_GRANULARITY", "1073741824", 1);
+  }
 }
 #endif
 
@@ -700,7 +714,7 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
         ((autotune_transpose_backend || autotune_halo_backend) && !autotune_disable_nvshmem_backends)) {
 #ifdef ENABLE_NVSHMEM
       if (!handle->nvshmem_initialized) {
-        checkNvshmemVersion();
+        checkNvshmemVersion(handle);
         inspectNvshmemEnvVars(handle);
         initNvshmemFromMPIComm(handle->mpi_comm);
         handle->nvshmem_initialized = true;

From bc2a81180ba5e568e36a2af5474810b416f08353 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Wed, 11 Mar 2026 08:52:59 -0700
Subject: [PATCH 3/3] Increase NVSHMEM_CUMEM_GRANULARITY to maximum of 2 GiB.

Signed-off-by: Josh Romero <joshr@nvidia.com>
---
 src/cudecomp.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index f0ac511..ae07c18 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -415,17 +415,17 @@ static void checkNvshmemVersion(cudecompHandle_t& handle) {
   }
 
   // NVSHMEM <= 3.2.5 has a bug where putmem_signal APIs can trigger segfaults for inter-group
-  // transfers. Setting NVSHMEM_CUMEM_GRANULARITY to 1 GiB before initialization as a workaround
-  // for this issue.
+  // transfers. Setting NVSHMEM_CUMEM_GRANULARITY to 2 GiB before initialization works around
+  // this issue.
   bool has_signal_bug = (major < 3) || (major == 3 && minor < 2) || (major == 3 && minor == 2 && patch <= 5);
   if (has_signal_bug) {
     const char* granularity_str = std::getenv("NVSHMEM_CUMEM_GRANULARITY");
     if (granularity_str && handle->rank == 0) {
-      printf("CUDECOMP:WARN: Overriding NVSHMEM_CUMEM_GRANULARITY (was %s) to 1 GiB to work around "
+      printf("CUDECOMP:WARN: Overriding NVSHMEM_CUMEM_GRANULARITY (was %s) to 2 GiB to work around "
              "a known bug in NVSHMEM %d.%d.%d affecting putmem_signal for inter-group transfers.\n",
              granularity_str, major, minor, patch);
     }
-    setenv("NVSHMEM_CUMEM_GRANULARITY", "1073741824", 1);
+    setenv("NVSHMEM_CUMEM_GRANULARITY", "2147483648", 1);
   }
 }
 #endif