diff --git a/docs/api/f_api.rst b/docs/api/f_api.rst
index 7a3aa94..9c32b16 100644
--- a/docs/api/f_api.rst
+++ b/docs/api/f_api.rst
@@ -69,6 +69,7 @@ _________________________________
   :f cudecompAutotuneGridMode grid_mode: which communication (transpose/halo) to use to autotune process grid (default: CUDECOMP_AUTOTUNE_GRID_TRANSPOSE)
   :f cudecompDataType dtype: datatype to use during autotuning (default: CUDECOMP_DOUBLE)
   :f logical allow_uneven_distributions: flag to control whether autotuning allows process grids that result in uneven distributions of elements across processes (default: true)
+  :f logical allow_empty_pencils: flag to control whether autotuning allows process grids that result in some processes having pencils with no elements (default: false)
   :f logical disable_nccl_backends: flag to disable NCCL backend options during autotuning (default: false)
   :f logical disable_nvshmem_backends: flag to disable NVSHMEM backend options during autotuning (default: false)
   :f real(c_double) skip_threshold: threshold used to skip testing slow configurations; skip configuration if :code:`skip_threshold * t > t_best`, where :code:`t` is the duration of the first timed trial for the configuration and :code:`t_best` is the average trial time of the current best configuration (default: 0.0)
diff --git a/include/cudecomp.h b/include/cudecomp.h
index b7ca852..67f30e1 100644
--- a/include/cudecomp.h
+++ b/include/cudecomp.h
@@ -145,6 +145,8 @@ typedef struct {
   cudecompDataType_t dtype;             ///< datatype to use during autotuning (default: CUDECOMP_DOUBLE)
   bool allow_uneven_decompositions; ///< flag to control whether autotuning allows process grids that result in uneven
                                     ///< distributions of elements across processes (default: true)
+  bool allow_empty_pencils;         ///< flag to control whether autotuning allows process grids that result in
+                                    ///< some processes having pencils with no elements (default: false)
   bool disable_nccl_backends;       ///< flag to disable NCCL backend options during autotuning (default: false)
   bool disable_nvshmem_backends;    ///< flag to disable NVSHMEM backend options during autotuning (default: false)
   double skip_threshold;            ///< threshold used to skip testing slow configurations; skip configuration
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 137dedf..56fbdc3 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -211,7 +211,8 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
   switch (grid_desc->config.transpose_comm_backend) {
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: {
 #ifdef ENABLE_NVSHMEM
-    if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
+    // Note: For ranks with empty pencils, send_buff or recv_buff can be nullptr.
+    if ((!send_buff || nvshmem_ptr(send_buff, handle->rank)) && (!recv_buff || nvshmem_ptr(recv_buff, handle->rank))) {
       nvshmemAlltoallV(handle, grid_desc, send_buff, send_counts, send_offsets, recv_buff, recv_counts,
                        recv_offsets_nvshmem, comm_axis, stream);
       break;
@@ -359,7 +360,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
   switch (grid_desc->config.transpose_comm_backend) {
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: {
 #ifdef ENABLE_NVSHMEM
-    if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
+    if ((!send_buff || nvshmem_ptr(send_buff, handle->rank)) && (!recv_buff || nvshmem_ptr(recv_buff, handle->rank))) {
       auto comm =
           (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.mpi_comm : grid_desc->col_comm_info.mpi_comm;
       // auto team = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.nvshmem_team
diff --git a/include/internal/cudecomp_kernels.cuh b/include/internal/cudecomp_kernels.cuh
index f09c3d2..9a4ccbf 100644
--- a/include/internal/cudecomp_kernels.cuh
+++ b/include/internal/cudecomp_kernels.cuh
@@ -115,7 +115,10 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D
   int src_nd = 1;
   int dest_nd = 1;
   for (int i = 0; i < params.ncopies; ++i) {
-    N = std::max(N, params.extents[0][i] * params.extents[1][i] * params.extents[2][i]);
+    size_t N2 = params.extents[0][i] * params.extents[1][i] * params.extents[2][i];
+    if (N2 == 0) continue;
+
+    N = std::max(N, N2);
     if (params.src_strides[1][i] == params.extents[2][i] &&
         params.src_strides[0][i] / params.src_strides[1][i] == params.extents[1][i]) {
       src_nd = std::max(1, src_nd);
@@ -145,6 +148,8 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D
 
   if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * num_sms) { blocks_per_copy = blocks_per_copy_unroll; }
 
+  if (params.ncopies * blocks_per_copy == 0) return;
+
   switch (src_nd) {
   case 1:
     switch (dest_nd) {
diff --git a/include/internal/halo.h b/include/internal/halo.h
index 4fa960c..52ee051 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -45,11 +45,15 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
   if (padding_ptr) std::copy(padding_ptr, padding_ptr + 3, padding.begin());
 
   // Get pencil info
+  cudecompPencilInfo_t pinfo;
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, ax, nullptr, nullptr));
   cudecompPencilInfo_t pinfo_h;
   CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr));
   cudecompPencilInfo_t pinfo_h_p; // with padding
   CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h_p, ax, halo_extents.data(), padding.data()));
 
+  if (pinfo.size == 0) { THROW_NOT_SUPPORTED("updating halos across axis with empty pencils is not supported"); }
+
   // Get global ordered shapes
   auto shape_g_h = getShapeG(pinfo_h);
   auto shape_g_h_p = getShapeG(pinfo_h_p);
@@ -76,40 +80,6 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
   }
 
-  // Check if halos include more than one process (unsupported currently).
-  int count = 0;
-  for (int i = 0; i < 3; ++i) {
-    if (i == ax) continue;
-    if (i == dim) break;
-    count++;
-  }
-
-  auto comm_axis = (count == 0) ? CUDECOMP_COMM_COL : CUDECOMP_COMM_ROW;
-  int comm_rank = (comm_axis == CUDECOMP_COMM_COL) ? grid_desc->col_comm_info.rank : grid_desc->row_comm_info.rank;
-
-  auto splits =
-      getSplits(grid_desc->config.gdims_dist[dim], grid_desc->config.pdims[comm_axis == CUDECOMP_COMM_COL ? 0 : 1],
-                grid_desc->config.gdims[dim] - grid_desc->config.gdims_dist[dim]);
-
-  int comm_rank_l = comm_rank - 1;
-  int comm_rank_r = comm_rank + 1;
-  if (halo_periods[dim]) {
-    comm_rank_l = (comm_rank_l + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
-    comm_rank_r = (comm_rank_r + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
-  }
-
-  if (comm_rank_l >= 0) {
-    if (halo_extents[dim] > splits[comm_rank_l] || halo_extents[dim] > splits[comm_rank]) {
-      THROW_INVALID_USAGE("halo spans multiple processes, this is not currently supported.");
-    }
-  }
-
-  if (comm_rank_r < splits.size()) {
-    if (halo_extents[dim] > splits[comm_rank_r] || halo_extents[dim] > splits[comm_rank]) {
-      THROW_INVALID_USAGE("halo spans multiple processes, this is not currently supported.");
-    }
-  }
-
   // Select correct case based on pencil memory order and transfer dim
   int c;
   if (dim != pinfo_h.order[0] && dim != pinfo_h.order[1]) {
@@ -131,6 +101,42 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
                                                     padding.data(), getCudecompDataType<T>()));
     }
     return;
+  } else {
+    // For multi-rank cases, check if halos include ranks other than nearest neighbor process (unsupported currently).
+    int count = 0;
+    for (int i = 0; i < 3; ++i) {
+      if (i == ax) continue;
+      if (i == dim) break;
+      count++;
+    }
+
+    auto comm_axis = (count == 0) ? CUDECOMP_COMM_COL : CUDECOMP_COMM_ROW;
+    int comm_rank = (comm_axis == CUDECOMP_COMM_COL) ? grid_desc->col_comm_info.rank : grid_desc->row_comm_info.rank;
+
+    auto splits =
+        getSplits(grid_desc->config.gdims_dist[dim], grid_desc->config.pdims[comm_axis == CUDECOMP_COMM_COL ? 0 : 1],
+                  grid_desc->config.gdims[dim] - grid_desc->config.gdims_dist[dim]);
+
+    int comm_rank_l = comm_rank - 1;
+    int comm_rank_r = comm_rank + 1;
+    if (halo_periods[dim]) {
+      comm_rank_l = (comm_rank_l + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
+      comm_rank_r = (comm_rank_r + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
+    }
+
+    if (comm_rank_l >= 0) {
+      if (halo_extents[dim] > splits[comm_rank_l] || halo_extents[dim] > splits[comm_rank]) {
+        THROW_INVALID_USAGE(
+            "halo includes ranks other than nearest neighbor processes, this is not currently supported.");
+      }
+    }
+
+    if (comm_rank_r < splits.size()) {
+      if (halo_extents[dim] > splits[comm_rank_r] || halo_extents[dim] > splits[comm_rank]) {
+        THROW_INVALID_USAGE(
+            "halo includes ranks other than nearest neighbor processes, this is not currently supported.");
+      }
+    }
   }
 
   bool managed = isManagedPointer(input);
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index c4893a0..aada897 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -207,6 +207,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   CHECK_CUDECOMP(
       cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data(), output_padding.data()));
 
+  if (pinfo_a_h.size != 0 and !input) { THROW_INVALID_USAGE("input argument cannot be null"); }
+  if (pinfo_b_h.size != 0 and !output) { THROW_INVALID_USAGE("output argument cannot be null"); }
+  if ((pinfo_a_h.size != 0 || pinfo_b_h.size != 0) and !work) { THROW_INVALID_USAGE("work argument cannot be null"); }
+
   // Check if input and output orders are the same
   bool orders_equal = true;
   for (int i = 0; i < 3; ++i) {
@@ -232,6 +236,18 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   T* o2 = work + pinfo_a.size;
   T* o3 = output;
 
+  // Handle empty pencil cases (both-empty must be tested first, otherwise it is shadowed by the a-empty branch)
+  if (pinfo_a.size == 0 && pinfo_b.size == 0) {
+    o1 = nullptr;
+    o2 = nullptr;
+  } else if (pinfo_a.size == 0) {
+    o1 = nullptr;
+    o2 = work;
+  } else if (pinfo_b.size == 0) {
+    o1 = work;
+    o2 = nullptr;
+  }
+
   if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend)) {
     auto max_pencil_size_a = getGlobalMaxPencilSize(handle, grid_desc, ax_a);
     o2 = work + max_pencil_size_a;
diff --git a/src/autotune.cc b/src/autotune.cc
index 95fb1c9..70c3b97 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -172,9 +172,10 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z2, 2, options->transpose_input_halo_extents[2],
                                          options->transpose_input_padding[2]));
 
-    // Skip any decompositions with empty pencils
-    if (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
-        grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2])) {
+    // Skip any decompositions with empty pencils, if disabled
+    if (!options->allow_empty_pencils &&
+        (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
+         grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2]))) {
       continue;
     }
 
@@ -588,8 +589,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
                                          options->halo_padding));
 
     // Skip any decompositions with empty pencils
-    if (std::max(grid_desc->config.pdims[0], grid_desc->config.pdims[1]) >
-        std::min(grid_desc->config.gdims[1], grid_desc->config.gdims[2])) {
+    if ((options->halo_axis == 0 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[1] ||
+                                     grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[2])) ||
+        (options->halo_axis == 1 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[0] ||
+                                     grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[2])) ||
+        (options->halo_axis == 2 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[0] ||
+                                     grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[1]))) {
+      if (options->allow_empty_pencils) {
+        THROW_NOT_SUPPORTED("cannot perform halo autotuning on distributions with empty pencils");
+      }
       continue;
     }
 
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index e67446f..a88a558 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -717,13 +717,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
     CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming));
 #endif
 
-    // Disable decompositions with empty pencils
-    if (!autotune_pdims &&
-        (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
-         grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2]))) {
-      THROW_NOT_SUPPORTED("grid descriptor settings yields a distribution with empty pencils");
-    }
-
     // Run autotuning if requested
     if (options) {
       if (options->grid_mode == CUDECOMP_AUTOTUNE_GRID_TRANSPOSE) {
@@ -962,6 +955,7 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto
     options->grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE;
     options->dtype = CUDECOMP_DOUBLE;
     options->allow_uneven_decompositions = true;
+    options->allow_empty_pencils = false;
     options->disable_nccl_backends = false;
     options->disable_nvshmem_backends = false;
     options->skip_threshold = 0.0;
@@ -1386,9 +1380,6 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc
     checkHandle(handle);
     checkGridDesc(grid_desc);
     checkDataType(dtype);
-    if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
-    if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
-    if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeXToY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
@@ -1429,9 +1420,6 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc
     checkHandle(handle);
     checkGridDesc(grid_desc);
     checkDataType(dtype);
-    if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
-    if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
-    if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
@@ -1472,9 +1460,6 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc
     checkHandle(handle);
     checkGridDesc(grid_desc);
     checkDataType(dtype);
-    if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
-    if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
-    if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeZToY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
@@ -1515,9 +1500,6 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc
     checkHandle(handle);
     checkGridDesc(grid_desc);
     checkDataType(dtype);
-    if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
-    if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
-    if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeYToX(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf
index 8d207b6..66f20f8 100644
--- a/src/cudecomp_m.cuf
+++ b/src/cudecomp_m.cuf
@@ -106,6 +106,7 @@ module cudecomp
     integer(c_int32_t) :: grid_mode ! which communication (transpose/halo) to use to autotune process grid
     integer(c_int32_t) :: dtype ! datatype to use during autotuning
     logical(c_bool) :: allow_uneven_decompositions ! flag to control whether autotuning allows uneven decompositions (based on gdims_dist if provided, gdims otherwise)
+    logical(c_bool) :: allow_empty_pencils ! flag to control whether autotuning allows process grids that result in some processes having pencils with no elements (default: false)
     logical(c_bool) :: disable_nccl_backends ! flag to disable NCCL backend options during autotuning
     logical(c_bool) :: disable_nvshmem_backends ! flag to disable NVSHMEM backend options during autotuning
     real(c_double) :: skip_threshold ! threshold used to skip testing slow configurations