diff --git a/docs/api/f_api.rst b/docs/api/f_api.rst index 7a3aa94..9c32b16 100644 --- a/docs/api/f_api.rst +++ b/docs/api/f_api.rst @@ -69,6 +69,7 @@ _________________________________ :f cudecompAutotuneGridMode grid_mode: which communication (transpose/halo) to use to autotune process grid (default: CUDECOMP_AUTOTUNE_GRID_TRANSPOSE) :f cudecompDataType dtype: datatype to use during autotuning (default: CUDECOMP_DOUBLE) :f logical allow_uneven_distributions: flag to control whether autotuning allows process grids that result in uneven distributions of elements across processes (default: true) + :f logical allow_empty_pencils: flag to control whether autotuning allows process grids that result in some processes having pencils with no elements (default: false) :f logical disable_nccl_backends: flag to disable NCCL backend options during autotuning (default: false) :f logical disable_nvshmem_backends: flag to disable NVSHMEM backend options during autotuning (default: false) :f real(c_double) skip_threshold: threshold used to skip testing slow configurations; skip configuration if :code:`skip_threshold * t > t_best`, where :code:`t` is the duration of the first timed trial for the configuration and :code:`t_best` is the average trial time of the current best configuration (default: 0.0) diff --git a/include/cudecomp.h b/include/cudecomp.h index b7ca852..67f30e1 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -145,6 +145,8 @@ typedef struct { cudecompDataType_t dtype; ///< datatype to use during autotuning (default: CUDECOMP_DOUBLE) bool allow_uneven_decompositions; ///< flag to control whether autotuning allows process grids that result in uneven ///< distributions of elements across processes (default: true) + bool allow_empty_pencils; ///< flag to control whether autotuning allows process grids that result in + ///< some processes having pencils with no elements (default: false) bool disable_nccl_backends; ///< flag to disable NCCL backend options 
during autotuning (default: false) bool disable_nvshmem_backends; ///< flag to disable NVSHMEM backend options during autotuning (default: false) double skip_threshold; ///< threshold used to skip testing slow configurations; skip configuration diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 137dedf..56fbdc3 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -211,7 +211,8 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD switch (grid_desc->config.transpose_comm_backend) { case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: { #ifdef ENABLE_NVSHMEM - if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) { + // Note: For ranks with empty pencils, send_buff or recv_buff can be nullptr. + if ((!send_buff || nvshmem_ptr(send_buff, handle->rank)) && (!recv_buff || nvshmem_ptr(recv_buff, handle->rank))) { nvshmemAlltoallV(handle, grid_desc, send_buff, send_counts, send_offsets, recv_buff, recv_counts, recv_offsets_nvshmem, comm_axis, stream); break; @@ -359,7 +360,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc switch (grid_desc->config.transpose_comm_backend) { case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: { #ifdef ENABLE_NVSHMEM - if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) { + if ((!send_buff || nvshmem_ptr(send_buff, handle->rank)) && (!recv_buff || nvshmem_ptr(recv_buff, handle->rank))) { auto comm = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.mpi_comm : grid_desc->col_comm_info.mpi_comm; // auto team = (comm_axis == CUDECOMP_COMM_ROW) ? 
grid_desc->row_comm_info.nvshmem_team diff --git a/include/internal/cudecomp_kernels.cuh b/include/internal/cudecomp_kernels.cuh index f09c3d2..9a4ccbf 100644 --- a/include/internal/cudecomp_kernels.cuh +++ b/include/internal/cudecomp_kernels.cuh @@ -115,7 +115,10 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D int src_nd = 1; int dest_nd = 1; for (int i = 0; i < params.ncopies; ++i) { - N = std::max(N, params.extents[0][i] * params.extents[1][i] * params.extents[2][i]); + size_t N2 = params.extents[0][i] * params.extents[1][i] * params.extents[2][i]; + if (N2 == 0) continue; + + N = std::max(N, N2); if (params.src_strides[1][i] == params.extents[2][i] && params.src_strides[0][i] / params.src_strides[1][i] == params.extents[1][i]) { src_nd = std::max(1, src_nd); @@ -145,6 +148,8 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * num_sms) { blocks_per_copy = blocks_per_copy_unroll; } + if (params.ncopies * blocks_per_copy == 0) return; + switch (src_nd) { case 1: switch (dest_nd) { diff --git a/include/internal/halo.h b/include/internal/halo.h index 4fa960c..52ee051 100644 --- a/include/internal/halo.h +++ b/include/internal/halo.h @@ -45,11 +45,15 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG if (padding_ptr) std::copy(padding_ptr, padding_ptr + 3, padding.begin()); // Get pencil info + cudecompPencilInfo_t pinfo; + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, ax, nullptr, nullptr)); cudecompPencilInfo_t pinfo_h; CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr)); cudecompPencilInfo_t pinfo_h_p; // with padding CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h_p, ax, halo_extents.data(), padding.data())); + if (pinfo.size == 0) { THROW_NOT_SUPPORTED("updating halos across axis with empty pencils is not supported"); } + // 
Get global ordered shapes auto shape_g_h = getShapeG(pinfo_h); auto shape_g_h_p = getShapeG(pinfo_h_p); @@ -76,40 +80,6 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream)); } - // Check if halos include more than one process (unsupported currently). - int count = 0; - for (int i = 0; i < 3; ++i) { - if (i == ax) continue; - if (i == dim) break; - count++; - } - - auto comm_axis = (count == 0) ? CUDECOMP_COMM_COL : CUDECOMP_COMM_ROW; - int comm_rank = (comm_axis == CUDECOMP_COMM_COL) ? grid_desc->col_comm_info.rank : grid_desc->row_comm_info.rank; - - auto splits = - getSplits(grid_desc->config.gdims_dist[dim], grid_desc->config.pdims[comm_axis == CUDECOMP_COMM_COL ? 0 : 1], - grid_desc->config.gdims[dim] - grid_desc->config.gdims_dist[dim]); - - int comm_rank_l = comm_rank - 1; - int comm_rank_r = comm_rank + 1; - if (halo_periods[dim]) { - comm_rank_l = (comm_rank_l + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis]; - comm_rank_r = (comm_rank_r + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis]; - } - - if (comm_rank_l >= 0) { - if (halo_extents[dim] > splits[comm_rank_l] || halo_extents[dim] > splits[comm_rank]) { - THROW_INVALID_USAGE("halo spans multiple processes, this is not currently supported."); - } - } - - if (comm_rank_r < splits.size()) { - if (halo_extents[dim] > splits[comm_rank_r] || halo_extents[dim] > splits[comm_rank]) { - THROW_INVALID_USAGE("halo spans multiple processes, this is not currently supported."); - } - } - // Select correct case based on pencil memory order and transfer dim int c; if (dim != pinfo_h.order[0] && dim != pinfo_h.order[1]) { @@ -131,6 +101,42 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG padding.data(), getCudecompDataType())); } return; + } else { + // For multi-rank cases, check if halos include ranks other than nearest neighbor 
process (unsupported currently). + int count = 0; + for (int i = 0; i < 3; ++i) { + if (i == ax) continue; + if (i == dim) break; + count++; + } + + auto comm_axis = (count == 0) ? CUDECOMP_COMM_COL : CUDECOMP_COMM_ROW; + int comm_rank = (comm_axis == CUDECOMP_COMM_COL) ? grid_desc->col_comm_info.rank : grid_desc->row_comm_info.rank; + + auto splits = + getSplits(grid_desc->config.gdims_dist[dim], grid_desc->config.pdims[comm_axis == CUDECOMP_COMM_COL ? 0 : 1], + grid_desc->config.gdims[dim] - grid_desc->config.gdims_dist[dim]); + + int comm_rank_l = comm_rank - 1; + int comm_rank_r = comm_rank + 1; + if (halo_periods[dim]) { + comm_rank_l = (comm_rank_l + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis]; + comm_rank_r = (comm_rank_r + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis]; + } + + if (comm_rank_l >= 0) { + if (halo_extents[dim] > splits[comm_rank_l] || halo_extents[dim] > splits[comm_rank]) { + THROW_INVALID_USAGE( + "halo includes ranks other than nearest neighbor processes, this is not currently supported."); + } + } + + if (comm_rank_r < splits.size()) { + if (halo_extents[dim] > splits[comm_rank_r] || halo_extents[dim] > splits[comm_rank]) { + THROW_INVALID_USAGE( + "halo includes ranks other than nearest neighbor processes, this is not currently supported."); + } + } } bool managed = isManagedPointer(input); diff --git a/include/internal/transpose.h b/include/internal/transpose.h index c4893a0..aada897 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -207,6 +207,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c CHECK_CUDECOMP( cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data(), output_padding.data())); + if (pinfo_a_h.size != 0 and !input) { THROW_INVALID_USAGE("input argument cannot be null"); } + if (pinfo_b_h.size != 0 and !output) { THROW_INVALID_USAGE("output argument cannot be null"); } + if 
((pinfo_a_h.size != 0 || pinfo_b_h.size != 0) and !work) { THROW_INVALID_USAGE("work argument cannot be null"); } + // Check if input and output orders are the same bool orders_equal = true; for (int i = 0; i < 3; ++i) { @@ -232,6 +236,18 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c T* o2 = work + pinfo_a.size; T* o3 = output; + // Handle empty pencil cases (both-empty is tested first; after either single-empty test it would be unreachable) + if (pinfo_a.size == 0 && pinfo_b.size == 0) { + o1 = nullptr; + o2 = nullptr; + } else if (pinfo_a.size == 0) { + o1 = nullptr; + o2 = work; + } else if (pinfo_b.size == 0) { + o1 = work; + o2 = nullptr; + } + if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend)) { auto max_pencil_size_a = getGlobalMaxPencilSize(handle, grid_desc, ax_a); o2 = work + max_pencil_size_a; diff --git a/src/autotune.cc b/src/autotune.cc index 95fb1c9..70c3b97 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -172,9 +172,10 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z2, 2, options->transpose_input_halo_extents[2], options->transpose_input_padding[2])); - // Skip any decompositions with empty pencils - if (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) || - grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2])) { + // Skip any decompositions with empty pencils, if disabled + if (!options->allow_empty_pencils && + (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) || + grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2]))) { continue; } @@ -588,8 +589,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, options->halo_padding)); // Skip any decompositions with empty pencils - if (std::max(grid_desc->config.pdims[0], 
grid_desc->config.pdims[1]) > - std::min(grid_desc->config.gdims[1], grid_desc->config.gdims[2])) { + if ((options->halo_axis == 0 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[1] || + grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[2])) || + (options->halo_axis == 1 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[0] || + grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[2])) || + (options->halo_axis == 2 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[0] || + grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[1]))) { + if (options->allow_empty_pencils) { + THROW_NOT_SUPPORTED("cannot perform halo autotuning on distributions with empty pencils"); + } continue; } diff --git a/src/cudecomp.cc b/src/cudecomp.cc index e67446f..a88a558 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -717,13 +717,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming)); #endif - // Disable decompositions with empty pencils - if (!autotune_pdims && - (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) || - grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2]))) { - THROW_NOT_SUPPORTED("grid descriptor settings yields a distribution with empty pencils"); - } - // Run autotuning if requested if (options) { if (options->grid_mode == CUDECOMP_AUTOTUNE_GRID_TRANSPOSE) { @@ -962,6 +955,7 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto options->grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE; options->dtype = CUDECOMP_DOUBLE; options->allow_uneven_decompositions = true; + options->allow_empty_pencils = false; options->disable_nccl_backends = false; options->disable_nvshmem_backends = false; options->skip_threshold = 0.0; @@ -1386,9 +1380,6 @@ cudecompResult_t 
cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc checkHandle(handle); checkGridDesc(grid_desc); checkDataType(dtype); - if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); } - if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); } - if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); } switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeXToY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), @@ -1429,9 +1420,6 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc checkHandle(handle); checkGridDesc(grid_desc); checkDataType(dtype); - if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); } - if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); } - if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); } switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), @@ -1472,9 +1460,6 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc checkHandle(handle); checkGridDesc(grid_desc); checkDataType(dtype); - if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); } - if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); } - if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); } switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeZToY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), @@ -1515,9 +1500,6 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc checkHandle(handle); checkGridDesc(grid_desc); checkDataType(dtype); - if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); } - if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); } - if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); } switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeYToX(handle, grid_desc, 
reinterpret_cast(input), reinterpret_cast(output), diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf index 8d207b6..66f20f8 100644 --- a/src/cudecomp_m.cuf +++ b/src/cudecomp_m.cuf @@ -106,6 +106,7 @@ module cudecomp integer(c_int32_t) :: grid_mode ! which communication (transpose/halo) to use to autotune process grid integer(c_int32_t) :: dtype ! datatype to use during autotuning logical(c_bool) :: allow_uneven_decompositions ! flag to control whether autotuning allows uneven decompositions (based on gdims_dist if provided, gdims otherwise) + logical(c_bool) :: allow_empty_pencils ! flag to control whether autotuning allows process grids that result in some processes having pencils with no elements (default: false) logical(c_bool) :: disable_nccl_backends ! flag to disable NCCL backend options during autotuning logical(c_bool) :: disable_nvshmem_backends ! flag to disable NVSHMEM backend options during autotuning real(c_double) :: skip_threshold ! threshold used to skip testing slow configurations