Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/f_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ _________________________________
:f cudecompAutotuneGridMode grid_mode: which communication (transpose/halo) to use to autotune process grid (default: CUDECOMP_AUTOTUNE_GRID_TRANSPOSE)
:f cudecompDataType dtype: datatype to use during autotuning (default: CUDECOMP_DOUBLE)
:f logical allow_uneven_decompositions: flag to control whether autotuning allows process grids that result in uneven distributions of elements across processes (default: true)
:f logical allow_empty_pencils: flag to control whether autotuning allows process grids that result in some processes having pencils with no elements (default: false)
:f logical disable_nccl_backends: flag to disable NCCL backend options during autotuning (default: false)
:f logical disable_nvshmem_backends: flag to disable NVSHMEM backend options during autotuning (default: false)
:f real(c_double) skip_threshold: threshold used to skip testing slow configurations; skip configuration if :code:`skip_threshold * t > t_best`, where :code:`t` is the duration of the first timed trial for the configuration and :code:`t_best` is the average trial time of the current best configuration (default: 0.0)
Expand Down
2 changes: 2 additions & 0 deletions include/cudecomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ typedef struct {
cudecompDataType_t dtype; ///< datatype to use during autotuning (default: CUDECOMP_DOUBLE)
bool allow_uneven_decompositions; ///< flag to control whether autotuning allows process grids that result in uneven
///< distributions of elements across processes (default: true)
bool allow_empty_pencils; ///< flag to control whether autotuning allows process grids that result in
///< some processes having pencils with no elements (default: false)
bool disable_nccl_backends; ///< flag to disable NCCL backend options during autotuning (default: false)
bool disable_nvshmem_backends; ///< flag to disable NVSHMEM backend options during autotuning (default: false)
double skip_threshold; ///< threshold used to skip testing slow configurations; skip configuration
Expand Down
5 changes: 3 additions & 2 deletions include/internal/comm_routines.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
switch (grid_desc->config.transpose_comm_backend) {
case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: {
#ifdef ENABLE_NVSHMEM
if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
// Note: For ranks with empty pencils, send_buff or recv_buff can be nullptr.
if ((!send_buff || nvshmem_ptr(send_buff, handle->rank)) && (!recv_buff || nvshmem_ptr(recv_buff, handle->rank))) {
nvshmemAlltoallV(handle, grid_desc, send_buff, send_counts, send_offsets, recv_buff, recv_counts,
recv_offsets_nvshmem, comm_axis, stream);
break;
Expand Down Expand Up @@ -359,7 +360,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
switch (grid_desc->config.transpose_comm_backend) {
case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: {
#ifdef ENABLE_NVSHMEM
if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
if ((!send_buff || nvshmem_ptr(send_buff, handle->rank)) && (!recv_buff || nvshmem_ptr(recv_buff, handle->rank))) {
auto comm =
(comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.mpi_comm : grid_desc->col_comm_info.mpi_comm;
// auto team = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.nvshmem_team
Expand Down
7 changes: 6 additions & 1 deletion include/internal/cudecomp_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,10 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D
int src_nd = 1;
int dest_nd = 1;
for (int i = 0; i < params.ncopies; ++i) {
N = std::max(N, params.extents[0][i] * params.extents[1][i] * params.extents[2][i]);
size_t N2 = params.extents[0][i] * params.extents[1][i] * params.extents[2][i];
if (N2 == 0) continue;

N = std::max(N, N2);
if (params.src_strides[1][i] == params.extents[2][i] &&
params.src_strides[0][i] / params.src_strides[1][i] == params.extents[1][i]) {
src_nd = std::max(1, src_nd);
Expand Down Expand Up @@ -145,6 +148,8 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D

if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * num_sms) { blocks_per_copy = blocks_per_copy_unroll; }

if (params.ncopies * blocks_per_copy == 0) return;

switch (src_nd) {
case 1:
switch (dest_nd) {
Expand Down
74 changes: 40 additions & 34 deletions include/internal/halo.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,15 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
if (padding_ptr) std::copy(padding_ptr, padding_ptr + 3, padding.begin());

// Get pencil info
cudecompPencilInfo_t pinfo;
CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, ax, nullptr, nullptr));
cudecompPencilInfo_t pinfo_h;
CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr));
cudecompPencilInfo_t pinfo_h_p; // with padding
CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h_p, ax, halo_extents.data(), padding.data()));

if (pinfo.size == 0) { THROW_NOT_SUPPORTED("updating halos across axis with empty pencils is not supported"); }

// Get global ordered shapes
auto shape_g_h = getShapeG(pinfo_h);
auto shape_g_h_p = getShapeG(pinfo_h_p);
Expand All @@ -76,40 +80,6 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
}

// Check if halos include more than one process (unsupported currently).
int count = 0;
for (int i = 0; i < 3; ++i) {
if (i == ax) continue;
if (i == dim) break;
count++;
}

auto comm_axis = (count == 0) ? CUDECOMP_COMM_COL : CUDECOMP_COMM_ROW;
int comm_rank = (comm_axis == CUDECOMP_COMM_COL) ? grid_desc->col_comm_info.rank : grid_desc->row_comm_info.rank;

auto splits =
getSplits(grid_desc->config.gdims_dist[dim], grid_desc->config.pdims[comm_axis == CUDECOMP_COMM_COL ? 0 : 1],
grid_desc->config.gdims[dim] - grid_desc->config.gdims_dist[dim]);

int comm_rank_l = comm_rank - 1;
int comm_rank_r = comm_rank + 1;
if (halo_periods[dim]) {
comm_rank_l = (comm_rank_l + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
comm_rank_r = (comm_rank_r + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
}

if (comm_rank_l >= 0) {
if (halo_extents[dim] > splits[comm_rank_l] || halo_extents[dim] > splits[comm_rank]) {
THROW_INVALID_USAGE("halo spans multiple processes, this is not currently supported.");
}
}

if (comm_rank_r < splits.size()) {
if (halo_extents[dim] > splits[comm_rank_r] || halo_extents[dim] > splits[comm_rank]) {
THROW_INVALID_USAGE("halo spans multiple processes, this is not currently supported.");
}
}

// Select correct case based on pencil memory order and transfer dim
int c;
if (dim != pinfo_h.order[0] && dim != pinfo_h.order[1]) {
Expand All @@ -131,6 +101,42 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
padding.data(), getCudecompDataType<T>()));
}
return;
} else {
// For multi-rank cases, check if halos include ranks other than nearest neighbor process (unsupported currently).
int count = 0;
for (int i = 0; i < 3; ++i) {
if (i == ax) continue;
if (i == dim) break;
count++;
}

auto comm_axis = (count == 0) ? CUDECOMP_COMM_COL : CUDECOMP_COMM_ROW;
int comm_rank = (comm_axis == CUDECOMP_COMM_COL) ? grid_desc->col_comm_info.rank : grid_desc->row_comm_info.rank;

auto splits =
getSplits(grid_desc->config.gdims_dist[dim], grid_desc->config.pdims[comm_axis == CUDECOMP_COMM_COL ? 0 : 1],
grid_desc->config.gdims[dim] - grid_desc->config.gdims_dist[dim]);

int comm_rank_l = comm_rank - 1;
int comm_rank_r = comm_rank + 1;
if (halo_periods[dim]) {
comm_rank_l = (comm_rank_l + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
comm_rank_r = (comm_rank_r + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
}

if (comm_rank_l >= 0) {
if (halo_extents[dim] > splits[comm_rank_l] || halo_extents[dim] > splits[comm_rank]) {
THROW_INVALID_USAGE(
"halo includes ranks other than nearest neighbor processes, this is not currently supported.");
}
}

if (comm_rank_r < splits.size()) {
if (halo_extents[dim] > splits[comm_rank_r] || halo_extents[dim] > splits[comm_rank]) {
THROW_INVALID_USAGE(
"halo includes ranks other than nearest neighbor processes, this is not currently supported.");
}
}
}

bool managed = isManagedPointer(input);
Expand Down
16 changes: 16 additions & 0 deletions include/internal/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
CHECK_CUDECOMP(
cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data(), output_padding.data()));

if (pinfo_a_h.size != 0 and !input) { THROW_INVALID_USAGE("input argument cannot be null"); }
if (pinfo_b_h.size != 0 and !output) { THROW_INVALID_USAGE("output argument cannot be null"); }
if ((pinfo_a_h.size != 0 || pinfo_b_h.size != 0) and !work) { THROW_INVALID_USAGE("work argument cannot be null"); }

// Check if input and output orders are the same
bool orders_equal = true;
for (int i = 0; i < 3; ++i) {
Expand All @@ -232,6 +236,18 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
T* o2 = work + pinfo_a.size;
T* o3 = output;

// Handle empty pencil cases.
// The case where both pencils are empty must be tested first: otherwise the
// pinfo_a.size == 0 branch matches it and o2 is left pointing at work instead
// of being set to nullptr.
if (pinfo_a.size == 0 && pinfo_b.size == 0) {
  o1 = nullptr;
  o2 = nullptr;
} else if (pinfo_a.size == 0) {
  // Only the input-side pencil is empty.
  o1 = nullptr;
  o2 = work;
} else if (pinfo_b.size == 0) {
  // Only the output-side pencil is empty.
  o1 = work;
  o2 = nullptr;
}

if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend)) {
auto max_pencil_size_a = getGlobalMaxPencilSize(handle, grid_desc, ax_a);
o2 = work + max_pencil_size_a;
Expand Down
18 changes: 13 additions & 5 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,10 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z2, 2, options->transpose_input_halo_extents[2],
options->transpose_input_padding[2]));

// Skip any decompositions with empty pencils
if (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2])) {
// Skip any decompositions with empty pencils, if disabled
if (!options->allow_empty_pencils &&
(grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2]))) {
continue;
}

Expand Down Expand Up @@ -588,8 +589,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
options->halo_padding));

// Skip any decompositions with empty pencils
if (std::max(grid_desc->config.pdims[0], grid_desc->config.pdims[1]) >
std::min(grid_desc->config.gdims[1], grid_desc->config.gdims[2])) {
if ((options->halo_axis == 0 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[1] ||
grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[2])) ||
(options->halo_axis == 1 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[0] ||
grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[2])) ||
(options->halo_axis == 2 && (grid_desc->config.pdims[0] > grid_desc->config.gdims_dist[0] ||
grid_desc->config.pdims[1] > grid_desc->config.gdims_dist[1]))) {
if (options->allow_empty_pencils) {
THROW_NOT_SUPPORTED("cannot perform halo autotuning on distributions with empty pencils");
}
continue;
}

Expand Down
20 changes: 1 addition & 19 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -717,13 +717,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming));
#endif

// Disable decompositions with empty pencils
if (!autotune_pdims &&
(grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
grid_desc->config.pdims[1] > std::min(grid_desc->config.gdims_dist[1], grid_desc->config.gdims_dist[2]))) {
THROW_NOT_SUPPORTED("grid descriptor settings yields a distribution with empty pencils");
}

// Run autotuning if requested
if (options) {
if (options->grid_mode == CUDECOMP_AUTOTUNE_GRID_TRANSPOSE) {
Expand Down Expand Up @@ -962,6 +955,7 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto
options->grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE;
options->dtype = CUDECOMP_DOUBLE;
options->allow_uneven_decompositions = true;
options->allow_empty_pencils = false;
options->disable_nccl_backends = false;
options->disable_nvshmem_backends = false;
options->skip_threshold = 0.0;
Expand Down Expand Up @@ -1386,9 +1380,6 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc
checkHandle(handle);
checkGridDesc(grid_desc);
checkDataType(dtype);
if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
switch (dtype) {
case CUDECOMP_FLOAT:
cudecompTransposeXToY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
Expand Down Expand Up @@ -1429,9 +1420,6 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc
checkHandle(handle);
checkGridDesc(grid_desc);
checkDataType(dtype);
if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
switch (dtype) {
case CUDECOMP_FLOAT:
cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
Expand Down Expand Up @@ -1472,9 +1460,6 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc
checkHandle(handle);
checkGridDesc(grid_desc);
checkDataType(dtype);
if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
switch (dtype) {
case CUDECOMP_FLOAT:
cudecompTransposeZToY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
Expand Down Expand Up @@ -1515,9 +1500,6 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc
checkHandle(handle);
checkGridDesc(grid_desc);
checkDataType(dtype);
if (!input) { THROW_INVALID_USAGE("input argument cannot be null"); }
if (!output) { THROW_INVALID_USAGE("output argument cannot be null"); }
if (!work) { THROW_INVALID_USAGE("work argument cannot be null"); }
switch (dtype) {
case CUDECOMP_FLOAT:
cudecompTransposeYToX(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
Expand Down
1 change: 1 addition & 0 deletions src/cudecomp_m.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ module cudecomp
integer(c_int32_t) :: grid_mode ! which communication (transpose/halo) to use to autotune process grid
integer(c_int32_t) :: dtype ! datatype to use during autotuning
logical(c_bool) :: allow_uneven_decompositions ! flag to control whether autotuning allows uneven decompositions (based on gdims_dist if provided, gdims otherwise)
logical(c_bool) :: allow_empty_pencils ! flag to control whether autotuning allows process grids that result in some processes having pencils with no elements (default: false)
logical(c_bool) :: disable_nccl_backends ! flag to disable NCCL backend options during autotuning
logical(c_bool) :: disable_nvshmem_backends ! flag to disable NVSHMEM backend options during autotuning
real(c_double) :: skip_threshold ! threshold used to skip testing slow configurations
Expand Down