From 6e3b496ac46288553e90ab0ef659e26e7ccbb043 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Sat, 30 Aug 2025 00:43:52 -0700 Subject: [PATCH 1/2] [clang][CUDA] Avoid accounting for tail padding in LLVM offloading --- clang/lib/CodeGen/CGCUDANV.cpp | 24 +++++++++++++++---- offload/include/Shared/APITypes.h | 2 ++ .../common/src/PluginInterface.cpp | 4 +++- offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +- .../offloading/CUDA/basic_launch_multi_arg.cu | 8 +++++++ 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index cb16fe1b36c68..59fb4b9e94bf9 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -327,9 +327,10 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, /// (void*, short, void*) is passed as {void **, short *, void **} to the launch /// function. For the LLVM/offload launch we flatten the arguments into the /// struct directly. In addition, we include the size of the arguments, thus -/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *}, -/// nullptr}. The last nullptr needs to be initialized to an array of pointers -/// pointing to the arguments if we want to offload to the host. +/// pass {size of ({void *, short, void *}) without tail padding, ptr to {void +/// *, short, void *}, nullptr}. The last nullptr needs to be initialized to an +/// array of pointers pointing to the arguments if we want to offload to the +/// host. Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, FunctionArgList &Args) { SmallVector ArgTypes, KernelLaunchParamsTypes; @@ -339,6 +340,7 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, auto *Int64Ty = CGF.Builder.getInt64Ty(); KernelLaunchParamsTypes.push_back(Int64Ty); + KernelLaunchParamsTypes.push_back(Int64Ty); KernelLaunchParamsTypes.push_back(PtrTy); KernelLaunchParamsTypes.push_back(PtrTy); @@ -351,12 +353,24 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, "kernel_launch_params"); auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy); + + // Avoid accounting the tail padding for CUDA. + auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero(); + if (auto N = KernelArgsTy->getNumElements()) { + auto *SL = CGM.getDataLayout().getStructLayout(KernelArgsTy); + KernelArgsSizeNoTailPadding = SL->getElementOffset(N - 1); + KernelArgsSizeNoTailPadding += CGM.getDataLayout().getTypeAllocSize( + KernelArgsTy->getElementType(N - 1)); + } + CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize), CGF.Builder.CreateStructGEP(KernelLaunchParams, 0)); - CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), + CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSizeNoTailPadding), CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); - CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), + CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); + CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 3)); for (unsigned i = 0; i < Args.size(); ++i) { auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i])); diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 8c150b6bfc2d4..52725a0474c6a 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -121,6 +121,8 @@ static_assert(sizeof(KernelArgsTy) == struct KernelLaunchParamsTy { /// Size of the Data array. size_t Size = 0; + /// Size of the Data array without tail padding. + size_t SizeNoTailPadding = 0; /// Flat array of kernel parameters. void *Data = nullptr; /// Ptrs to the Data entries. Only strictly required for the host plugin. diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 7d05dd25dbf75..979c026c219a6 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -571,7 +571,9 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs( (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]); Ptrs[I] = &Args[I]; } - return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]}; + + size_t ArgsSize = sizeof(void *) * NumArgs; + return KernelLaunchParamsTy{ArgsSize, ArgsSize, &Args[0], &Ptrs[0]}; } uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b2f840113cff3..81b33a93f16b1 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1414,7 +1414,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, CU_LAUNCH_PARAM_BUFFER_SIZE, - reinterpret_cast(&LaunchParams.Size), + reinterpret_cast(&LaunchParams.SizeNoTailPadding), CU_LAUNCH_PARAM_END}; // If we are running an RPC server we want to wake up the server thread diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu index 1f84a0e1288d4..ab6f753150932 100644 --- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -23,6 +23,10 @@ __global__ void square(int *Dst, short Q, int *Src, short P) { Src[1] = P; } +__global__ void accumulate(short Q, int *Dst, char P) { + *Dst += Q + P; +} + int main(int argc, char **argv) { int DevNo = 0; int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); @@ -39,5 +43,9 @@ int main(int argc, char **argv) { // CHECK: Ptr [[Ptr]], *Ptr: 42 printf("Src: %i : %i\n", Src[0], Src[1]); // CHECK: Src: 3 : 4 + accumulate<<<1, 1>>>(3, Ptr, 7); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 52 llvm_omp_target_free_shared(Ptr, DevNo); + llvm_omp_target_free_shared(Src, DevNo); } From 08dfe946eebd0b5d82a04f4ec79467129a4e3f91 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Thu, 4 Sep 2025 16:01:09 -0700 Subject: [PATCH 2/2] Use the same args size (without tail padding) for AMDGPU --- clang/lib/CodeGen/CGCUDANV.cpp | 17 ++++++----------- offload/include/Shared/APITypes.h | 4 +--- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 5 ----- .../common/src/PluginInterface.cpp | 2 +- offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +- 5 files changed, 9 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 59fb4b9e94bf9..a0aeb21bee380 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -340,7 +340,6 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, auto *Int64Ty = CGF.Builder.getInt64Ty(); KernelLaunchParamsTypes.push_back(Int64Ty); - KernelLaunchParamsTypes.push_back(Int64Ty); KernelLaunchParamsTypes.push_back(PtrTy); KernelLaunchParamsTypes.push_back(PtrTy); @@ -352,25 +351,21 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, KernelLaunchParamsTy, CharUnits::fromQuantity(16), "kernel_launch_params"); - auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy); - - // Avoid accounting the tail padding for CUDA. - auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero(); + // Avoid accounting the tail padding for the kernel arguments. + auto KernelArgsSize = llvm::TypeSize::getZero(); if (auto N = KernelArgsTy->getNumElements()) { auto *SL = CGM.getDataLayout().getStructLayout(KernelArgsTy); - KernelArgsSizeNoTailPadding = SL->getElementOffset(N - 1); - KernelArgsSizeNoTailPadding += CGM.getDataLayout().getTypeAllocSize( + KernelArgsSize += SL->getElementOffset(N - 1); + KernelArgsSize += CGM.getDataLayout().getTypeAllocSize( KernelArgsTy->getElementType(N - 1)); } CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize), CGF.Builder.CreateStructGEP(KernelLaunchParams, 0)); - CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSizeNoTailPadding), - CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), - CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); + CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), - CGF.Builder.CreateStructGEP(KernelLaunchParams, 3)); + CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); for (unsigned i = 0; i < Args.size(); ++i) { auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i])); diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 52725a0474c6a..9868370de3d73 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -119,10 +119,8 @@ static_assert(sizeof(KernelArgsTy) == /// Flat array of kernel launch parameters and their total size. struct KernelLaunchParamsTy { - /// Size of the Data array. + /// Size of the Data array without the tail padding. size_t Size = 0; - /// Size of the Data array without tail padding. - size_t SizeNoTailPadding = 0; /// Flat array of kernel parameters. void *Data = nullptr; /// Ptrs to the Data entries. Only strictly required for the host plugin. diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 64470e9fabf46..dc0e5c3ba11ef 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3658,11 +3658,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { - if (ArgsSize != LaunchParams.Size && - ArgsSize > LaunchParams.Size + getImplicitArgsSize()) - return Plugin::error(ErrorCode::INVALID_ARGUMENT, - "invalid kernel arguments size"); - AMDGPUPluginTy &AMDGPUPlugin = static_cast(GenericDevice.Plugin); AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 979c026c219a6..56006d03ef325 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -573,7 +573,7 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs( } size_t ArgsSize = sizeof(void *) * NumArgs; - return KernelLaunchParamsTy{ArgsSize, ArgsSize, &Args[0], &Ptrs[0]}; + return KernelLaunchParamsTy{ArgsSize, &Args[0], &Ptrs[0]}; } uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 81b33a93f16b1..b2f840113cff3 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1414,7 +1414,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, CU_LAUNCH_PARAM_BUFFER_SIZE, - reinterpret_cast(&LaunchParams.SizeNoTailPadding), + reinterpret_cast(&LaunchParams.Size), CU_LAUNCH_PARAM_END}; // If we are running an RPC server we want to wake up the server thread