Skip to content

Commit f57f338

Browse files
authored
[flang][cuda] Add double descriptor information in allocate/deallocate operations (#170901)
After #169740, the cuf allocate and deallocate operations can be converted later. Update how the double-descriptor case is recognized by recording this information directly on the operations themselves.
1 parent ad1edc9 commit f57f338

File tree

7 files changed

+55
-37
lines changed

7 files changed

+55
-37
lines changed

flang/include/flang/Lower/CUDA.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
6666
/// there is a conversion. Return null otherwise.
6767
hlfir::ElementalOp isTransferWithConversion(mlir::Value rhs);
6868

69+
/// Check if the value is an allocatable with double descriptor.
70+
bool hasDoubleDescriptor(mlir::Value);
71+
6972
} // end namespace Fortran::lower
7073

7174
#endif // FORTRAN_LOWER_CUDA_H

flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
100100
Optional<fir_ReferenceType>:$stream,
101101
Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
102102
Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
103-
cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat);
103+
cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
104+
UnitAttr:$hasDoubleDescriptor);
104105

105106
let results = (outs AnyIntegerType:$stat);
106107

@@ -126,9 +127,9 @@ def cuf_DeallocateOp : cuf_Op<"deallocate",
126127
}];
127128

128129
let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
129-
Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
130-
cuf_DataAttributeAttr:$data_attr,
131-
UnitAttr:$hasStat);
130+
Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
131+
cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
132+
UnitAttr:$hasDoubleDescriptor);
132133

133134
let results = (outs AnyIntegerType:$stat);
134135

flang/lib/Lower/Allocatable.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -798,10 +798,12 @@ class AllocateStmtHelper {
798798
// Keep return type the same as a standard AllocatableAllocate call.
799799
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
800800

801+
bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
801802
return cuf::AllocateOp::create(
802803
builder, loc, retTy, box.getAddr(), errmsg, stream, pinned,
803804
source, cudaAttr,
804-
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
805+
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
806+
doubleDescriptors ? builder.getUnitAttr() : nullptr)
805807
.getResult();
806808
}
807809

@@ -865,11 +867,13 @@ static mlir::Value genCudaDeallocate(fir::FirOpBuilder &builder,
865867
? nullptr
866868
: errorManager.errMsgAddr;
867869

868-
// Keep return type the same as a standard AllocatableAllocate call.
870+
// Keep return type the same as a standard AllocatableDeallocate call.
869871
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
872+
bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
870873
return cuf::DeallocateOp::create(
871874
builder, loc, retTy, box.getAddr(), errmsg, cudaAttr,
872-
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
875+
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
876+
doubleDescriptors ? builder.getUnitAttr() : nullptr)
873877
.getResult();
874878
}
875879

flang/lib/Lower/CUDA.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,17 @@ hlfir::ElementalOp Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
9191
return elOp;
9292
return {};
9393
}
94+
95+
/// Return true when \p addr is the result of an hlfir.declare whose memref is
/// produced directly by a fir.address_of and whose data attribute is not
/// `pinned`. Return false in every other case, including when \p addr has no
/// defining operation or is defined by anything other than an hlfir.declare.
bool Fortran::lower::hasDoubleDescriptor(mlir::Value addr) {
  auto decl = mlir::dyn_cast_or_null<hlfir::DeclareOp>(addr.getDefiningOp());
  if (!decl)
    return false;

  // Only declarations whose storage comes straight from fir.address_of
  // qualify.
  auto *memrefDef = decl.getMemref().getDefiningOp();
  if (!mlir::isa_and_nonnull<fir::AddrOfOp>(memrefDef))
    return false;

  // Pinned variables are explicitly excluded.
  auto dataAttr = decl.getDataAttr();
  const bool isPinned = dataAttr && *dataAttr == cuf::DataAttribute::Pinned;
  return !isPinned;
}

flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -62,28 +62,6 @@ static inline unsigned getMemType(cuf::DataAttribute attr) {
6262
llvm_unreachable("unsupported memory type");
6363
}
6464

65-
template <typename OpTy>
66-
static bool hasDoubleDescriptors(OpTy op) {
67-
if (auto declareOp =
68-
mlir::dyn_cast_or_null<fir::DeclareOp>(op.getBox().getDefiningOp())) {
69-
if (mlir::isa_and_nonnull<fir::AddrOfOp>(
70-
declareOp.getMemref().getDefiningOp())) {
71-
if (isPinned(declareOp))
72-
return false;
73-
return true;
74-
}
75-
} else if (auto declareOp = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
76-
op.getBox().getDefiningOp())) {
77-
if (mlir::isa_and_nonnull<fir::AddrOfOp>(
78-
declareOp.getMemref().getDefiningOp())) {
79-
if (isPinned(declareOp))
80-
return false;
81-
return true;
82-
}
83-
}
84-
return false;
85-
}
86-
8765
static bool inDeviceContext(mlir::Operation *op) {
8866
if (op->getParentOfType<cuf::KernelOp>())
8967
return true;
@@ -353,7 +331,7 @@ struct CUFAllocateOpConversion
353331
fir::FortranVariableFlagsEnum::pointer))
354332
isPointer = true;
355333

356-
if (hasDoubleDescriptors(op)) {
334+
if (op.getHasDoubleDescriptor()) {
357335
// Allocation for module variable are done with custom runtime entry point
358336
// so the descriptors can be synchronized.
359337
mlir::func::FuncOp func;
@@ -406,7 +384,7 @@ struct CUFDeallocateOpConversion
406384
fir::FirOpBuilder builder(rewriter, mod);
407385
mlir::Location loc = op.getLoc();
408386

409-
if (hasDoubleDescriptors(op)) {
387+
if (op.getHasDoubleDescriptor()) {
410388
// Deallocation for module variable are done with custom runtime entry
411389
// point so the descriptors can be synchronized.
412390
mlir::func::FuncOp func =

flang/test/Fir/CUDA/cuda-allocate.fir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ fir.global @_QMmod1Ea {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.
3737
func.func @_QPsub3() {
3838
%0 = fir.address_of(@_QMmod1Ea) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
3939
%1:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
40-
%2 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
41-
%3 = cuf.deallocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
40+
%2 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
41+
%3 = cuf.deallocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
4242
return
4343
}
4444

@@ -109,7 +109,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
109109
%3 = fir.convert %c1 : (index) -> i64
110110
%4 = fir.convert %c10_i32 : (i32) -> i64
111111
fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
112-
%6 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>} -> i32
112+
%6 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
113113
return
114114
}
115115

@@ -158,7 +158,7 @@ func.func @_QMmod1Pallocate_source_global() {
158158
%2 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
159159
%6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
160160
%7 = fir.load %6 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
161-
%21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
161+
%21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
162162
return
163163
}
164164

@@ -226,7 +226,7 @@ func.func @_QQpointer_sync() attributes {fir.bindc_name = "test"} {
226226
%3 = fir.convert %c1 : (index) -> i64
227227
%4 = fir.convert %c10_i32 : (i32) -> i64
228228
fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
229-
%6 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>} -> i32
229+
%6 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
230230
return
231231
}
232232

@@ -246,7 +246,7 @@ func.func @_QMmod1Ppointer_source_global() {
246246
%2 = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
247247
%6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
248248
%7 = fir.load %6 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
249-
%21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
249+
%21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
250250
return
251251
}
252252

flang/test/Lower/CUDA/cuda-allocatable.cuf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,3 +235,21 @@ end subroutine
235235

236236
! CHECK-LABEL: func.func @_QPcuda_component()
237237
! CHECK: cuf.allocate
238+
239+
subroutine module_allocate()
240+
use globals
241+
allocate(a_device(10))
242+
allocate(a_managed(10))
243+
allocate(a_pinned(10))
244+
deallocate(a_device)
245+
deallocate(a_managed)
246+
deallocate(a_pinned)
247+
end subroutine
248+
249+
! CHECK-LABEL: func.func @_QPmodule_allocate()
250+
! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
251+
! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
252+
! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
253+
! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
254+
! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
255+
! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32

0 commit comments

Comments
 (0)