Skip to content

Commit 35b2317

Browse files
authored
[AArch64] Support USDOT in performAddDotCombine (#171864)
This function performs the combine `ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)`, which applies equally to USDOT now that we have a node for it.
1 parent 1d821b0 commit 35b2317

File tree

3 files changed

+84
-73
lines changed

3 files changed

+84
-73
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21825,7 +21825,8 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
2182521825
// Handle commutivity
2182621826
auto isZeroDot = [](SDValue Dot) {
2182721827
return (Dot.getOpcode() == AArch64ISD::UDOT ||
21828-
Dot.getOpcode() == AArch64ISD::SDOT) &&
21828+
Dot.getOpcode() == AArch64ISD::SDOT ||
21829+
Dot.getOpcode() == AArch64ISD::USDOT) &&
2182921830
isZerosVector(Dot.getOperand(0).getNode());
2183021831
};
2183121832
if (!isZeroDot(Dot))

llvm/test/CodeGen/AArch64/aarch64-matmul.ll

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2-
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
3-
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

55
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
66
; CHECK-LABEL: smmla.v4i32.v16i8:
@@ -160,6 +160,42 @@ entry:
160160
ret <4 x i32> %vusdot1.i
161161
}
162162

163+
define <2 x i32> @usdot_add_zero.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
164+
; CHECK-SD-LABEL: usdot_add_zero.v2i32.v8i8:
165+
; CHECK-SD: // %bb.0: // %entry
166+
; CHECK-SD-NEXT: usdot v0.2s, v1.8b, v2.8b
167+
; CHECK-SD-NEXT: ret
168+
;
169+
; CHECK-GI-LABEL: usdot_add_zero.v2i32.v8i8:
170+
; CHECK-GI: // %bb.0: // %entry
171+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
172+
; CHECK-GI-NEXT: usdot v3.2s, v1.8b, v2.8b
173+
; CHECK-GI-NEXT: add v0.2s, v3.2s, v0.2s
174+
; CHECK-GI-NEXT: ret
175+
entry:
176+
%x = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %a, <8 x i8> %b)
177+
%y = add <2 x i32> %x, %r
178+
ret <2 x i32> %y
179+
}
180+
181+
define <4 x i32> @usdot_add_zero.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
182+
; CHECK-SD-LABEL: usdot_add_zero.v4i32.v16i8:
183+
; CHECK-SD: // %bb.0: // %entry
184+
; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
185+
; CHECK-SD-NEXT: ret
186+
;
187+
; CHECK-GI-LABEL: usdot_add_zero.v4i32.v16i8:
188+
; CHECK-GI: // %bb.0: // %entry
189+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
190+
; CHECK-GI-NEXT: usdot v3.4s, v1.16b, v2.16b
191+
; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
192+
; CHECK-GI-NEXT: ret
193+
entry:
194+
%x = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %a, <16 x i8> %b)
195+
%y = add <4 x i32> %x, %r
196+
ret <4 x i32> %y
197+
}
198+
163199
declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
164200
declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
165201
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2

llvm/test/CodeGen/AArch64/neon-dotreduce.ll

Lines changed: 44 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,11 +1375,9 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i
13751375
; CHECK-SD-LABEL: test_usdot_v8i8_double:
13761376
; CHECK-SD: // %bb.0: // %entry
13771377
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1378-
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1379-
; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b
13801378
; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b
1381-
; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
1382-
; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
1379+
; CHECK-SD-NEXT: usdot v4.2s, v0.8b, v1.8b
1380+
; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
13831381
; CHECK-SD-NEXT: fmov w0, s0
13841382
; CHECK-SD-NEXT: ret
13851383
;
@@ -1416,11 +1414,9 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8
14161414
; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
14171415
; CHECK-SD: // %bb.0: // %entry
14181416
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1419-
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1420-
; CHECK-SD-NEXT: usdot v5.2s, v1.8b, v0.8b
14211417
; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b
1422-
; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
1423-
; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
1418+
; CHECK-SD-NEXT: usdot v4.2s, v1.8b, v0.8b
1419+
; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
14241420
; CHECK-SD-NEXT: fmov w0, s0
14251421
; CHECK-SD-NEXT: ret
14261422
;
@@ -1457,11 +1453,9 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1
14571453
; CHECK-SD-LABEL: test_usdot_v16i8_double:
14581454
; CHECK-SD: // %bb.0: // %entry
14591455
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1460-
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1461-
; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b
14621456
; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b
1463-
; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
1464-
; CHECK-SD-NEXT: addv s0, v0.4s
1457+
; CHECK-SD-NEXT: usdot v4.4s, v0.16b, v1.16b
1458+
; CHECK-SD-NEXT: addv s0, v4.4s
14651459
; CHECK-SD-NEXT: fmov w0, s0
14661460
; CHECK-SD-NEXT: ret
14671461
;
@@ -1509,11 +1503,9 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b,
15091503
; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
15101504
; CHECK-SD: // %bb.0: // %entry
15111505
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1512-
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1513-
; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b
15141506
; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b
1515-
; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
1516-
; CHECK-SD-NEXT: addv s0, v0.4s
1507+
; CHECK-SD-NEXT: usdot v4.4s, v1.16b, v0.16b
1508+
; CHECK-SD-NEXT: addv s0, v4.4s
15171509
; CHECK-SD-NEXT: fmov w0, s0
15181510
; CHECK-SD-NEXT: ret
15191511
;
@@ -4384,12 +4376,10 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
43844376
; CHECK-SD-LABEL: test_usdot_v32i8:
43854377
; CHECK-SD: // %bb.0: // %entry
43864378
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
4387-
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
4388-
; CHECK-SD-NEXT: ldp q2, q3, [x0]
4389-
; CHECK-SD-NEXT: ldp q4, q5, [x1]
4390-
; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
4391-
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
4392-
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
4379+
; CHECK-SD-NEXT: ldp q1, q3, [x0]
4380+
; CHECK-SD-NEXT: ldp q2, q4, [x1]
4381+
; CHECK-SD-NEXT: usdot v0.4s, v3.16b, v4.16b
4382+
; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
43934383
; CHECK-SD-NEXT: addv s0, v0.4s
43944384
; CHECK-SD-NEXT: fmov w8, s0
43954385
; CHECK-SD-NEXT: add w0, w8, w2
@@ -4438,15 +4428,11 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
44384428
; CHECK-SD: // %bb.0: // %entry
44394429
; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
44404430
; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
4441-
; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
4442-
; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
4443-
; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
4444-
; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
4445-
; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
4446-
; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
4447-
; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
4448-
; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
4449-
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
4431+
; CHECK-SD-NEXT: usdot v17.4s, v1.16b, v3.16b
4432+
; CHECK-SD-NEXT: usdot v16.4s, v5.16b, v7.16b
4433+
; CHECK-SD-NEXT: usdot v17.4s, v0.16b, v2.16b
4434+
; CHECK-SD-NEXT: usdot v16.4s, v4.16b, v6.16b
4435+
; CHECK-SD-NEXT: add v0.4s, v17.4s, v16.4s
44504436
; CHECK-SD-NEXT: addv s0, v0.4s
44514437
; CHECK-SD-NEXT: fmov w0, s0
44524438
; CHECK-SD-NEXT: ret
@@ -8781,20 +8767,16 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %
87818767
; CHECK-SD-LABEL: test_usdot_v64i8:
87828768
; CHECK-SD: // %bb.0: // %entry
87838769
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
8784-
; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
8785-
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
8786-
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
8787-
; CHECK-SD-NEXT: ldp q1, q2, [x0, #32]
8788-
; CHECK-SD-NEXT: ldp q6, q7, [x1, #32]
8789-
; CHECK-SD-NEXT: ldp q16, q17, [x0]
8790-
; CHECK-SD-NEXT: ldp q18, q19, [x1]
8791-
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b
8792-
; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b
8793-
; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b
8794-
; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b
8795-
; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s
8796-
; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s
8797-
; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
8770+
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
8771+
; CHECK-SD-NEXT: ldp q2, q3, [x0, #32]
8772+
; CHECK-SD-NEXT: ldp q4, q5, [x1, #32]
8773+
; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
8774+
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
8775+
; CHECK-SD-NEXT: ldp q2, q3, [x0]
8776+
; CHECK-SD-NEXT: ldp q4, q5, [x1]
8777+
; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
8778+
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
8779+
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
87988780
; CHECK-SD-NEXT: addv s0, v0.4s
87998781
; CHECK-SD-NEXT: fmov w8, s0
88008782
; CHECK-SD-NEXT: add w0, w8, w2
@@ -8863,32 +8845,24 @@ entry:
88638845
define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
88648846
; CHECK-SD-LABEL: test_usdot_v64i8_double:
88658847
; CHECK-SD: // %bb.0: // %entry
8848+
; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
8849+
; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
88668850
; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
8867-
; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
8868-
; CHECK-SD-NEXT: movi v22.2d, #0000000000000000
8869-
; CHECK-SD-NEXT: movi v23.2d, #0000000000000000
8870-
; CHECK-SD-NEXT: ldp q16, q17, [sp, #64]
8871-
; CHECK-SD-NEXT: movi v24.2d, #0000000000000000
8872-
; CHECK-SD-NEXT: movi v25.2d, #0000000000000000
8873-
; CHECK-SD-NEXT: movi v26.2d, #0000000000000000
8874-
; CHECK-SD-NEXT: movi v27.2d, #0000000000000000
8875-
; CHECK-SD-NEXT: ldp q19, q20, [sp, #96]
8876-
; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b
8877-
; CHECK-SD-NEXT: ldp q3, q7, [sp, #32]
8878-
; CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b
8879-
; CHECK-SD-NEXT: ldp q1, q5, [sp]
8880-
; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b
8881-
; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b
8882-
; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b
8883-
; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b
8884-
; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b
8885-
; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b
8886-
; CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s
8887-
; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s
8888-
; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s
8889-
; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s
8890-
; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
8891-
; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
8851+
; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
8852+
; CHECK-SD-NEXT: ldp q20, q21, [sp, #96]
8853+
; CHECK-SD-NEXT: ldp q22, q23, [sp, #32]
8854+
; CHECK-SD-NEXT: usdot v16.4s, v3.16b, v7.16b
8855+
; CHECK-SD-NEXT: usdot v18.4s, v2.16b, v6.16b
8856+
; CHECK-SD-NEXT: usdot v19.4s, v23.16b, v21.16b
8857+
; CHECK-SD-NEXT: usdot v17.4s, v22.16b, v20.16b
8858+
; CHECK-SD-NEXT: ldp q2, q3, [sp, #64]
8859+
; CHECK-SD-NEXT: ldp q6, q7, [sp]
8860+
; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v5.16b
8861+
; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v4.16b
8862+
; CHECK-SD-NEXT: usdot v19.4s, v7.16b, v3.16b
8863+
; CHECK-SD-NEXT: usdot v17.4s, v6.16b, v2.16b
8864+
; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
8865+
; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
88928866
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
88938867
; CHECK-SD-NEXT: addv s0, v0.4s
88948868
; CHECK-SD-NEXT: fmov w0, s0

0 commit comments

Comments
 (0)