-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV][LAA] Vectorize math lib calls with mem write-only attribute #78432
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis Author: Paschalis Mpeis (paschalis-mpeis) ChangesTeach LAA to consider safe specific math lib calls which are known to This happens only when the calls are found through TLI to have Full diff: https://github.com/llvm/llvm-project/pull/78432.diff 2 Files Affected:
diff --git a/clang/test/CodeGen/aarch64-veclib-function-calls-linear-ptrs.c b/clang/test/CodeGen/aarch64-veclib-function-calls-linear-ptrs.c
new file mode 100644
index 000000000000000..957b3f5cb235d31
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-veclib-function-calls-linear-ptrs.c
@@ -0,0 +1,54 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --filter "call.*(frexp|modf)" --version 4
+// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve -O3 -mllvm -vector-library=ArmPL -mllvm -force-vector-interleave=1 -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize -emit-llvm -S -o - %s | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+/*
+Testing vectorization of math functions that have the attribute write-only to
+memory set. Given they have vectorized counterparts, they should be able to
+vectorize.
+*/
+
+// The following define is required to access some math functions.
+#define _GNU_SOURCE
+#include <math.h>
+
+// frexp/frexpf have no TLI mappings yet.
+
+// CHECK-LABEL: define dso_local void @frexp_f64(
+// CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef writeonly [[OUT1:%.*]], ptr nocapture noundef writeonly [[OUT2:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK: [[CALL:%.*]] = tail call double @frexp(double noundef [[TMP0:%.*]], ptr noundef [[ADD_PTR:%.*]]) #[[ATTR5:[0-9]+]]
+//
+void frexp_f64(double *in, double *out1, int *out2, int N) {
+ for (int i = 0; i < N; ++i)
+ *out1 = frexp(in[i], out2+i);
+}
+
+// CHECK-LABEL: define dso_local void @frexp_f32(
+// CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef writeonly [[OUT1:%.*]], ptr nocapture noundef writeonly [[OUT2:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK: [[CALL:%.*]] = tail call float @frexpf(float noundef [[TMP0:%.*]], ptr noundef [[ADD_PTR:%.*]]) #[[ATTR5]]
+//
+void frexp_f32(float *in, float *out1, int *out2, int N) {
+ for (int i = 0; i < N; ++i)
+ *out1 = frexpf(in[i], out2+i);
+}
+
+// CHECK-LABEL: define dso_local void @modf_f64(
+// CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef writeonly [[OUT1:%.*]], ptr nocapture noundef writeonly [[OUT2:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK: [[TMP11:%.*]] = tail call <vscale x 2 x double> @armpl_svmodf_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP10:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+// CHECK: [[CALL:%.*]] = tail call double @modf(double noundef [[TMP14:%.*]], ptr noundef [[ADD_PTR:%.*]]) #[[ATTR6:[0-9]+]]
+//
+void modf_f64(double *in, double *out1, double *out2, int N) {
+ for (int i = 0; i < N; ++i)
+ out1[i] = modf(in[i], out2+i);
+}
+
+// CHECK-LABEL: define dso_local void @modf_f32(
+// CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef writeonly [[OUT1:%.*]], ptr nocapture noundef writeonly [[OUT2:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK: [[TMP11:%.*]] = tail call <vscale x 4 x float> @armpl_svmodf_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP10:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+// CHECK: [[CALL:%.*]] = tail call float @modff(float noundef [[TMP14:%.*]], ptr noundef [[ADD_PTR:%.*]]) #[[ATTR7:[0-9]+]]
+//
+void modf_f32(float *in, float *out1, float *out2, int N) {
+ for (int i = 0; i < N; ++i)
+ out1[i] = modff(in[i], out2+i);
+}
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index aed60cc5a3f5ef0..0c8b4e51fcf5c16 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2274,6 +2274,20 @@ bool LoopAccessInfo::canAnalyzeLoop() {
return true;
}
+/// Returns whether \p I is a known math library call that has memory write-only
+/// attribute set.
+static bool isMathLibCallMemWriteOnly(const TargetLibraryInfo *TLI,
+ const Instruction &I) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ return false;
+
+ LibFunc Func;
+ TLI->getLibFunc(*Call, Func);
+ return Func == LibFunc::LibFunc_modf || Func == LibFunc::LibFunc_modff ||
+ Func == LibFunc::LibFunc_frexp || Func == LibFunc::LibFunc_frexpf;
+}
+
void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
const TargetLibraryInfo *TLI,
DominatorTree *DT) {
@@ -2364,6 +2378,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// Save 'store' instructions. Abort if other instructions write to memory.
if (I.mayWriteToMemory()) {
+ // We can safety handle math functions that have vectorized
+ // counterparts and have the memory write-only attribute set.
+ if (isMathLibCallMemWriteOnly(TLI, I))
+ continue;
+
auto *St = dyn_cast<StoreInst>(&I);
if (!St) {
recordAnalysis("CantVectorizeInstruction", St)
|
1afad21
to
db90ef5
Compare
db90ef5
to
a18b4fe
Compare
a18b4fe
to
2a38526
Compare
Rebased to main after a couple of weeks of inactivity. Note: windows x64 build failure seems unrelated; more likely a wide problem, failing at cmake configure. |
void modf_f32(float *in, float *out1, float *out2, int N) { | ||
for (int i = 0; i < N; ++i) | ||
out1[i] = modff(in[i], out2+i); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test needs to be in either llvm/test/Transforms/LoopVectorize
or llvm/test/Analysis/LoopAccessAnalysis
, depending on what exactly you want to test.
I don't really get what this test is checking for though, it doesn't look like anything was actually vectorized?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test needs to be in either llvm/test/Transforms/LoopVectorize or llvm/test/Analysis/LoopAccessAnalysis, depending on what exactly you want to test.
I've used a C test, hence the placement. You'd prefer an LLVM IR test?
I don't really get what this test is checking for though, it doesn't look like anything was actually vectorized?
Actually, the below patch removed modf/modff TLI mappings since this one was opened, due to some errors that were discovered. I'm converting this patch to a draft until those mappings are back again.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was converted to LLVM tests for LoopVectorizer, and also added LoopAccessAnalysis tests.
Finally, the PR is stacked on top of #83143, which will re-introduce at least the modf/modff mappings
@@ -2405,6 +2421,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, | |||
|
|||
// Save 'store' instructions. Abort if other instructions write to memory. | |||
if (I.mayWriteToMemory()) { | |||
// We can safety handle math functions that have vectorized | |||
// counterparts and have the memory write-only attribute set. | |||
if (isMathLibCallMemWriteOnly(TLI, I)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can this change be tested with a loop-access analysis tests using a debug output? if yes then it does not have to depend on the TLI mappings for modf/modff
2a38526
to
513ba7a
Compare
f28e739
to
36b0899
Compare
513ba7a
to
00a39d5
Compare
513ba7a
to
7ff8089
Compare
Rebased to main after #80296 was merged: |
// counterparts and have the memory write-only attribute set. | ||
if (isMathLibCallMemWriteOnly(TLI, I)) { | ||
LLVM_DEBUG(dbgs() | ||
<< "LAA: allow math function with write-only attribute:" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it is worth to edit this message a bit so it gives the full context in dbg output, what about:
"LAA: Allow to vectorize math function with write-only attribute:" ?
(also the dbg message start from capital letter)
|
||
target triple = "aarch64-unknown-linux-gnu" | ||
|
||
; TODO: add mappings for frexp/frexpf |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can still test for the expected dbg output even if there is no mappings, mappings are tested later by LV I think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update:
- Using the
-passes='print<access-info>'
as suggested allows this. Thanks!
Original response (outdated):
It turns out without the mappings LV Legality does not let it vectorize earlier (here):
LV: Not vectorizing: Found a non-intrinsic callsite %call = tail call float @frexpf(float noundef %0, ptr noundef %add.ptr)
..
LV: Not vectorizing: Cannot prove legality
I will keep the those LAA frexp/frexpf tests, but I'll convert TODO's in a similar fashion with the next comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you planning to add support to LoopVectorizationLegality as well? Would probably good to do this first
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Short answer: Not needed.
Longer answer:
Using -passes='print<access-info>'
(as per @mgabka suggestion), LAA pass can run regardless of vecLib existence, TLI mappings, and/or even SVE support.
Previously, in the LAA test attr-mem-write-only.ll I had:
-mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize
and yet, I couldn't check for the fexpr calls, as LoopVectorizationLegality
would not allow vectorization, causing LAA to never run on such examples.
With the latest patch, however, LAA runs regardless, so I'm able to do such checks already (here and here).
In the future, when the mappings for frexp are added, only the LV tests would need to be updated (AArch64/veclib-function-calls-linear-ptrs.ll).
|
||
target triple = "aarch64-unknown-linux-gnu" | ||
|
||
; TODO: add mappings for frexp/frexpf |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this actually a TODO?
I would rather change it into a comment:
; Vectorization can not happen because there is no scalar to vector mapping in TLI for frexp/frexpf. Tests will need to be changed when such mapping are added.
@@ -0,0 +1,117 @@ | |||
; RUN: opt < %s -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -debug-only=loop-accesses -disable-output 2>&1 | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you should only be calling here : "-passes='print' -debug-only=loop-accesses -disable-output"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, using -passes='print<access-info>'
is a better option here:
- it allows running LAA on the code regardless of LV's Legality result
- this would allow checking for
frexp
methods in a target agnostic manner.
Great suggestion, thanks!
@@ -2422,6 +2438,15 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, | |||
|
|||
// Save 'store' instructions. Abort if other instructions write to memory. | |||
if (I.mayWriteToMemory()) { | |||
// We can safety handle math functions that have vectorized | |||
// counterparts and have the memory write-only attribute set. | |||
if (isMathLibCallMemWriteOnly(TLI, I)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in the read case we also check for !VFDatabase::getMappings(*Call).empty(), but i think it should be enough that LV checks for it, that allows also to test your code in a target agnostic way, and without mappings for frexp
7ff8089
to
4b7b976
Compare
4b7b976
to
c3aa63c
Compare
@@ -0,0 +1,134 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(frexp|modf)" --version 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sincos (which also takes linear pointers) has tests inside veclib-function-calls.ll, so I would suggest to move this tests to the same file.
Functions like modf/modff are math lib calls that set memory write-only attribute. Given that a target has vectorized mappings, LAA should allow vectorization.
Teach LAA to consider safe specific math lib calls which are known to have set the memory write-only attribute. Those attributes are set to calls by inferNonMandatoryLibFuncAttrs, in BuildLibCalls.cpp, and the current ones are modf/modff and frexp/frexpf. This happens only when the calls are found through TLI to have vectorized counterparts.
Removed C test. Code rebased on top of patch that enables mappings for modf/modff (among others).
Rebased history to introduce tests in 'veclib-function-calls.ll' and delete the (now unnecessary) test veclib-function-calls-linear-ptrs.ll. The commit history were modified as follows: - The initial commit (that showcases what was missing) was amended to add the tests in veclib-function-calls.ll. - Then, subsequent commits were similarly amended to include the updates that allow vectorization. - This current commit simply dropped the no longer needed tests (veclib-function-calls-linear-ptrs.ll)
c3aa63c
to
03fd4ea
Compare
Rebased to:
For (2), the commit history were modified as follows:
|
@@ -0,0 +1,116 @@ | |||
; RUN: opt < %s -passes='print<access-info>' -debug-only=loop-accesses -disable-output 2>&1 | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please also check the access-info
report.
@@ -2477,6 +2493,15 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, | |||
|
|||
// Save 'store' instructions. Abort if other instructions write to memory. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
comment out of date
define void @frexp_f64(ptr %in, ptr %out1, ptr %out2, i32 %N) { | ||
; CHECK: LAA: Allow to vectorize math function with write-only attribute: %call = tail call double @frexp | ||
entry: | ||
%cmp4 = icmp sgt i32 %N, 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this shouldn't be needed, please try to simplify the test cases
br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup | ||
|
||
for.body.preheader: | ||
%wide.trip.count = zext nneg i32 %N to i64 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this shouldn't be needed, please try to simplify the test cases
ret void | ||
|
||
for.body: | ||
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please strip %indvars.
prefix to keep value names more concise
@@ -2477,6 +2493,15 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, | |||
|
|||
// Save 'store' instructions. Abort if other instructions write to memory. | |||
if (I.mayWriteToMemory()) { | |||
// We can safety handle math functions that have vectorized |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo: safely
@@ -2477,6 +2493,15 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, | |||
|
|||
// Save 'store' instructions. Abort if other instructions write to memory. | |||
if (I.mayWriteToMemory()) { | |||
// We can safety handle math functions that have vectorized | |||
// counterparts and have the memory write-only attribute set. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure the reasoning here is correct; the writes could still alias with other accesses,
e.g. something like below would be considered as safe, but %out2
could overlap with %in
in a way that results in a backwards dependence at runtime.
define void @frexp_f64(ptr %in, ptr %out1, ptr %out2, i32 %N) {
entry:
%cmp4 = icmp sgt i32 %N, 0
br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader:
%wide.trip.count = zext nneg i32 %N to i64
br label %for.body
for.cond.cleanup:
ret void
for.body:
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds double, ptr %in, i64 %indvars.iv
%0 = load double, ptr %arrayidx, align 8
%add.ptr = getelementptr inbounds i32, ptr %out2, i64 %indvars.iv
%call = tail call double @frexp(double noundef %0, ptr noundef %add.ptr)
store double %call, ptr %arrayidx, align 8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
declare double @frexp(double, ptr) #1
attributes #1 = { memory(argmem: write) }
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @fhahn,
you are right.
The whole logic was based on checking later in the
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { |
@paschalis-mpeis that means that there is an existing bug with sincos, as following code gets vectorised, without any checks:
double sin[N];
double cos[N];
sin[5] = M_PI;
for(int i=0; i<N; i++ ) {
sincos(sin[5], sin+i, cos+i);
}
Teach LAA to consider safe specific math lib calls which are known to
have set the memory write-only attribute. Those attributes are set to
calls by
inferNonMandatoryLibFuncAttrs
, in BuildLibCalls.cpp, and thecurrent ones are
modf
/modff
andfrexp
/frexpf
.This happens only when the calls are found through TLI to have vectorized counterparts.
Add tests for LAA and LoopVectorizer.