Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BlockPlacement] Add flag to disable profile usage #102956

Closed
ellishg wants to merge 1 commit from the block-placement-pgo branch

Conversation

ellishg
Copy link
Contributor

@ellishg ellishg commented Aug 12, 2024

Add the -block-placement-use-profile LLVM flag to enable or disable the use of profile data when making block placement decisions.

When building with -Oz, consuming profiles can drastically increase binary size. We found that -block-placement-use-profile=false gives a slight text-size win, which mitigates some of this regression.

@llvmbot
Copy link
Collaborator

llvmbot commented Aug 12, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Ellis Hoag (ellishg)

Changes

Add the -block-placement-use-profile LLVM flag to enable or disable the use of profile data when making block placement decisions.

When building with -Oz, consuming profiles can drastically increase binary size. We found that -block-placement-use-profile=false gives a slight text-size win, which mitigates some of this regression.


Full diff: https://github.com/llvm/llvm-project/pull/102956.diff

5 Files Affected:

  • (modified) llvm/include/llvm/Transforms/Utils/LoopPeel.h (+2-1)
  • (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+14-8)
  • (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+9-2)
  • (modified) llvm/lib/Transforms/Utils/LoopPeel.cpp (+3-3)
  • (modified) llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll (+4-3)
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 0b78700ca71bb..987c21b7ca561 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -37,7 +37,8 @@ gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
 void computePeelCount(Loop *L, unsigned LoopSize,
                       TargetTransformInfo::PeelingPreferences &PP,
                       unsigned TripCount, DominatorTree &DT,
-                      ScalarEvolution &SE, AssumptionCache *AC = nullptr,
+                      ScalarEvolution &SE, bool UseBranchWeights,
+                      AssumptionCache *AC = nullptr,
                       unsigned Threshold = UINT_MAX);
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index be783bc4e2973..8d5cdc9c08b7f 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -219,6 +219,10 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
              "block placement."),
     cl::init(UINT_MAX), cl::Hidden);
 
+static cl::opt<bool>
+    UseProfileData("block-placement-use-profile", cl::init(true), cl::Hidden,
+                   cl::desc("Use profile data to do precise benefit analysis"));
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -1220,7 +1224,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
 
   // If profile information is available, findDuplicateCandidates can do more
   // precise benefit analysis.
-  if (F->getFunction().hasProfileData())
+  if (UseProfileData && F->getFunction().hasProfileData())
     return true;
 
   // This is mainly for function exit BB.
@@ -1388,7 +1392,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
 // When profile is available, we need to handle the triangle-shape CFG.
 static BranchProbability getLayoutSuccessorProbThreshold(
       const MachineBasicBlock *BB) {
-  if (!BB->getParent()->getFunction().hasProfileData())
+  if (!UseProfileData || !BB->getParent()->getFunction().hasProfileData())
     return BranchProbability(StaticLikelyProb, 100);
   if (BB->succ_size() == 2) {
     const MachineBasicBlock *Succ1 = *BB->succ_begin();
@@ -2621,7 +2625,8 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
   // will be merged into the first outer loop chain for which this block is not
   // cold anymore. This needs precise profile data and we only do this when
   // profile data is available.
-  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
+  if ((UseProfileData && F->getFunction().hasProfileData()) ||
+      ForceLoopColdBlock) {
     BlockFrequency LoopFreq(0);
     for (auto *LoopPred : L.getHeader()->predecessors())
       if (!L.contains(LoopPred))
@@ -2670,8 +2675,8 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
   // this loop by modeling costs more precisely which requires the profile data
   // for better layout.
   bool RotateLoopWithProfile =
-      ForcePreciseRotationCost ||
-      (PreciseRotationCost && F->getFunction().hasProfileData());
+      ForcePreciseRotationCost || (PreciseRotationCost && UseProfileData &&
+                                   F->getFunction().hasProfileData());
 
   // First check to see if there is an obviously preferable top block for the
   // loop. This will default to the header, but may end up as one of the
@@ -3208,7 +3213,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
   bool IsSimple = TailDup.isSimpleBB(BB);
   SmallVector<MachineBasicBlock *, 8> CandidatePreds;
   SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
-  if (F->getFunction().hasProfileData()) {
+  if (UseProfileData && F->getFunction().hasProfileData()) {
     // We can do partial duplication with precise profile information.
     findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
     if (CandidatePreds.size() == 0)
@@ -3409,7 +3414,7 @@ void MachineBlockPlacement::findDuplicateCandidates(
 
 void MachineBlockPlacement::initDupThreshold() {
   DupThreshold = BlockFrequency(0);
-  if (!F->getFunction().hasProfileData())
+  if (!UseProfileData || !F->getFunction().hasProfileData())
     return;
 
   // We prefer to use prifile count.
@@ -3529,7 +3534,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
 
   // Apply a post-processing optimizing block placement.
   if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
-      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
+      (ApplyExtTspWithoutProfile ||
+       (UseProfileData && MF.getFunction().hasProfileData())) &&
       MF.size() <= ExtTspBlockPlacementMaxBlocks) {
     // Find a new placement and modify the layout of the blocks in the function.
     applyExtTsp();
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index cbc35b6dd4292..0a446851acf2d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -179,6 +179,12 @@ static cl::opt<unsigned> PragmaUnrollFullMaxIterations(
     "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
     cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
 
+static cl::opt<bool>
+    UseBranchWeights("loop-unroll-use-branch-weights", cl::init(true),
+                     cl::Hidden,
+                     cl::desc("Estimate loop trip counts with branch weight "
+                              "metadata to help determine the peel count"));
+
 /// A magic value for use with the Threshold parameter to indicate
 /// that the loop unroll should be performed regardless of how much
 /// code expansion would result.
@@ -1012,7 +1018,8 @@ bool llvm::computeUnrollCount(
   }
 
   // 5th priority is loop peeling.
-  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
+  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UseBranchWeights, AC,
+                   UP.Threshold);
   if (PP.PeelCount) {
     UP.Runtime = false;
     UP.Count = 1;
@@ -1081,7 +1088,7 @@ bool llvm::computeUnrollCount(
   }
 
   // Check if the runtime trip count is too small when profile is available.
-  if (L->getHeader()->getParent()->hasProfileData()) {
+  if (UseBranchWeights && L->getHeader()->getParent()->hasProfileData()) {
     if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
       if (*ProfileTripCount < FlatLoopTripCountThreshold)
         return false;
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 5d7c0d947facc..9557d31a122a6 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -538,8 +538,8 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                             TargetTransformInfo::PeelingPreferences &PP,
                             unsigned TripCount, DominatorTree &DT,
-                            ScalarEvolution &SE, AssumptionCache *AC,
-                            unsigned Threshold) {
+                            ScalarEvolution &SE, bool UseBranchWeights,
+                            AssumptionCache *AC, unsigned Threshold) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
   // Save the PP.PeelCount value set by the target in
   // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -632,7 +632,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   // hit the peeled section.
   // We only do this in the presence of profile information, since otherwise
   // our estimates of the trip count are not reliable enough.
-  if (L->getHeader()->getParent()->hasProfileData()) {
+  if (UseBranchWeights && L->getHeader()->getParent()->hasProfileData()) {
     if (violatesLegacyMultiExitLoopCheck(L))
       return;
     std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
index e3cfe53950f57..c7fb389c63595 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s
+; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s --check-prefixes=CHECK,PGO
+; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 -loop-unroll-use-branch-weights=false 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 declare void @f1()
@@ -11,8 +12,8 @@ declare void @f2()
 define void @test1(i32 %k) !prof !4 {
 ; CHECK: Loop Unroll: F[test1] Loop %for.body
 ; CHECK: PEELING loop %for.body with iteration count 2!
-; CHECK: PEELING loop %for.body with iteration count 5!
-; CHECK: llvm.loop.unroll.disable
+; PGO: PEELING loop %for.body with iteration count 5!
+; PGO: llvm.loop.unroll.disable
 for.body.lr.ph:
   br label %for.body
 

@ellishg
Copy link
Contributor Author

ellishg commented Aug 30, 2024

Friendly ping for review @amharc.

This flag is also useful for triaging size regressions that appear when consuming profiles. Many other passes have similar flags; #102950 is a similar PR.

@spupyrev spupyrev self-requested a review September 3, 2024 20:38
@ellishg ellishg closed this Oct 2, 2024
@ellishg ellishg deleted the block-placement-pgo branch October 2, 2024 23:53
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants