[PATCH] D108138: [SimplifyCFG] Remove switch statements before vectorization

Kerry McLaughlin via Phabricator via cfe-commits Mon, 16 Aug 2021 08:36:07 -0700

kmclaughlin created this revision.
kmclaughlin added reviewers: david-arm, fhahn, dmgreen, craig.topper, 
lebedev.ri.
Herald added subscribers: ctetreau, ormris, wenlei, steven_wu, hiraditya, 
kristof.beyls.
kmclaughlin requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.


This patch adds a new function, TurnSmallSwitchIntoICmps, to SimplifyCFG
which attempts to replace small switch statements with a series of conditional
branches and compares. The purpose of this is to allow vectorization of loops
that cannot currently be vectorized due to the presence of switch statements.
We now run SimplifyCFG to unswitch just before the vectorizer; if the loop is
not vectorized, the switch is added back afterwards.

Two new options have been added. The first is `-remove-switch-blocks`, which
enables/disables this feature and is on by default. The second is
`-switch-removal-threshold`, which sets the threshold number of switch cases
that we will convert to branches & compares; above this threshold we will not
attempt to convert the switch. If unspecified, the default value used here
initially is 4.

The following tests have been added:

- SimplifyCFG/remove-switches.ll: Tests the changes to SimplifyCFG to replace 
switch statements & ensures branch weights are updated correctly if provided.
- LoopVectorize/AArch64/sve-remove-switches.ll: Tests that we can vectorize 
loops with switch statements with scalable vectors. Also tests that, where 
vectorization is not possible, the switch statement is created again.
- LoopVectorize/remove-switches.ll: Ensures that we do not vectorize the loop 
if the target doesn't support masked loads & stores, where the cost would be 
too high.

Patch originally by David Sherwood


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D108138

Files:
  clang/test/Frontend/optimization-remark-analysis.c
  llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
  llvm/lib/Passes/PassBuilder.cpp
  llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
  llvm/lib/Transforms/Utils/SimplifyCFG.cpp
  llvm/test/Other/new-pm-defaults.ll
  llvm/test/Other/new-pm-lto-defaults.ll
  llvm/test/Other/new-pm-thinlto-defaults.ll
  llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
  llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
  llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
  llvm/test/Transforms/LoopVectorize/remove-switches.ll
  llvm/test/Transforms/SimplifyCFG/nomerge.ll
  llvm/test/Transforms/SimplifyCFG/remove-switches.ll

Index: llvm/test/Transforms/SimplifyCFG/remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SimplifyCFG/remove-switches.ll
@@ -0,0 +1,142 @@
+; RUN: opt < %s -simplifycfg -switch-removal-threshold=4 -S | FileCheck %s
+
+define void @unswitch(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N){
+; CHECK-LABEL: @unswitch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L4]], label [[FOR_BODY_SWITCH:%.*]], !prof !0
+; CHECK:       for.body.switch:
+; CHECK-NEXT:    [[SWITCH1:%.*]] = icmp eq i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[SWITCH1]], label [[L2:%.*]], label [[FOR_BODY_SWITCH2:%.*]], !prof !1
+; CHECK:       for.body.switch2:
+; CHECK-NEXT:    [[SWITCH3:%.*]] = icmp eq i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[SWITCH3]], label [[L3:%.*]], label [[FOR_BODY_SWITCH4:%.*]], !prof !2
+; CHECK:       for.body.switch4:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[L2]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ], !prof !0
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7, align 4
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ %0, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This test should not replace the switch statement as multiple cases have the same destination block
+define dso_local void @switch2(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N) {
+; CHECK-LABEL: @switch2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 4, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L3
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 15, i32 5, i32 10, i32 2}
+; CHECK: !0 = !{!"branch_weights", i32 5, i32 27}
+; CHECK: !1 = !{!"branch_weights", i32 10, i32 17}
+; CHECK: !2 = !{!"branch_weights", i32 2, i32 15}
Index: llvm/test/Transforms/SimplifyCFG/nomerge.ll
===================================================================
--- llvm/test/Transforms/SimplifyCFG/nomerge.ll
+++ llvm/test/Transforms/SimplifyCFG/nomerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -O1 -S | FileCheck %s
+; RUN: opt < %s -O1 -remove-switch-blocks=false -S | FileCheck %s
 
 ; The attribute nomerge prevents the 3 bar() calls from being sunk/hoisted into
 ; one inside a function. Check that there are still 3 tail calls.
Index: llvm/test/Transforms/LoopVectorize/remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/remove-switches.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS
+
+; We should not vectorize this loop since we do not have masked loads and stores
+; CHECK-REMARKS: remark: <unknown>:0:0: the cost-model indicates that vectorization is not beneficial
+define void @switch_cost(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) #0 {
+; CHECK-LABEL: @switch_cost(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NOT:   vector.body
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH5:%.*]] [
+; CHECK-NEXT:    i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @switch(i32* noalias %a, i32* noalias %b, i64 %N) {
+; CHECK-LABEL: @switch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[DOTOP:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[DOTOP]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER4]]
+; CHECK:       for.body.preheader4:
+; CHECK-NEXT:    [[I_015_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[I_015_PH]], [[FOR_BODY_PREHEADER4]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_015]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-NEXT:    [[SWITCH1:%.*]] = icmp eq i32 [[TMP10]], 2
+; CHECK-NEXT:    [[R_0_OP:%.*]] = select i1 [[SWITCH1]], i32 9, i32 16
+; CHECK-NEXT:    [[ADD4:%.*]] = select i1 [[SWITCH]], i32 7, i32 [[R_0_OP]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_015]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[ADD4]]
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+
+entry:
+  %cmp14 = icmp sgt i64 %N, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %L3
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %L3
+  %i.015 = phi i64 [ %inc, %L3 ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.015
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:                                               ; preds = %for.body
+  br label %L2
+
+L2:                                               ; preds = %for.body, %L1
+  %r.0 = phi i32 [ 12, %L1 ], [ 5, %for.body ]
+  br label %L3
+
+L3:                                               ; preds = %for.body, %L2
+  %r.1 = phi i32 [ %r.0, %L2 ], [ 3, %for.body ]
+  %add4 = add nuw nsw i32 %r.1, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i.015
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %add4
+  store i32 %mul, i32* %arrayidx5
+  %inc = add nuw nsw i64 %i.015, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 3
+; CHECK-NEXT:    [[DOTNOT8:%.*]] = icmp eq i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i32 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP4]], i32 2, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP5]], i32 2, i32 [[TMP7]]
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    br i1 [[DOTNOT8]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i32 [[TMP12]], 3
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw i32 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP17]], [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[DOTNOT]], i32 3, i32 [[TMP18]]
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[DOTNOT8]], i32 3, i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = shl nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = shl nsw i32 [[TMP23]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i32 [[TMP24]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i32 [[TMP25]], [[PREDPHI7]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ {{.*}}, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP29]], 3
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L3]], label [[FOR_BODY_SWITCH:%.*]]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops; ensure the switch statement is created again in for.body
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NOT:   vector.body:
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !2, !4, !6}
+!1 = distinct !{!1, !3, !5, !6}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 1}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve -scalable-vectorization=on -S | FileCheck %s
+
+define void @switch(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 {
+; CHECK-LABEL: @switch(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP10]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD6]]
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 4 x i32> [[PREDPHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[TMP21]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <vscale x 4 x i1> [[TMP19]], [[TMP15]]
+; CHECK-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or <vscale x 4 x i1> [[TMP24]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD9]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[TMP28]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP30]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[PREDPHI11:%.*]] = select <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]]
+; CHECK-NEXT:    [[PREDPHI12:%.*]] = select <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> [[TMP29]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[PREDPHI11]], [[PREDPHI11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <vscale x 4 x i32> [[TMP31]], [[PREDPHI12]]
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP32]], <vscale x 4 x i32>* [[TMP33]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], {{.*}}
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ {{.*}}, %for.body.preheader ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP35]], label [[FOR_BODY_SWITCH5:%.*]] [
+; CHECK-NEXT:    i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[TMP2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[TMP2]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <2 x i32> [[TMP2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> <i32 2, i32 2>, <2 x i32> [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP16]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <2 x i32> [[TMP17]], [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> <i32 3, i32 3>, <2 x i32> [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nsw <2 x i32> [[TMP22]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <2 x i32> [[TMP23]], [[TMP19]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP24]], <2 x i32>* [[TMP25]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ {{.*}}, %for.body.preheader ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP27]], 3
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L3]], label [[FOR_BODY_SWITCH:%.*]]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops; ensure the switch statement is created again in for.body.
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias %b, i32* noalias %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NOT:   vector.body
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 1}
+!2 = !{!"llvm.loop.interleave.count", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -180,6 +180,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -168,6 +168,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
Index: llvm/test/Other/new-pm-thinlto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -197,6 +197,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/test/Other/new-pm-lto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-lto-defaults.ll
+++ llvm/test/Other/new-pm-lto-defaults.ll
@@ -105,6 +105,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: LoopDeletionPass on Loop
 ; CHECK-O23SZ-NEXT: Running pass: LoopFullUnrollPass on Loop
 ; CHECK-O23SZ-NEXT: Running pass: LoopDistributePass on foo
+; CHECK-O23SZ-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo
Index: llvm/test/Other/new-pm-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-defaults.ll
+++ llvm/test/Other/new-pm-defaults.ll
@@ -216,6 +216,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -6141,6 +6141,102 @@
   return true;
 }
 
+// Replace the switch with a chain of icmp-eq + conditional branches, one per
+// case, so later passes may vectorize. Returns false if cases share a target.
+static bool TurnSmallSwitchIntoICmps(SwitchInst *SI, IRBuilder<> &Builder) {
+  assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+  // A default destination whose first non-PHI, non-debug instruction is
+  // 'unreachable' is treated as if the switch had no default at all.
+  bool HasDefault =
+      !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+
+  BasicBlock *DefaultBlock = HasDefault ? SI->getDefaultDest() : nullptr;
+  BasicBlock *BB = SI->getParent();
+
+  // Bail out unless each of the cases has a unique destination.
+  for (auto Case : SI->cases())
+    if (!SI->findCaseDest(Case.getCaseSuccessor()))
+      return false;
+
+  // Total the branch weights; used only when there are getNumCases() + 1.
+  uint64_t TotalWeight = 0;
+  SmallVector<uint64_t, 8> Weights;
+  if (HasBranchWeights(SI)) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == (SI->getNumCases() + 1))
+      for (auto W : Weights)
+        TotalWeight += W;
+  }
+
+  BasicBlock *FalseDest = nullptr;
+  uint64_t FalseWeight = TotalWeight;
+  for (auto CI : SI->cases()) {
+    BasicBlock *TrueDest = CI.getCaseSuccessor();
+    Value *Cmp =
+        Builder.CreateICmpEQ(SI->getCondition(), CI.getCaseValue(), "switch");
+
+    // From the second case on, the edge into TrueDest leaves the previous
+    // FalseDest block rather than BB, so retarget TrueDest's PHI entries.
+    if (FalseDest) {
+      for (PHINode &PN : TrueDest->phis()) {
+        for (auto PB : PN.blocks()) {
+          if (PB == BB) {
+            Value *V = PN.getIncomingValueForBlock(BB);
+            PN.removeIncomingValue(BB, false);
+            PN.addIncoming(V, FalseDest);
+          }
+        }
+      }
+    }
+
+    BasicBlock *MoveAfter = FalseDest ? FalseDest : BB;
+    FalseDest = BasicBlock::Create(BB->getContext(), BB->getName() + ".switch",
+                                 BB->getParent(), BB);
+    FalseDest->moveAfter(MoveAfter);
+
+    Instruction *I = Builder.CreateCondBr(Cmp, TrueDest, FalseDest);
+    // Update weight for the newly-created conditional branch.
+    // We set the weight of the TrueDest to the weight for the successor
+    // of the current case. The FalseDest is assigned the remaining total
+    // weight, minus the weight assigned to TrueDest.
+    if (TotalWeight) {
+      int Index = CI.getSuccessorIndex();
+      FalseWeight -= Weights[Index];
+      setBranchWeights(I, Weights[Index], FalseWeight);
+    }
+    Builder.SetInsertPoint(FalseDest);
+  }
+
+  if (DefaultBlock) {
+    Builder.CreateBr(DefaultBlock);
+
+    // DefaultBlock is now entered from the last FalseDest block instead of
+    // BB, so retarget any PHI entries in it that named BB.
+    //
+    // NOTE(review): when the default is 'unreachable' (else branch below), any
+    // PHIs in the old default block are left still listing BB, and nothing in
+    // this function updates a DomTreeUpdater -- confirm both are safe.
+    for (PHINode &PN : DefaultBlock->phis()) {
+      for (auto PB : PN.blocks()) {
+        if (PB == BB) {
+          Value *V = PN.getIncomingValueForBlock(BB);
+          PN.removeIncomingValue(BB, false);
+          PN.addIncoming(V, FalseDest);
+        }
+      }
+    }
+  } else
+    Builder.CreateUnreachable();
+
+  // Drop the switch.
+  SI->eraseFromParent();
+
+  Builder.SetInsertPoint(BB);
+
+  return true;
+}
+
 bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   BasicBlock *BB = SI->getParent();
 
@@ -6163,8 +6259,14 @@
         return requestResimplify();
   }
 
+  unsigned NumCases = SI->getNumCases();
+  bool RemoveSwitches = Options.SwitchRemovalThreshold >= NumCases;
+
+  if (RemoveSwitches && TurnSmallSwitchIntoICmps(SI, Builder))
+    return simplifyCFG(BB, TTI, DTU, Options) | true;
+
   // Try to transform the switch into an icmp and a branch.
-  if (TurnSwitchRangeIntoICmp(SI, Builder))
+  if (!RemoveSwitches && TurnSwitchRangeIntoICmp(SI, Builder))
     return requestResimplify();
 
   // Remove unreachable cases.
@@ -6412,16 +6514,18 @@
       if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
         return requestResimplify();
 
-    // This block must be empty, except for the setcond inst, if it exists.
-    // Ignore dbg and pseudo intrinsics.
-    auto I = BB->instructionsWithoutDebug(true).begin();
-    if (&*I == BI) {
-      if (FoldValueComparisonIntoPredecessors(BI, Builder))
-        return requestResimplify();
-    } else if (&*I == cast<Instruction>(BI->getCondition())) {
-      ++I;
-      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
-        return requestResimplify();
+    if (Options.SwitchRemovalThreshold == 0) {
+      // This block must be empty, except for the setcond inst, if it exists.
+      // Ignore dbg and pseudo intrinsics.
+      auto I = BB->instructionsWithoutDebug(true).begin();
+      if (&*I == BI) {
+        if (FoldValueComparisonIntoPredecessors(BI, Builder))
+          return requestResimplify();
+      } else if (&*I == cast<Instruction>(BI->getCondition())) {
+        ++I;
+        if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
+          return requestResimplify();
+      }
     }
   }
 
Index: llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -55,6 +55,11 @@
     "bonus-inst-threshold", cl::Hidden, cl::init(1),
     cl::desc("Control the number of bonus instructions (default = 1)"));
 
+static cl::opt<unsigned> UserSwitchRemovalThreshold(
+    "switch-removal-threshold", cl::Hidden, cl::init(0),
+    cl::desc("Set the threshold for the number of switch cases where we "
+             "convert switch blocks to branches and compares"));
+
 static cl::opt<bool> UserKeepLoops(
     "keep-loops", cl::Hidden, cl::init(true),
     cl::desc("Preserve canonical loop structure (default = true)"));
@@ -308,6 +313,8 @@
     Options.HoistCommonInsts = UserHoistCommonInsts;
   if (UserSinkCommonInsts.getNumOccurrences())
     Options.SinkCommonInsts = UserSinkCommonInsts;
+  if (UserSwitchRemovalThreshold.getNumOccurrences())
+    Options.SwitchRemovalThreshold = UserSwitchRemovalThreshold;
 }
 
 SimplifyCFGPass::SimplifyCFGPass() : Options() {
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -256,6 +256,17 @@
     cl::desc("Run synthetic function entry count generation "
              "pass"));
 
+static cl::opt<bool>
+    RemoveSwitchBlocks("remove-switch-blocks", cl::init(true), cl::Hidden,
+                       cl::desc("Convert switch blocks into a branch sequence "
+                                "prior to vectorization."));
+
+// Switches with more cases than this are left alone by the SimplifyCFG run
+// added before the vectorizer. Removing switch blocks and replacing them with
+// compares and branches allows architectures that support predication to
+// vectorize. Unsigned to match SimplifyCFGOptions::SwitchRemovalThreshold.
+static const unsigned RemoveSwitchCaseThreshold = 4;
+
 static const Regex DefaultAliasRegex(
     "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
 
@@ -1201,6 +1212,10 @@
 /// TODO: Should LTO cause any differences to this set of passes?
 void PassBuilder::addVectorPasses(OptimizationLevel Level,
                                   FunctionPassManager &FPM, bool IsFullLTO) {
+  if (RemoveSwitchBlocks)
+    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().switchRemovalThreshold(
+        RemoveSwitchCaseThreshold)));
+
   FPM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
Index: llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -29,6 +29,7 @@
   bool SinkCommonInsts = false;
   bool SimplifyCondBranch = true;
   bool FoldTwoEntryPHINode = true;
+  unsigned SwitchRemovalThreshold = 0;
 
   AssumptionCache *AC = nullptr;
 
@@ -70,6 +71,11 @@
     FoldTwoEntryPHINode = B;
     return *this;
   }
+
+  SimplifyCFGOptions &switchRemovalThreshold(unsigned I) {
+    SwitchRemovalThreshold = I;
+    return *this;
+  }
 };
 
 } // namespace llvm
Index: clang/test/Frontend/optimization-remark-analysis.c
===================================================================
--- clang/test/Frontend/optimization-remark-analysis.c
+++ clang/test/Frontend/optimization-remark-analysis.c
@@ -1,5 +1,5 @@
-// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
-// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s
+// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -remove-switch-blocks=false -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
+// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -remove-switch-blocks=false -emit-llvm -S %s -o - 2>&1 | FileCheck %s
 
 // RPASS: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
 // CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D108138: [SimplifyCFG] Remove switch statements before vectorization

Reply via email to