From 80afe56609fc5b0dce62a6760dff602b86841424 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Mon, 30 Jun 2025 16:00:08 +0000 Subject: [PATCH 1/9] [flang][OpenMP] Add -f[no]-openmp-simd Both clang and gfortran support the -fopenmp-simd flag, which enables OpenMP support only for simd constructs, while disabling the rest of OpenMP. Add a new SimdOnly flang OpenMP IR pass which rewrites generated OpenMP FIR to remove all constructs except for omp.simd constructs, and constructs nested under them. With this approach, the logic required to make the flag work can be self-contained within the pass, as opposed to being scattered all over the lowering code. The flag is expected to have no effect if -fopenmp is passed explicitly, and is only expected to remove OpenMP constructs, not things like OpenMP library functions calls. This matches the behaviour of other compilers. Signed-off-by: Kajetan Puchalski --- clang/include/clang/Driver/Options.td | 15 +- clang/lib/Driver/ToolChains/Flang.cpp | 5 + .../include/flang/Optimizer/OpenMP/Passes.td | 5 + .../flang/Optimizer/Passes/Pipelines.h | 5 +- .../flang/Optimizer/Transforms/Utils.h | 4 + flang/include/flang/Support/LangOptions.def | 2 + flang/include/flang/Tools/CrossToolHelpers.h | 1 + flang/lib/Frontend/CompilerInvocation.cpp | 11 +- flang/lib/Frontend/FrontendActions.cpp | 25 +- flang/lib/Lower/OpenMP/ClauseProcessor.h | 11 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 54 +- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + flang/lib/Optimizer/OpenMP/SimdOnly.cpp | 360 ++++++++++ flang/lib/Optimizer/Passes/Pipelines.cpp | 14 +- .../Transforms/ControlFlowConverter.cpp | 206 +++--- flang/test/Driver/fopenmp-simd.f90 | 59 ++ flang/test/Transforms/OpenMP/simd-only.mlir | 622 ++++++++++++++++++ flang/tools/bbc/bbc.cpp | 4 +- 18 files changed, 1263 insertions(+), 141 deletions(-) create mode 100644 flang/lib/Optimizer/OpenMP/SimdOnly.cpp create mode 100644 flang/test/Driver/fopenmp-simd.f90 create mode 100644 
flang/test/Transforms/OpenMP/simd-only.mlir diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 6aab43c9ed57f..8a2bf40cc6a32 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3731,14 +3731,19 @@ def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">, def fnoopenmp_relocatable_target : Flag<["-"], "fnoopenmp-relocatable-target">, Group, Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>; -def fopenmp_simd : Flag<["-"], "fopenmp-simd">, Group, - Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>, - HelpText<"Emit OpenMP code only for SIMD-based constructs.">; +def fopenmp_simd : Flag<["-"], "fopenmp-simd">, + Group, + Flags<[NoArgumentUnused]>, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, + HelpText<"Emit OpenMP code only for SIMD-based constructs.">; def fopenmp_enable_irbuilder : Flag<["-"], "fopenmp-enable-irbuilder">, Group, Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Use the experimental OpenMP-IR-Builder codegen path.">; -def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group, - Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>; +def fno_openmp_simd + : Flag<["-"], "fno-openmp-simd">, + Group, + Flags<[NoArgumentUnused]>, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fopenmp_cuda_mode : Flag<["-"], "fopenmp-cuda-mode">, Group, Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>; def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group, diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 7ab41e9b85a04..547e3156f519a 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -937,6 +937,8 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, if 
(Args.hasArg(options::OPT_fopenmp_force_usm)) CmdArgs.push_back("-fopenmp-force-usm"); + Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd, + options::OPT_fno_openmp_simd); // FIXME: Clang supports a whole bunch more flags here. break; @@ -952,6 +954,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, << A->getSpelling() << A->getValue(); break; } + } else { + Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd, + options::OPT_fno_openmp_simd); } // Pass the path to compiler resource files. diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 704faf0ccd856..79c1a5cfd9aca 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -112,4 +112,9 @@ def GenericLoopConversionPass ]; } +def SimdOnlyPass : Pass<"omp-simd-only", "mlir::func::FuncOp"> { + let summary = "Filters out non-simd OpenMP constructs"; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index a3f59ee8dd013..fd8c43cc88a19 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -119,13 +119,16 @@ void registerDefaultInlinerPass(MLIRToLLVMPassPipelineConfig &config); void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig &pc); +/// Select which mode to enable OpenMP support in. 
+enum class EnableOpenMP { None, Simd, Full }; + /// Create a pass pipeline for lowering from HLFIR to FIR /// /// \param pm - MLIR pass manager that will hold the pipeline definition /// \param optLevel - optimization level used for creating FIR optimization /// passes pipeline void createHLFIRToFIRPassPipeline( - mlir::PassManager &pm, bool enableOpenMP, + mlir::PassManager &pm, EnableOpenMP enableOpenMP, llvm::OptimizationLevel optLevel = defaultOptLevel); struct OpenMPFIRPassPipelineOpts { diff --git a/flang/include/flang/Optimizer/Transforms/Utils.h b/flang/include/flang/Optimizer/Transforms/Utils.h index 49a616fb40fd5..307e6b59c57d4 100644 --- a/flang/include/flang/Optimizer/Transforms/Utils.h +++ b/flang/include/flang/Optimizer/Transforms/Utils.h @@ -33,6 +33,10 @@ void genMinMaxlocReductionLoop(fir::FirOpBuilder &builder, mlir::Value array, mlir::Type maskElemType, mlir::Value resultArr, bool maskMayBeLogicalScalar); +std::pair +convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, bool setNSW, + bool forceLoopToExecuteOnce); + } // namespace fir #endif // FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def index d5bf7a2ecc036..ba72d7b4b7212 100644 --- a/flang/include/flang/Support/LangOptions.def +++ b/flang/include/flang/Support/LangOptions.def @@ -58,6 +58,8 @@ LANGOPT(OpenMPTeamSubscription, 1, 0) LANGOPT(OpenMPNoThreadState, 1, 0) /// Assume that no thread in a parallel region will encounter a parallel region LANGOPT(OpenMPNoNestedParallelism, 1, 0) +/// Use SIMD only OpenMP support. 
+LANGOPT(OpenMPSimd, 1, false) LANGOPT(VScaleMin, 32, 0) ///< Minimum vscale range value LANGOPT(VScaleMax, 32, 0) ///< Maximum vscale range value diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index df1da27058552..51958fa36c3ad 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -134,6 +134,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { ///< functions. bool NSWOnLoopVarInc = true; ///< Add nsw flag to loop variable increments. bool EnableOpenMP = false; ///< Enable OpenMP lowering. + bool EnableOpenMPSimd = false; ///< Enable OpenMP simd-only mode. std::string InstrumentFunctionEntry = ""; ///< Name of the instrument-function that is called on each ///< function-entry diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 111c5aa48726f..708fb7f0b82ee 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1162,8 +1162,15 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, clang::driver::options::OPT_fno_openmp); - if (!arg || arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) - return true; + if (!arg || + arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) { + bool isSimdSpecified = args.hasFlag( + clang::driver::options::OPT_fopenmp_simd, + clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false); + if (!isSimdSpecified) + return true; + res.getLangOpts().OpenMPSimd = 1; + } unsigned numErrorsBefore = diags.getNumErrors(); llvm::Triple t(res.getTargetOpts().triple); diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 5c66ecf3043cd..3bef6b1c31825 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ 
b/flang/lib/Frontend/FrontendActions.cpp @@ -298,6 +298,7 @@ bool CodeGenAction::beginSourceFileAction() { bool isOpenMPEnabled = ci.getInvocation().getFrontendOpts().features.IsEnabled( Fortran::common::LanguageFeature::OpenMP); + bool isOpenMPSimd = ci.getInvocation().getLangOpts().OpenMPSimd; fir::OpenMPFIRPassPipelineOpts opts; @@ -329,12 +330,13 @@ bool CodeGenAction::beginSourceFileAction() { if (auto offloadMod = llvm::dyn_cast( mlirModule->getOperation())) opts.isTargetDevice = offloadMod.getIsTargetDevice(); + } - // WARNING: This pipeline must be run immediately after the lowering to - // ensure that the FIR is correct with respect to OpenMP operations/ - // attributes. + // WARNING: This pipeline must be run immediately after the lowering to + // ensure that the FIR is correct with respect to OpenMP operations/ + // attributes. + if (isOpenMPEnabled || isOpenMPSimd) fir::createOpenMPFIRPassPipeline(pm, opts); - } pm.enableVerifier(/*verifyPasses=*/true); pm.addPass(std::make_unique()); @@ -617,12 +619,14 @@ void CodeGenAction::lowerHLFIRToFIR() { pm.addPass(std::make_unique()); pm.enableVerifier(/*verifyPasses=*/true); + fir::EnableOpenMP enableOpenMP = fir::EnableOpenMP::None; + if (ci.getInvocation().getFrontendOpts().features.IsEnabled( + Fortran::common::LanguageFeature::OpenMP)) + enableOpenMP = fir::EnableOpenMP::Full; + if (ci.getInvocation().getLangOpts().OpenMPSimd) + enableOpenMP = fir::EnableOpenMP::Simd; // Create the pass pipeline - fir::createHLFIRToFIRPassPipeline( - pm, - ci.getInvocation().getFrontendOpts().features.IsEnabled( - Fortran::common::LanguageFeature::OpenMP), - level); + fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, level); (void)mlir::applyPassManagerCLOptions(pm); mlir::TimingScope timingScopeMLIRPasses = timingScopeRoot.nest( @@ -748,6 +752,9 @@ void CodeGenAction::generateLLVMIR() { Fortran::common::LanguageFeature::OpenMP)) config.EnableOpenMP = true; + if (ci.getInvocation().getLangOpts().OpenMPSimd) + 
config.EnableOpenMPSimd = true; + if (ci.getInvocation().getLoweringOpts().getIntegerWrapAround()) config.NSWOnLoopVarInc = false; diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 7f894afc1ab37..a197666f26b2c 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -208,11 +208,12 @@ void ClauseProcessor::processTODO(mlir::Location currentLocation, if (!x) return; unsigned version = semaCtx.langOptions().OpenMPVersion; - TODO(currentLocation, - "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() + - " in " + - llvm::omp::getOpenMPDirectiveName(directive, version).upper() + - " construct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(currentLocation, + "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() + + " in " + + llvm::omp::getOpenMPDirectiveName(directive, version).upper() + + " construct"); }; for (ClauseIterator it = clauses.begin(); it != clauses.end(); ++it) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index ae60432afccd0..fef64ccc15015 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2262,7 +2262,8 @@ genOrderedOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { - TODO(loc, "OMPD_ordered"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(loc, "OMPD_ordered"); return nullptr; } @@ -2449,7 +2450,8 @@ genScopeOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { - TODO(loc, "Scope construct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(loc, "Scope construct"); return nullptr; } @@ -3276,7 +3278,8 @@ static 
mlir::omp::TaskloopOp genCompositeTaskloopSimd( lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs"); - TODO(loc, "Composite TASKLOOP SIMD"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(loc, "Composite TASKLOOP SIMD"); return nullptr; } @@ -3448,8 +3451,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter, break; case llvm::omp::Directive::OMPD_tile: { unsigned version = semaCtx.langOptions().OpenMPVersion; - TODO(loc, "Unhandled loop directive (" + - llvm::omp::getOpenMPDirectiveName(dir, version) + ")"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(loc, "Unhandled loop directive (" + + llvm::omp::getOpenMPDirectiveName(dir, version) + ")"); + break; } case llvm::omp::Directive::OMPD_unroll: genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); @@ -3484,35 +3489,40 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclarativeAssumes &assumesConstruct) { - TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OmpDeclareVariantDirective &declareVariantDirective) { - 
TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective"); } static void genOMP( lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclareReductionConstruct &declareReductionConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclareSimdConstruct &declareSimdConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct"); } static void @@ -3706,14 +3716,16 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, (void)objects; (void)clauses; - TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPInteropConstruct &interopConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPInteropConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPInteropConstruct"); } static void @@ -3729,7 +3741,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPAllocatorsConstruct &allocsConstruct) { - TODO(converter.getCurrentLocation(), 
"OpenMPAllocatorsConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct"); } //===----------------------------------------------------------------------===// @@ -3795,7 +3808,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, !std::holds_alternative(clause.u)) { std::string name = parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id)); - TODO(clauseLocation, name + " clause is not implemented yet"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(clauseLocation, name + " clause is not implemented yet"); } } @@ -3811,7 +3825,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPAssumeConstruct &assumeConstruct) { mlir::Location clauseLocation = converter.genLocation(assumeConstruct.source); - TODO(clauseLocation, "OpenMP ASSUME construct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(clauseLocation, "OpenMP ASSUME construct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3847,21 +3862,24 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &) { - TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDispatchConstruct &) { - TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, 
lower::pft::Evaluation &eval, const parser::OpenMPExecutableAllocate &execAllocConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); + if (!semaCtx.langOptions().OpenMPSimd) + TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index e31543328a9f9..3fb0bac05ce0d 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -9,6 +9,7 @@ add_flang_library(FlangOpenMPTransforms MarkDeclareTarget.cpp LowerWorkshare.cpp LowerNontemporal.cpp + SimdOnly.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp new file mode 100644 index 0000000000000..b4c97df767e65 --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp @@ -0,0 +1,360 @@ +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Transforms/Utils.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include +#include +#include +#include +#include + +namespace flangomp { +#define GEN_PASS_DEF_SIMDONLYPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp + +namespace { + +#define DEBUG_TYPE "omp-simd-only-pass" + +class SimdOnlyConversionPattern : public mlir::RewritePattern { +public: + SimdOnlyConversionPattern(mlir::MLIRContext *ctx) + : mlir::RewritePattern(MatchAnyOpTypeTag{}, 1, ctx) {} + + mlir::LogicalResult + matchAndRewrite(mlir::Operation *op, + mlir::PatternRewriter &rewriter) const override { + if (op->getDialect()->getNamespace() != + 
mlir::omp::OpenMPDialect::getDialectNamespace()) + return rewriter.notifyMatchFailure(op, "Not an OpenMP op"); + + if (auto simdOp = mlir::dyn_cast(op)) { + // Remove the composite attr given that the op will no longer be composite + if (simdOp.isComposite()) { + simdOp.setComposite(false); + return mlir::success(); + } + + return rewriter.notifyMatchFailure(op, "Op is a plain SimdOp"); + } + + if (op->getParentOfType()) + return rewriter.notifyMatchFailure(op, "Op is nested under a SimdOp"); + + if (!mlir::isa(op->getParentOp()) && + (mlir::isa(op) || + mlir::isa(op))) + return rewriter.notifyMatchFailure(op, + "Non top-level yield or terminator"); + + // SectionOp overrides its BlockArgInterface based on the parent SectionsOp. + // We need to make sure we only rewrite omp.sections once all omp.section + // ops inside it have been rewritten, otherwise the individual omp.section + // ops will not be able to access their argument values. + if (auto sectionsOp = mlir::dyn_cast(op)) { + for (auto &opInSections : sectionsOp.getRegion().getOps()) + if (mlir::isa(opInSections)) + return rewriter.notifyMatchFailure( + op, "SectionsOp still contains individual sections"); + } + + LLVM_DEBUG(llvm::dbgs() << "SimdOnlyPass matched OpenMP op:\n"); + LLVM_DEBUG(op->dump()); + + // Erase ops that don't need any special handling + if (mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op)) { + rewriter.eraseOp(op); + return mlir::success(); + } + + fir::FirOpBuilder builder(rewriter, op); + mlir::Location loc = op->getLoc(); + + auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool { + if (!ompOp) + return false; + + llvm::SmallVector> + blockArgsPairs; + if (auto iface = + mlir::dyn_cast(op)) { + iface.getBlockArgsPairs(blockArgsPairs); + for (auto [value, argument] : blockArgsPairs) + 
rewriter.replaceAllUsesWith(argument, value); + } + + if (ompOp->getRegion(0).getBlocks().size() == 1) { + auto &block = *ompOp->getRegion(0).getBlocks().begin(); + // This block is about to be removed so any arguments should have been + // replaced by now. + block.eraseArguments(0, block.getNumArguments()); + if (auto terminatorOp = + mlir::dyn_cast(block.back())) { + rewriter.eraseOp(terminatorOp); + } + rewriter.inlineBlockBefore(&block, op, {}); + } else { + // When dealing with multi-block regions we need to fix up the control + // flow + auto *origBlock = ompOp->getBlock(); + auto *newBlock = rewriter.splitBlock(origBlock, ompOp->getIterator()); + auto *innerFrontBlock = &ompOp->getRegion(0).getBlocks().front(); + builder.setInsertionPointToEnd(origBlock); + builder.create(loc, innerFrontBlock); + // We are no longer passing any arguments to the first block in the + // region, so this should be safe to erase. + innerFrontBlock->eraseArguments(0, innerFrontBlock->getNumArguments()); + + for (auto &innerBlock : ompOp->getRegion(0).getBlocks()) { + // Remove now-unused block arguments + for (auto arg : innerBlock.getArguments()) { + if (arg.getUses().empty()) + innerBlock.eraseArgument(arg.getArgNumber()); + } + if (auto terminatorOp = + mlir::dyn_cast(innerBlock.back())) { + builder.setInsertionPointToEnd(&innerBlock); + builder.create(loc, newBlock); + rewriter.eraseOp(terminatorOp); + } + } + + rewriter.inlineRegionBefore(ompOp->getRegion(0), newBlock); + } + + rewriter.eraseOp(op); + return true; + }; + + if (auto ompOp = mlir::dyn_cast(op)) { + mlir::Type indexType = builder.getIndexType(); + mlir::Type oldIndexType = ompOp.getIVs().begin()->getType(); + builder.setInsertionPoint(op); + auto one = builder.create(loc, 1); + + // Generate the new loop nest + mlir::Block *nestBody = nullptr; + fir::DoLoopOp outerLoop = nullptr; + llvm::SmallVector loopIndArgs; + for (auto extent : ompOp.getLoopUpperBounds()) { + auto ub = builder.createConvert(loc, indexType, 
extent); + auto doLoop = builder.create(loc, one, ub, one, false); + nestBody = doLoop.getBody(); + builder.setInsertionPointToStart(nestBody); + // Convert the indices to the type used inside the loop if needed + if (oldIndexType != indexType) { + auto convertedIndVar = builder.createConvert( + loc, oldIndexType, doLoop.getInductionVar()); + loopIndArgs.push_back(convertedIndVar); + } else { + loopIndArgs.push_back(doLoop.getInductionVar()); + } + if (!outerLoop) + outerLoop = doLoop; + } + + // Move the omp loop body into the new loop body + if (ompOp->getRegion(0).getBlocks().size() == 1) { + auto &block = *ompOp->getRegion(0).getBlocks().begin(); + rewriter.mergeBlocks(&block, nestBody, loopIndArgs); + + // Find the new loop block terminator and move it before the end of the + // block + for (auto &loopBodyOp : nestBody->getOperations()) { + if (auto resultOp = mlir::dyn_cast(loopBodyOp)) { + rewriter.moveOpBefore(resultOp.getOperation(), &nestBody->back()); + break; + } + } + + // Remove omp.yield at the end of the loop body + if (auto yieldOp = mlir::dyn_cast(nestBody->back())) + rewriter.eraseOp(yieldOp); + // DoLoopOp does not support multi-block regions, thus if we're dealing + // with multiple blocks we need to convert it into basic control-flow + // operations. + } else { + rewriter.inlineRegionBefore(ompOp->getRegion(0), nestBody); + auto indVarArg = outerLoop->getRegion(0).front().getArgument(0); + // fir::convertDoLoopToCFG expects the induction variable to be of type + // index while the OpenMP LoopNestOp can have indices of different + // types. We need to work around it. 
+ if (indVarArg.getType() != indexType) + indVarArg.setType(indexType); + + auto loopBlocks = + fir::convertDoLoopToCFG(outerLoop, rewriter, false, false); + auto *conditionalBlock = loopBlocks.first; + auto *firstBlock = + conditionalBlock->getNextNode(); // Start of the loop body + auto *lastBlock = loopBlocks.second; // Incrementing induction variables + + // If the induction variable is used within the loop and was originally + // not of type index, then we need to add a convert to the original type + // and replace its uses inside the loop body. + if (oldIndexType != indexType) { + indVarArg = conditionalBlock->getArgument(0); + builder.setInsertionPointToStart(firstBlock); + auto convertedIndVar = + builder.createConvert(loc, oldIndexType, indVarArg); + rewriter.replaceUsesWithIf( + indVarArg, convertedIndVar, [&](auto &use) -> bool { + return use.getOwner() != convertedIndVar.getDefiningOp() && + use.getOwner()->getBlock() != lastBlock; + }); + } + + // There might be an unused convert and an unused argument to the block. + // If so, remove them. + if (lastBlock->front().getUses().empty()) + lastBlock->front().erase(); + for (auto arg : lastBlock->getArguments()) { + if (arg.getUses().empty()) + lastBlock->eraseArgument(arg.getArgNumber()); + } + + // Any loop blocks that end in omp.yield should just branch to + // lastBlock. 
+ for (auto *loopBlock = conditionalBlock; loopBlock != lastBlock; + loopBlock = loopBlock->getNextNode()) { + if (auto yieldOp = + mlir::dyn_cast(loopBlock->back())) { + builder.setInsertionPointToEnd(loopBlock); + builder.create(loc, lastBlock); + rewriter.eraseOp(yieldOp); + } + } + } + + rewriter.eraseOp(ompOp); + return mlir::success(); + } + + if (auto mapInfoOp = mlir::dyn_cast(op)) { + mapInfoOp.getResult().replaceAllUsesWith(mapInfoOp.getVarPtr()); + rewriter.eraseOp(mapInfoOp); + return mlir::success(); + } + + if (auto atomicReadOp = mlir::dyn_cast(op)) { + builder.setInsertionPoint(op); + auto loadOp = builder.create(loc, atomicReadOp.getX()); + auto storeOp = builder.create(loc, loadOp.getResult(), + atomicReadOp.getV()); + rewriter.replaceOp(op, storeOp); + return mlir::success(); + } + + if (auto atomicWriteOp = mlir::dyn_cast(op)) { + auto storeOp = builder.create(loc, atomicWriteOp.getExpr(), + atomicWriteOp.getX()); + rewriter.replaceOp(op, storeOp); + return mlir::success(); + } + + if (auto atomicUpdateOp = mlir::dyn_cast(op)) { + assert("one block in region" && + atomicUpdateOp.getRegion().getBlocks().size() == 1); + auto &block = *atomicUpdateOp.getRegion().getBlocks().begin(); + builder.setInsertionPointToStart(&block); + + // Load the update `x` operand and replace its uses within the block + auto loadOp = builder.create(loc, atomicUpdateOp.getX()); + rewriter.replaceUsesWithIf( + block.getArgument(0), loadOp.getResult(), + [&](auto &op) { return op.get().getParentBlock() == █ }); + + // Store the result back into `x` in line with omp.yield semantics for + // this op + auto yieldOp = mlir::cast(block.back()); + assert("only one yield operand" && yieldOp->getNumOperands() == 1); + builder.setInsertionPointAfter(yieldOp); + builder.create(loc, yieldOp->getOperand(0), + atomicUpdateOp.getX()); + rewriter.eraseOp(yieldOp); + + // Inline the final block and remove the now-empty op + assert("only one block argument" && block.getNumArguments() == 
1); + block.eraseArguments(0, block.getNumArguments()); + rewriter.inlineBlockBefore(&block, op, {}); + rewriter.eraseOp(op); + return mlir::success(); + } + + if (auto threadPrivateOp = mlir::dyn_cast(op)) { + threadPrivateOp.getTlsAddr().replaceAllUsesWith( + threadPrivateOp.getSymAddr()); + rewriter.eraseOp(threadPrivateOp); + return mlir::success(); + } + + if (inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op))) + return mlir::success(); + + op->emitOpError("OpenMP operation left unhandled after SimdOnly pass."); + return mlir::failure(); + } +}; + +class SimdOnlyPass : public flangomp::impl::SimdOnlyPassBase { + +public: + SimdOnlyPass() = default; + + void runOnOperation() override { + mlir::func::FuncOp func = getOperation(); + + if (func.isDeclaration()) + return; + + mlir::MLIRContext *context = &getContext(); + mlir::RewritePatternSet patterns(context); + patterns.insert(context); + + mlir::GreedyRewriteConfig config; + // Prevent the pattern driver from merging blocks. 
+ config.setRegionSimplificationLevel( + mlir::GreedySimplifyRegionLevel::Disabled); + + if (mlir::failed( + mlir::applyPatternsGreedily(func, std::move(patterns), config))) { + mlir::emitError(func.getLoc(), "error in simd-only conversion pass"); + signalPassFailure(); + } + } +}; + +} // namespace diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index ca8e820608688..5a870928f8413 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -242,7 +242,8 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm, /// \param pm - MLIR pass manager that will hold the pipeline definition /// \param optLevel - optimization level used for creating FIR optimization /// passes pipeline -void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, +void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, + EnableOpenMP enableOpenMP, llvm::OptimizationLevel optLevel) { if (optLevel.isOptimizingForSpeed()) { addCanonicalizerPassWithoutRegionSimplification(pm); @@ -294,8 +295,10 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, addNestedPassToAllTopLevelOperations( pm, hlfir::createInlineHLFIRAssign); pm.addPass(hlfir::createConvertHLFIRtoFIR()); - if (enableOpenMP) + if (enableOpenMP != EnableOpenMP::None) pm.addPass(flangomp::createLowerWorkshare()); + if (enableOpenMP == EnableOpenMP::Simd) + pm.addPass(flangomp::createSimdOnlyPass()); } /// Create a pass pipeline for handling certain OpenMP transformations needed @@ -396,7 +399,12 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, void createMLIRToLLVMPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig &config, llvm::StringRef inputFilename) { - fir::createHLFIRToFIRPassPipeline(pm, config.EnableOpenMP, config.OptLevel); + fir::EnableOpenMP enableOpenMP = fir::EnableOpenMP::None; + if (config.EnableOpenMP) + enableOpenMP = fir::EnableOpenMP::Full; + 
if (config.EnableOpenMPSimd) + enableOpenMP = fir::EnableOpenMP::Simd; + fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, config.OptLevel); // Add default optimizer pass pipeline. fir::createDefaultFIROptimizerPassPipeline(pm, config); diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp index e466aed753e63..4bcf7d857c7b0 100644 --- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp +++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Optimizer/Support/TypeCode.h" #include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/derived-api.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" @@ -31,6 +32,113 @@ namespace fir { using namespace fir; using namespace mlir; +// Extracted here for use in other passes + +/// Convert fir::DoLoopOp to control-flow operations +std::pair +fir::convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, + bool setNSW, bool forceLoopToExecuteOnce) { + auto loc = loop.getLoc(); + mlir::arith::IntegerOverflowFlags flags{}; + if (setNSW) + flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw); + auto iofAttr = + mlir::arith::IntegerOverflowFlagsAttr::get(rewriter.getContext(), flags); + + // Create the start and end blocks that will wrap the DoLoopOp with an + // initalizer and an end point + auto *initBlock = rewriter.getInsertionBlock(); + auto initPos = rewriter.getInsertionPoint(); + auto *endBlock = rewriter.splitBlock(initBlock, initPos); + + // Split the first DoLoopOp block in two parts. The part before will be the + // conditional block since it already has the induction variable and + // loop-carried values as arguments. 
+ auto *conditionalBlock = &loop.getRegion().front(); + conditionalBlock->addArgument(rewriter.getIndexType(), loc); + auto *firstBlock = + rewriter.splitBlock(conditionalBlock, conditionalBlock->begin()); + auto *lastBlock = &loop.getRegion().back(); + + // Move the blocks from the DoLoopOp between initBlock and endBlock + rewriter.inlineRegionBefore(loop.getRegion(), endBlock); + + // Get loop values from the DoLoopOp + auto low = loop.getLowerBound(); + auto high = loop.getUpperBound(); + assert(low && high && "must be a Value"); + auto step = loop.getStep(); + + // Initalization block + rewriter.setInsertionPointToEnd(initBlock); + auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); + auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); + mlir::Value iters = + mlir::arith::DivSIOp::create(rewriter, loc, distance, step); + + if (forceLoopToExecuteOnce) { + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto cond = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sle, iters, zero); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); + } + + llvm::SmallVector loopOperands; + loopOperands.push_back(low); + auto operands = loop.getIterOperands(); + loopOperands.append(operands.begin(), operands.end()); + loopOperands.push_back(iters); + + mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); + + // Last loop block + auto *terminator = lastBlock->getTerminator(); + rewriter.setInsertionPointToEnd(lastBlock); + auto iv = conditionalBlock->getArgument(0); + mlir::Value steppedIndex = + mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); + assert(steppedIndex && "must be a Value"); + auto lastArg = conditionalBlock->getNumArguments() - 1; + auto itersLeft = conditionalBlock->getArgument(lastArg); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + 
mlir::Value itersMinusOne = + mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); + + llvm::SmallVector loopCarried; + loopCarried.push_back(steppedIndex); + auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin()) + : terminator->operand_begin(); + loopCarried.append(begin, terminator->operand_end()); + loopCarried.push_back(itersMinusOne); + auto backEdge = + mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopCarried); + rewriter.eraseOp(terminator); + + // Copy loop annotations from the do loop to the loop back edge. + if (auto ann = loop.getLoopAnnotation()) + backEdge->setAttr("loop_annotation", *ann); + + // Conditional block + rewriter.setInsertionPointToEnd(conditionalBlock); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto comparison = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero); + + mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, + llvm::ArrayRef(), endBlock, + llvm::ArrayRef()); + + // The result of the loop operation is the values of the condition block + // arguments except the induction variable on the last iteration. + auto args = loop.getFinalValue() + ? conditionalBlock->getArguments() + : conditionalBlock->getArguments().drop_front(); + rewriter.replaceOp(loop, args.drop_back()); + + return std::make_pair(conditionalBlock, lastBlock); +} + namespace { // Conversion of fir control ops to more primitive control-flow. 
@@ -50,103 +158,7 @@ class CfgLoopConv : public mlir::OpRewritePattern { llvm::LogicalResult matchAndRewrite(DoLoopOp loop, mlir::PatternRewriter &rewriter) const override { - auto loc = loop.getLoc(); - mlir::arith::IntegerOverflowFlags flags{}; - if (setNSW) - flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw); - auto iofAttr = mlir::arith::IntegerOverflowFlagsAttr::get( - rewriter.getContext(), flags); - - // Create the start and end blocks that will wrap the DoLoopOp with an - // initalizer and an end point - auto *initBlock = rewriter.getInsertionBlock(); - auto initPos = rewriter.getInsertionPoint(); - auto *endBlock = rewriter.splitBlock(initBlock, initPos); - - // Split the first DoLoopOp block in two parts. The part before will be the - // conditional block since it already has the induction variable and - // loop-carried values as arguments. - auto *conditionalBlock = &loop.getRegion().front(); - conditionalBlock->addArgument(rewriter.getIndexType(), loc); - auto *firstBlock = - rewriter.splitBlock(conditionalBlock, conditionalBlock->begin()); - auto *lastBlock = &loop.getRegion().back(); - - // Move the blocks from the DoLoopOp between initBlock and endBlock - rewriter.inlineRegionBefore(loop.getRegion(), endBlock); - - // Get loop values from the DoLoopOp - auto low = loop.getLowerBound(); - auto high = loop.getUpperBound(); - assert(low && high && "must be a Value"); - auto step = loop.getStep(); - - // Initalization block - rewriter.setInsertionPointToEnd(initBlock); - auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); - auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); - mlir::Value iters = - mlir::arith::DivSIOp::create(rewriter, loc, distance, step); - - if (forceLoopToExecuteOnce) { - auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); - auto cond = mlir::arith::CmpIOp::create( - rewriter, loc, arith::CmpIPredicate::sle, iters, zero); - auto one = 
mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); - } - - llvm::SmallVector loopOperands; - loopOperands.push_back(low); - auto operands = loop.getIterOperands(); - loopOperands.append(operands.begin(), operands.end()); - loopOperands.push_back(iters); - - mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); - - // Last loop block - auto *terminator = lastBlock->getTerminator(); - rewriter.setInsertionPointToEnd(lastBlock); - auto iv = conditionalBlock->getArgument(0); - mlir::Value steppedIndex = - mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); - assert(steppedIndex && "must be a Value"); - auto lastArg = conditionalBlock->getNumArguments() - 1; - auto itersLeft = conditionalBlock->getArgument(lastArg); - auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - mlir::Value itersMinusOne = - mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); - - llvm::SmallVector loopCarried; - loopCarried.push_back(steppedIndex); - auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin()) - : terminator->operand_begin(); - loopCarried.append(begin, terminator->operand_end()); - loopCarried.push_back(itersMinusOne); - auto backEdge = mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, - loopCarried); - rewriter.eraseOp(terminator); - - // Copy loop annotations from the do loop to the loop back edge. 
- if (auto ann = loop.getLoopAnnotation()) - backEdge->setAttr("loop_annotation", *ann); - - // Conditional block - rewriter.setInsertionPointToEnd(conditionalBlock); - auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); - auto comparison = mlir::arith::CmpIOp::create( - rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero); - - mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, - llvm::ArrayRef(), endBlock, - llvm::ArrayRef()); - - // The result of the loop operation is the values of the condition block - // arguments except the induction variable on the last iteration. - auto args = loop.getFinalValue() - ? conditionalBlock->getArguments() - : conditionalBlock->getArguments().drop_front(); - rewriter.replaceOp(loop, args.drop_back()); + convertDoLoopToCFG(loop, rewriter, setNSW, forceLoopToExecuteOnce); return success(); } diff --git a/flang/test/Driver/fopenmp-simd.f90 b/flang/test/Driver/fopenmp-simd.f90 new file mode 100644 index 0000000000000..b25adee2779ee --- /dev/null +++ b/flang/test/Driver/fopenmp-simd.f90 @@ -0,0 +1,59 @@ +! RUN: %flang -target x86_64-linux-gnu -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY +! RUN: %flang -target x86_64-darwin -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY +! RUN: %flang -target x86_64-freebsd -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY +! RUN: %flang -target x86_64-windows-gnu -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY + +! CHECK-OPENMP-SIMD-FLAG: "-fopenmp-simd" +! CHECK-NO-LD-ANY-NOT: "-l{{(omp|gomp|iomp5)}}" + +! -fopenmp-simd enables openmp support only for simd constructs +! RUN: %flang_fc1 -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP-SIMD %s +! 
RUN: %flang_fc1 -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-NO-OMP-SIMD %s +! RUN: %flang_fc1 -fopenmp-simd -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-NO-OMP-SIMD %s +! RUN: %flang_fc1 -fno-openmp-simd -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP-SIMD %s +! -fopenmp-simd should have no effect if -fopenmp is already set +! RUN: %flang_fc1 -fopenmp %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s +! RUN: %flang_fc1 -fopenmp -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s +! RUN: %flang_fc1 -fopenmp -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s + +subroutine main + ! CHECK-OMP-SIMD-NOT: omp.parallel + ! CHECK-OMP-SIMD-NOT: omp.wsloop + ! CHECK-OMP-SIMD-NOT: omp.loop_nest + ! CHECK-OMP-SIMD: fir.do_loop + ! CHECK-NO-OMP-SIMD-NOT: omp.parallel + ! CHECK-NO-OMP-SIMD-NOT: omp.wsloop + ! CHECK-NO-OMP-SIMD-NOT: omp.loop_nest + ! CHECK-NO-OMP-SIMD: fir.do_loop + ! CHECK-OMP: omp.parallel + ! CHECK-OMP: omp.wsloop + ! CHECK-OMP: omp.loop_nest + ! CHECK-OMP-NOT: fir.do_loop + !$omp parallel do + do i = 1, 10 + print *, "test" + end do + ! CHECK-NO-OMP-SIMD-NOT: omp.yield + ! CHECK-NO-OMP-SIMD-NOT: omp.terminator + ! CHECK-OMP-SIMD-NOT: omp.yield + ! CHECK-OMP-SIMD-NOT: omp.terminator + ! CHECK-OMP: omp.yield + ! CHECK-OMP: omp.terminator + !$omp end parallel do + + ! CHECK-OMP-SIMD: omp.simd + ! CHECK-NO-OMP-SIMD-NOT: omp.simd + ! CHECK-OMP: omp.simd + !$omp simd + ! CHECK-OMP-SIMD: omp.loop_nest + ! CHECK-NO-OMP-SIMD-NOT: omp.loop_nest + ! CHECK-NO-OMP-SIMD: fir.do_loop + ! CHECK-OMP: omp.loop_nest + ! CHECK-OMP-NOT: fir.do_loop + do i = 1, 10 + print *, "test" + ! CHECK-OMP-SIMD: omp.yield + ! CHECK-NO-OMP-SIMD-NOT: omp.yield + ! 
CHECK-OMP: omp.yield + end do +end subroutine diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir new file mode 100644 index 0000000000000..c3efce13c4414 --- /dev/null +++ b/flang/test/Transforms/OpenMP/simd-only.mlir @@ -0,0 +1,622 @@ +// RUN: fir-opt --split-input-file --omp-simd-only %s | FileCheck %s + +// Check that simd operations are not removed and rewritten, but all the other OpenMP ops are. + +// CHECK-LABEL: func.func @simd +omp.private {type = private} @_QFEi_private_i32 : i32 +func.func @simd(%arg0: i32, %arg1: !fir.ref, %arg2: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + %c100000_i32 = arith.constant 100000 : i32 + // CHECK: omp.simd private + omp.simd private(@_QFEi_private_i32 %arg2 -> %arg3 : !fir.ref) { + // CHECK: omp.loop_nest + omp.loop_nest (%arg4) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) { + // CHECK: fir.store + fir.store %arg0 to %arg1 : !fir.ref + // CHECK: omp.yield + omp.yield + } + } + return +} + +// ----- + +// CHECK-LABEL: func.func @simd_composite +func.func @simd_composite(%arg0: i32, %arg1: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + %c100000_i32 = arith.constant 100000 : i32 + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK-NOT: omp.wsloop + omp.wsloop { + // CHECK: omp.simd + omp.simd { + // CHECK: omp.loop_nest + omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) { + // CHECK: fir.store + fir.store %arg0 to %arg1 : !fir.ref + // CHECK: omp.yield + omp.yield + } + // CHECK-NOT: {omp.composite} + } {omp.composite} + } {omp.composite} + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @parallel +omp.private {type = private} @_QFEi_private_i32 : i32 +func.func @parallel(%arg0: i32, %arg1: !fir.ref) { + %c1 = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c100000_i32 = arith.constant 100000 : i32 + // CHECK-NOT: omp.parallel + omp.parallel private(@_QFEi_private_i32 %arg1 -> 
%arg3 : !fir.ref) { + // CHECK: fir.convert + %15 = fir.convert %c1_i32 : (i32) -> index + // CHECK: fir.convert + %16 = fir.convert %c100000_i32 : (i32) -> index + // CHECK: fir.do_loop + %18:2 = fir.do_loop %arg4 = %15 to %16 step %c1 iter_args(%arg2 = %arg0) -> (index, i32) { + // CHECK: fir.store + fir.store %arg0 to %arg1 : !fir.ref + // CHECK-NOT: omp.barrier + omp.barrier + fir.result %arg4, %arg2 : index, i32 + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @do +func.func @do(%arg5: i32, %arg6: !fir.ref) { + // CHECK: %[[C1:.*]] = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + // CHECK: %[[C100:.*]] = fir.convert %c100_i32 : (i32) -> index + %c100_i32 = arith.constant 100 : i32 + // CHECK-NOT: omp.wsloop + omp.wsloop { + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop %[[IVAR:.*]] = %[[C1]] to %[[C100]] step %[[C1]] + omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) { + // CHECK: fir.store + fir.store %arg5 to %arg6 : !fir.ref + // CHECK-NOT: omp.yield + omp.yield + } + } + return +} + +// ----- + +// CHECK-LABEL: func.func @do_nested +func.func @do_nested(%arg5: i32, %arg6: !fir.ref) { + // CHECK: %[[C1:.*]] = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c100_i32 = arith.constant 100 : i32 + %c200_i32 = arith.constant 200 : i32 + // CHECK-NOT: omp.wsloop + omp.wsloop { + // CHECK: %[[C200:.*]] = fir.convert %c200_i32 : (i32) -> index + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop %[[IVAR_1:.*]] = %[[C1]] to %[[C200]] step %[[C1]] + // CHECK: %[[C100:.*]] = fir.convert %c100_i32 : (i32) -> index + // CHECK: fir.do_loop %[[IVAR_2:.*]] = %[[C1]] to %[[C100]] step %[[C1]] + omp.loop_nest (%arg2, %arg3) : i32 = (%c1_i32, %c1_i32) to (%c200_i32, %c100_i32) inclusive step (%c1_i32, %c1_i32) { + // CHECK: fir.store + fir.store %arg5 to %arg6 : !fir.ref + // CHECK-NOT: omp.yield + omp.yield + } + } + return +} + +// ----- + +// 
CHECK-LABEL: func.func @single +func.func @single(%arg0: i32, %arg1: !fir.ref) { + // CHECK-NOT: omp.single + omp.single { + // CHECK: fir.store + fir.store %arg0 to %arg1 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @target_map( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @target_map(%arg5: i32, %arg6: !fir.ref) { + // CHECK-NOT: omp.map.info + %3 = omp.map.info var_ptr(%arg6 : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref + // CHECK-NOT: omp.target + omp.target map_entries(%3 -> %arg0 : !fir.ref) { + // CHECK: arith.constant + %c1_i32 = arith.constant 1 : i32 + // CHECK: fir.store %c1_i32 to %[[ARG_1]] + fir.store %c1_i32 to %arg0 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @task( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +omp.private {type = private} @_QFEi_private_i32 : i32 +func.func @task(%arg5: i32, %arg6: !fir.ref) { + // CHECK-NOT: omp.task + omp.task private(@_QFEi_private_i32 %arg6 -> %arg2 : !fir.ref) { + // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] + fir.store %arg5 to %arg2 : !fir.ref + // CHECK-NOT: omp.flush + omp.flush + // CHECK-NOT: omp.taskyield + omp.taskyield + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @teams +func.func @teams(%arg0: i32, %arg1: !fir.ref) { + // CHECK-NOT: omp.teams + omp.teams { + // CHECK: fir.store + fir.store %arg0 to %arg1 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @distribute +func.func @distribute(%arg0: i32, %arg1: i32, %arg2: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + // CHECK-NOT: omp.teams + omp.teams { + // CHECK-NOT: omp.distribute + omp.distribute { + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop + omp.loop_nest (%arg5) : i32 = (%arg0) to (%arg1) inclusive step 
(%c1_i32) { + // CHECK: fir.store + fir.store %arg0 to %arg2 : !fir.ref + // CHECK-NOT: omp.yield + omp.yield + } + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @threadprivate( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @threadprivate(%arg0: i32, %arg1: !fir.ref) { + // CHECK-NOT: omp.threadprivate + %1 = omp.threadprivate %arg1 : !fir.ref -> !fir.ref + // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] + fir.store %arg0 to %1 : !fir.ref + return +} + +// ----- + +// CHECK-LABEL: func.func @taskloop( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @taskloop(%funcArg0: i32, %funcArg1: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c10_i32 = arith.constant 10 : i32 + // CHECK-NOT: omp.taskloop + omp.taskloop grainsize(%c2_i32: i32) { + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop + omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) { + // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.yield + omp.yield + } + } + return +} + +// ----- + +// CHECK-LABEL: func.func @target_update_enter_data_map_info( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @target_update_enter_data_map_info(%funcArg0: i32, %funcArg1: !fir.ref) { + %c1 = arith.constant 1 : index + // CHECK-NOT: omp.map.bounds + %1 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index) + // CHECK-NOT: omp.map.info + %13 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(to) capture(ByRef) bounds(%1) -> !fir.ref + // CHECK-NOT: omp.target_enter_data + omp.target_enter_data map_entries(%13 : !fir.ref) + // CHECK-NOT: omp.target + omp.target map_entries(%13 -> %arg3 : !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + // CHECK: fir.store %c1_i32 to %[[ARG_1]] + fir.store 
%c1_i32 to %arg3 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.map.info + %18 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(from) capture(ByRef) bounds(%1) -> !fir.ref + // CHECK-NOT: omp.target_update + omp.target_update map_entries(%18 : !fir.ref) + // CHECK-NOT: omp.target_exit_data + omp.target_exit_data map_entries(%18 : !fir.ref) + return +} + +// ----- + +// CHECK-LABEL: func.func @target_data( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @target_data(%funcArg0: i32, %funcArg1: !fir.ref) { + %c1 = arith.constant 1 : index + // CHECK-NOT: omp.map.bounds + %3 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index) + // CHECK-NOT: omp.map.info + %4 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) bounds(%3) -> !fir.ref + // CHECK-NOT: omp.target_data + omp.target_data map_entries(%4 : !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + // CHECK: fir.store %c1_i32 to %[[ARG_1]] + fir.store %c1_i32 to %4 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @sections( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref +func.func @sections(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { + // CHECK-NOT: omp.sections + omp.sections { + // CHECK-NOT: omp.section + omp.section { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.section + omp.section { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg2 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +omp.declare_reduction @add_reduction_i32 : i32 init { +^bb0(%arg0: i32): + %c0_i32 = arith.constant 0 : i32 + omp.yield(%c0_i32 : i32) +} combiner 
{ +^bb0(%arg0: i32, %arg1: i32): + %0 = arith.addi %arg0, %arg1 : i32 + omp.yield(%0 : i32) +} +// CHECK-LABEL: func.func @reduction_scan( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @reduction_scan(%funcArg0: i32, %funcArg1: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + // CHECK-NOT: omp.wsloop + omp.wsloop reduction(mod: inscan, @add_reduction_i32 %funcArg1 -> %arg3 : !fir.ref) { + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop + omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c8_i32) inclusive step (%c1_i32) { + // CHECK: fir.declare %[[ARG_1]] + %1 = fir.declare %arg3 {uniq_name = "a"} : (!fir.ref) -> !fir.ref + // CHECK-NOT: omp.scan + omp.scan inclusive(%1 : !fir.ref) + // CHECK: fir.store + fir.store %funcArg0 to %1 : !fir.ref + // CHECK-NOT: omp.yield + omp.yield + } + } + return +} + +// ----- + +// CHECK-LABEL: func.func @ordered( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @ordered(%funcArg0: i32, %funcArg1: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + %c10_i32 = arith.constant 10 : i32 + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK-NOT: omp.wsloop + omp.wsloop ordered(0) { + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop + omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) { + // CHECK-NOT: omp.ordered.region + omp.ordered.region { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.yield + omp.yield + } + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @master( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref +func.func @master(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.master + 
omp.master { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg2 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @masked( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref +func.func @masked(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.masked + omp.masked { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg2 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @critical( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref +omp.critical.declare @mylock +func.func @critical(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.critical + omp.critical(@mylock) { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg2 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @cancel( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref, %[[ARG_3:.*]]: i1 +func.func @cancel(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref, %funcArg3: i1) { + %c1_i32 = arith.constant 1 : i32 + %c10_i32 = arith.constant 10 : i32 + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK-NOT: omp.wsloop + omp.wsloop { + // CHECK-NOT: omp.loop_nest + // CHECK: fir.do_loop + omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) { + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: fir.if + fir.if 
%funcArg3 { + // CHECK-NOT: omp.cancel + omp.cancel cancellation_construct_type(loop) + } + // CHECK-NOT: omp.cancellation_point + omp.cancellation_point cancellation_construct_type(loop) + // CHECK: fir.store + fir.store %funcArg0 to %funcArg2 : !fir.ref + // CHECK-NOT: omp.yield + omp.yield + } + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @atomic( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref, %[[ARG_3:.*]]: i32 +func.func @atomic(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref, %funcArg3: i32) { + %c1_i32 = arith.constant 1 : i32 + %5 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} + // CHECK: %[[VAL_0:.*]] = fir.declare + %6 = fir.declare %5 {uniq_name = "_QFEx"} : (!fir.ref) -> !fir.ref + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK-NOT: omp.atomic.write + // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] + omp.atomic.write %funcArg1 = %funcArg0 : !fir.ref, i32 + // CHECK-NOT: omp.atomic.read + // CHECK: %[[VAL_1:.*]] = fir.load %[[ARG_1]] + // CHECK-NEXT: fir.store %[[VAL_1]] to %[[ARG_2]] + omp.atomic.read %funcArg2 = %funcArg1 : !fir.ref, !fir.ref, i32 + // CHECK-NOT: omp.atomic.update + // CHECK: fir.load %[[VAL_0]] + // CHECK-NEXT: %[[ADD_VAL:.*]] = arith.addi + // CHECK-NOT: omp.yield + // CHECK-NEXT: fir.store %[[ADD_VAL]] to %[[VAL_0]] + omp.atomic.update %6 : !fir.ref { + ^bb0(%arg3: i32): + %88 = arith.addi %arg3, %c1_i32 : i32 + omp.yield(%88 : i32) + } + // CHECK-NOT: omp.atomic.read + // CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] + // CHECK-NEXT: fir.store %[[VAL_2]] to %[[ARG_1]] + omp.atomic.read %funcArg1 = %6 : !fir.ref, !fir.ref, i32 + // CHECK-NOT: omp.atomic.capture + omp.atomic.capture { + // CHECK-NOT: omp.atomic.read + // CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] + // CHECK-NEXT: fir.store %[[VAL_3]] to %[[ARG_2]] + omp.atomic.read %funcArg2 = %6 : !fir.ref, !fir.ref, i32 + // CHECK-NOT: omp.atomic.update + // 
CHECK: fir.load %[[VAL_0]] + // CHECK-NEXT: %[[ADD_VAL_2:.*]] = arith.addi + // CHECK-NOT: omp.yield + // CHECK-NEXT: fir.store %[[ADD_VAL_2]] to %[[VAL_0]] + omp.atomic.update %6 : !fir.ref { + ^bb0(%arg3: i32): + %88 = arith.addi %arg3, %c1_i32 : i32 + omp.yield(%88 : i32) + } + } + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @multi_block( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_3:.*]]: i1 +func.func @multi_block(%funcArg0: i32, %funcArg1: !fir.ref, %6: i1) { + %false = arith.constant false + %c0_i32 = arith.constant 0 : i32 + // CHECK-NOT: omp.parallel + omp.parallel { + // CHECK: cf.cond_br %[[ARG_3]], ^[[BB1:.*]], ^[[BB2:.*]] + cf.cond_br %6, ^bb1, ^bb2 + // CHECK: ^[[BB1]] + ^bb1: // pred: ^bb0 + // CHECK: fir.call + fir.call @_FortranAStopStatement(%c0_i32, %false, %false) fastmath : (i32, i1, i1) -> () + // CHECK-NOT: omp.terminator + omp.terminator + // CHECK: ^[[BB2]] + ^bb2: // pred: ^bb0 + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: func.func @do_multi_block( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_3:.*]]: i1 +func.func @do_multi_block(%funcArg0: i32, %funcArg1: !fir.ref, %6: i1) { + %false = arith.constant false + %c1_i32 = arith.constant 1 : i32 + %c100_i32 = arith.constant 100 : i32 + // CHECK-NOT: omp.wsloop + omp.wsloop { + // CHECK-NOT: omp.loop_nest + // CHECK: cf.br ^[[CBB:.*]]( + // CHECK: ^[[CBB]] + // CHECK: %[[CMP_VAL:.*]] = arith.cmpi + // CHECK: cf.cond_br %[[CMP_VAL]], ^[[FBB:.*]], ^[[LBB:.*]] + omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) { + // CHECK: ^[[FBB]] + // CHECK: fir.store + fir.store %funcArg0 to %funcArg1 : !fir.ref + // CHECK: cf.br ^[[BBB:.*]] + cf.br ^bb1 + // CHECK: ^[[BBB]] + ^bb1: // pred: ^bb0 + // CHECK: fir.store + fir.store %c1_i32 to %funcArg1 : 
!fir.ref + // CHECK: cf.cond_br + cf.cond_br %6, ^bb2, ^bb3 + // CHECK: ^[[SBB:.*]] + ^bb2: // pred: ^bb1 + // CHECK: fir.call + fir.call @_FortranAStopStatement(%c1_i32, %false, %false) fastmath : (i32, i1, i1) -> () + // CHECK-NOT: omp.yield + omp.yield + // CHECK: cf.br ^[[LBB:.*]] + // CHECK: ^[[OBB:.*]] + // CHECK: cf.br ^[[LBB]] + // CHECK: ^[[LBB]] + // CHECK: arith.subi + // CHECK: cf.br ^[[CBB]] + // CHECK: ^[[EBB:.*]] + ^bb3: // pred: ^bb1 + // CHECK-NOT: omp.yield + omp.yield + } + } + return +} diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index edfc878d17524..82dff2653ad09 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -520,7 +520,9 @@ static llvm::LogicalResult convertFortranSourceToMLIR( if (emitFIR && useHLFIR) { // lower HLFIR to FIR - fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, + fir::EnableOpenMP enableOmp = + enableOpenMP ? fir::EnableOpenMP::Full : fir::EnableOpenMP::None; + fir::createHLFIRToFIRPassPipeline(pm, enableOmp, llvm::OptimizationLevel::O2); if (mlir::failed(pm.run(mlirModule))) { llvm::errs() << "FATAL: lowering from HLFIR to FIR failed"; From 99b84c4ebc0449521bc229d16975765f2dc91ce7 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Thu, 24 Jul 2025 14:33:42 +0000 Subject: [PATCH 2/9] Address Tom's review comments --- .../include/flang/Optimizer/OpenMP/Passes.td | 2 +- .../flang/Optimizer/Transforms/Utils.h | 2 + flang/lib/Lower/OpenMP/ClauseProcessor.h | 5 +- flang/lib/Optimizer/OpenMP/SimdOnly.cpp | 217 +++++++++++------- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../Transforms/ControlFlowConverter.cpp | 107 --------- flang/lib/Optimizer/Transforms/Utils.cpp | 121 ++++++++++ flang/test/Transforms/OpenMP/simd-only.mlir | 2 +- 8 files changed, 259 insertions(+), 198 deletions(-) create mode 100644 flang/lib/Optimizer/Transforms/Utils.cpp diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 
79c1a5cfd9aca..e06289cfa8229 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -112,7 +112,7 @@ def GenericLoopConversionPass ]; } -def SimdOnlyPass : Pass<"omp-simd-only", "mlir::func::FuncOp"> { +def SimdOnlyPass : Pass<"omp-simd-only", "mlir::ModuleOp"> { let summary = "Filters out non-simd OpenMP constructs"; let dependentDialects = ["mlir::omp::OpenMPDialect"]; } diff --git a/flang/include/flang/Optimizer/Transforms/Utils.h b/flang/include/flang/Optimizer/Transforms/Utils.h index 307e6b59c57d4..116a4eefdc794 100644 --- a/flang/include/flang/Optimizer/Transforms/Utils.h +++ b/flang/include/flang/Optimizer/Transforms/Utils.h @@ -13,6 +13,8 @@ #ifndef FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H #define FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H +#include "flang/Optimizer/Dialect/FIROps.h" + namespace fir { using MinlocBodyOpGeneratorTy = llvm::function_ref -#include -#include -#include -#include +#include "llvm/Support/Debug.h" namespace flangomp { #define GEN_PASS_DEF_SIMDONLYPASS @@ -44,8 +51,15 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return rewriter.notifyMatchFailure(op, "Op is a plain SimdOp"); } - if (op->getParentOfType()) - return rewriter.notifyMatchFailure(op, "Op is nested under a SimdOp"); + if (op->getParentOfType() && + (mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op))) + return rewriter.notifyMatchFailure(op, "Op is part of a simd construct"); if (!mlir::isa(op->getParentOp()) && (mlir::isa(op) || @@ -67,6 +81,28 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { LLVM_DEBUG(llvm::dbgs() << "SimdOnlyPass matched OpenMP op:\n"); LLVM_DEBUG(op->dump()); + auto eraseUnlessUsedBySimd = [&](mlir::Operation *ompOp, + mlir::StringAttr name) { + if (auto uses = + mlir::SymbolTable::getSymbolUses(name, op->getParentOp())) { + for (auto &use : *uses) + if (mlir::isa(use.getUser())) + 
return rewriter.notifyMatchFailure(op, + "Op used by a simd construct"); + } + rewriter.eraseOp(ompOp); + return mlir::success(); + }; + + if (auto ompOp = mlir::dyn_cast(op)) + return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); + if (auto ompOp = mlir::dyn_cast(op)) + return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); + if (auto ompOp = mlir::dyn_cast(op)) + return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); + if (auto ompOp = mlir::dyn_cast(op)) + return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); + // Erase ops that don't need any special handling if (mlir::isa(op) || mlir::isa(op) || @@ -87,67 +123,11 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { fir::FirOpBuilder builder(rewriter, op); mlir::Location loc = op->getLoc(); - auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool { - if (!ompOp) - return false; - - llvm::SmallVector> - blockArgsPairs; - if (auto iface = - mlir::dyn_cast(op)) { - iface.getBlockArgsPairs(blockArgsPairs); - for (auto [value, argument] : blockArgsPairs) - rewriter.replaceAllUsesWith(argument, value); - } - - if (ompOp->getRegion(0).getBlocks().size() == 1) { - auto &block = *ompOp->getRegion(0).getBlocks().begin(); - // This block is about to be removed so any arguments should have been - // replaced by now. - block.eraseArguments(0, block.getNumArguments()); - if (auto terminatorOp = - mlir::dyn_cast(block.back())) { - rewriter.eraseOp(terminatorOp); - } - rewriter.inlineBlockBefore(&block, op, {}); - } else { - // When dealing with multi-block regions we need to fix up the control - // flow - auto *origBlock = ompOp->getBlock(); - auto *newBlock = rewriter.splitBlock(origBlock, ompOp->getIterator()); - auto *innerFrontBlock = &ompOp->getRegion(0).getBlocks().front(); - builder.setInsertionPointToEnd(origBlock); - builder.create(loc, innerFrontBlock); - // We are no longer passing any arguments to the first block in the - // region, so this should be safe to erase. 
- innerFrontBlock->eraseArguments(0, innerFrontBlock->getNumArguments()); - - for (auto &innerBlock : ompOp->getRegion(0).getBlocks()) { - // Remove now-unused block arguments - for (auto arg : innerBlock.getArguments()) { - if (arg.getUses().empty()) - innerBlock.eraseArgument(arg.getArgNumber()); - } - if (auto terminatorOp = - mlir::dyn_cast(innerBlock.back())) { - builder.setInsertionPointToEnd(&innerBlock); - builder.create(loc, newBlock); - rewriter.eraseOp(terminatorOp); - } - } - - rewriter.inlineRegionBefore(ompOp->getRegion(0), newBlock); - } - - rewriter.eraseOp(op); - return true; - }; - if (auto ompOp = mlir::dyn_cast(op)) { mlir::Type indexType = builder.getIndexType(); mlir::Type oldIndexType = ompOp.getIVs().begin()->getType(); builder.setInsertionPoint(op); - auto one = builder.create(loc, 1); + auto one = mlir::arith::ConstantIndexOp::create(builder, loc, 1); // Generate the new loop nest mlir::Block *nestBody = nullptr; @@ -155,7 +135,7 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { llvm::SmallVector loopIndArgs; for (auto extent : ompOp.getLoopUpperBounds()) { auto ub = builder.createConvert(loc, indexType, extent); - auto doLoop = builder.create(loc, one, ub, one, false); + auto doLoop = fir::DoLoopOp::create(builder, loc, one, ub, one, false); nestBody = doLoop.getBody(); builder.setInsertionPointToStart(nestBody); // Convert the indices to the type used inside the loop if needed @@ -185,11 +165,12 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { } // Remove omp.yield at the end of the loop body - if (auto yieldOp = mlir::dyn_cast(nestBody->back())) + if (auto yieldOp = + mlir::dyn_cast(nestBody->back())) { + assert("omp.loop_nests's omp.yield has no operands" && + yieldOp->getNumOperands() == 0); rewriter.eraseOp(yieldOp); - // DoLoopOp does not support multi-block regions, thus if we're dealing - // with multiple blocks we need to convert it into basic control-flow - // operations. 
+ } } else { rewriter.inlineRegionBefore(ompOp->getRegion(0), nestBody); auto indVarArg = outerLoop->getRegion(0).front().getArgument(0); @@ -199,6 +180,9 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { if (indVarArg.getType() != indexType) indVarArg.setType(indexType); + // fir.do_loop, unlike omp.loop_nest does not support multi-block + // regions. If we're dealing with multiple blocks inside omp.loop_nest, + // we need to convert it into basic control-flow operations instead. auto loopBlocks = fir::convertDoLoopToCFG(outerLoop, rewriter, false, false); auto *conditionalBlock = loopBlocks.first; @@ -237,7 +221,9 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { if (auto yieldOp = mlir::dyn_cast(loopBlock->back())) { builder.setInsertionPointToEnd(loopBlock); - builder.create(loc, lastBlock); + mlir::cf::BranchOp::create(builder, loc, lastBlock); + assert("omp.loop_nests's omp.yield has no operands" && + yieldOp->getNumOperands() == 0); rewriter.eraseOp(yieldOp); } } @@ -255,16 +241,16 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { if (auto atomicReadOp = mlir::dyn_cast(op)) { builder.setInsertionPoint(op); - auto loadOp = builder.create(loc, atomicReadOp.getX()); - auto storeOp = builder.create(loc, loadOp.getResult(), - atomicReadOp.getV()); + auto loadOp = fir::LoadOp::create(builder, loc, atomicReadOp.getX()); + auto storeOp = fir::StoreOp::create(builder, loc, loadOp.getResult(), + atomicReadOp.getV()); rewriter.replaceOp(op, storeOp); return mlir::success(); } if (auto atomicWriteOp = mlir::dyn_cast(op)) { - auto storeOp = builder.create(loc, atomicWriteOp.getExpr(), - atomicWriteOp.getX()); + auto storeOp = fir::StoreOp::create(builder, loc, atomicWriteOp.getExpr(), + atomicWriteOp.getX()); rewriter.replaceOp(op, storeOp); return mlir::success(); } @@ -276,7 +262,7 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { builder.setInsertionPointToStart(&block); // Load the update `x` operand 
and replace its uses within the block - auto loadOp = builder.create(loc, atomicUpdateOp.getX()); + auto loadOp = fir::LoadOp::create(builder, loc, atomicUpdateOp.getX()); rewriter.replaceUsesWithIf( block.getArgument(0), loadOp.getResult(), [&](auto &op) { return op.get().getParentBlock() == █ }); @@ -286,14 +272,14 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { auto yieldOp = mlir::cast(block.back()); assert("only one yield operand" && yieldOp->getNumOperands() == 1); builder.setInsertionPointAfter(yieldOp); - builder.create(loc, yieldOp->getOperand(0), - atomicUpdateOp.getX()); + fir::StoreOp::create(builder, loc, yieldOp->getOperand(0), + atomicUpdateOp.getX()); rewriter.eraseOp(yieldOp); // Inline the final block and remove the now-empty op assert("only one block argument" && block.getNumArguments() == 1); block.eraseArguments(0, block.getNumArguments()); - rewriter.inlineBlockBefore(&block, op, {}); + rewriter.inlineBlockBefore(&block, atomicUpdateOp, {}); rewriter.eraseOp(op); return mlir::success(); } @@ -305,6 +291,64 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return mlir::success(); } + auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool { + if (!ompOp) + return false; + + assert("OpenMP operation has one region" && ompOp->getNumRegions() == 1); + + llvm::SmallVector> + blockArgsPairs; + if (auto iface = + mlir::dyn_cast(op)) { + iface.getBlockArgsPairs(blockArgsPairs); + for (auto [value, argument] : blockArgsPairs) + rewriter.replaceAllUsesWith(argument, value); + } + + if (ompOp->getRegion(0).getBlocks().size() == 1) { + auto &block = *ompOp->getRegion(0).getBlocks().begin(); + // This block is about to be removed so any arguments should have been + // replaced by now. 
+ block.eraseArguments(0, block.getNumArguments()); + if (auto terminatorOp = + mlir::dyn_cast(block.back())) { + rewriter.eraseOp(terminatorOp); + } + rewriter.inlineBlockBefore(&block, ompOp, {}); + } else { + // When dealing with multi-block regions we need to fix up the control + // flow + auto *origBlock = ompOp->getBlock(); + auto *newBlock = rewriter.splitBlock(origBlock, ompOp->getIterator()); + auto *innerFrontBlock = &ompOp->getRegion(0).getBlocks().front(); + builder.setInsertionPointToEnd(origBlock); + mlir::cf::BranchOp::create(builder, loc, innerFrontBlock); + // We are no longer passing any arguments to the first block in the + // region, so this should be safe to erase. + innerFrontBlock->eraseArguments(0, innerFrontBlock->getNumArguments()); + + for (auto &innerBlock : ompOp->getRegion(0).getBlocks()) { + // Remove now-unused block arguments + for (auto arg : innerBlock.getArguments()) { + if (arg.getUses().empty()) + innerBlock.eraseArgument(arg.getArgNumber()); + } + if (auto terminatorOp = + mlir::dyn_cast(innerBlock.back())) { + builder.setInsertionPointToEnd(&innerBlock); + mlir::cf::BranchOp::create(builder, loc, newBlock); + rewriter.eraseOp(terminatorOp); + } + } + + rewriter.inlineRegionBefore(ompOp->getRegion(0), newBlock); + } + + rewriter.eraseOp(op); + return true; + }; + if (inlineSimpleOp(mlir::dyn_cast(op)) || inlineSimpleOp(mlir::dyn_cast(op)) || inlineSimpleOp(mlir::dyn_cast(op)) || @@ -324,7 +368,7 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { inlineSimpleOp(mlir::dyn_cast(op))) return mlir::success(); - op->emitOpError("OpenMP operation left unhandled after SimdOnly pass."); + op->emitOpError("left unhandled after SimdOnly pass."); return mlir::failure(); } }; @@ -335,10 +379,7 @@ class SimdOnlyPass : public flangomp::impl::SimdOnlyPassBase { SimdOnlyPass() = default; void runOnOperation() override { - mlir::func::FuncOp func = getOperation(); - - if (func.isDeclaration()) - return; + mlir::ModuleOp module = 
getOperation(); mlir::MLIRContext *context = &getContext(); mlir::RewritePatternSet patterns(context); @@ -350,8 +391,8 @@ class SimdOnlyPass : public flangomp::impl::SimdOnlyPassBase { mlir::GreedySimplifyRegionLevel::Disabled); if (mlir::failed( - mlir::applyPatternsGreedily(func, std::move(patterns), config))) { - mlir::emitError(func.getLoc(), "error in simd-only conversion pass"); + mlir::applyPatternsGreedily(module, std::move(patterns), config))) { + mlir::emitError(module.getLoc(), "error in simd-only conversion pass"); signalPassFailure(); } } diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index a8812e08c1ccd..302776a14b9f1 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -35,6 +35,7 @@ add_flang_library(FIRTransforms GenRuntimeCallsForTest.cpp SimplifyFIROperations.cpp OptimizeArrayRepacking.cpp + Utils.cpp DEPENDS CUFAttrs diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp index 4bcf7d857c7b0..cdb194f5a68c9 100644 --- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp +++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp @@ -32,113 +32,6 @@ namespace fir { using namespace fir; using namespace mlir; -// Extracted here for use in other passes - -/// Convert fir::DoLoopOp to control-flow operations -std::pair -fir::convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, - bool setNSW, bool forceLoopToExecuteOnce) { - auto loc = loop.getLoc(); - mlir::arith::IntegerOverflowFlags flags{}; - if (setNSW) - flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw); - auto iofAttr = - mlir::arith::IntegerOverflowFlagsAttr::get(rewriter.getContext(), flags); - - // Create the start and end blocks that will wrap the DoLoopOp with an - // initalizer and an end point - auto *initBlock = rewriter.getInsertionBlock(); - auto initPos = 
rewriter.getInsertionPoint(); - auto *endBlock = rewriter.splitBlock(initBlock, initPos); - - // Split the first DoLoopOp block in two parts. The part before will be the - // conditional block since it already has the induction variable and - // loop-carried values as arguments. - auto *conditionalBlock = &loop.getRegion().front(); - conditionalBlock->addArgument(rewriter.getIndexType(), loc); - auto *firstBlock = - rewriter.splitBlock(conditionalBlock, conditionalBlock->begin()); - auto *lastBlock = &loop.getRegion().back(); - - // Move the blocks from the DoLoopOp between initBlock and endBlock - rewriter.inlineRegionBefore(loop.getRegion(), endBlock); - - // Get loop values from the DoLoopOp - auto low = loop.getLowerBound(); - auto high = loop.getUpperBound(); - assert(low && high && "must be a Value"); - auto step = loop.getStep(); - - // Initalization block - rewriter.setInsertionPointToEnd(initBlock); - auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); - auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); - mlir::Value iters = - mlir::arith::DivSIOp::create(rewriter, loc, distance, step); - - if (forceLoopToExecuteOnce) { - auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); - auto cond = mlir::arith::CmpIOp::create( - rewriter, loc, arith::CmpIPredicate::sle, iters, zero); - auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); - } - - llvm::SmallVector loopOperands; - loopOperands.push_back(low); - auto operands = loop.getIterOperands(); - loopOperands.append(operands.begin(), operands.end()); - loopOperands.push_back(iters); - - mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); - - // Last loop block - auto *terminator = lastBlock->getTerminator(); - rewriter.setInsertionPointToEnd(lastBlock); - auto iv = conditionalBlock->getArgument(0); - mlir::Value steppedIndex = - 
mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); - assert(steppedIndex && "must be a Value"); - auto lastArg = conditionalBlock->getNumArguments() - 1; - auto itersLeft = conditionalBlock->getArgument(lastArg); - auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - mlir::Value itersMinusOne = - mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); - - llvm::SmallVector loopCarried; - loopCarried.push_back(steppedIndex); - auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin()) - : terminator->operand_begin(); - loopCarried.append(begin, terminator->operand_end()); - loopCarried.push_back(itersMinusOne); - auto backEdge = - mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopCarried); - rewriter.eraseOp(terminator); - - // Copy loop annotations from the do loop to the loop back edge. - if (auto ann = loop.getLoopAnnotation()) - backEdge->setAttr("loop_annotation", *ann); - - // Conditional block - rewriter.setInsertionPointToEnd(conditionalBlock); - auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); - auto comparison = mlir::arith::CmpIOp::create( - rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero); - - mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, - llvm::ArrayRef(), endBlock, - llvm::ArrayRef()); - - // The result of the loop operation is the values of the condition block - // arguments except the induction variable on the last iteration. - auto args = loop.getFinalValue() - ? conditionalBlock->getArguments() - : conditionalBlock->getArguments().drop_front(); - rewriter.replaceOp(loop, args.drop_back()); - - return std::make_pair(conditionalBlock, lastBlock); -} - namespace { // Conversion of fir control ops to more primitive control-flow. 
diff --git a/flang/lib/Optimizer/Transforms/Utils.cpp b/flang/lib/Optimizer/Transforms/Utils.cpp new file mode 100644 index 0000000000000..2b2a2159e2501 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/Utils.cpp @@ -0,0 +1,121 @@ +//===-- Utils.cpp ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Transforms/Utils.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" + +/// Convert fir::DoLoopOp to control-flow operations +std::pair +fir::convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, + bool setNSW, bool forceLoopToExecuteOnce) { + auto loc = loop.getLoc(); + mlir::arith::IntegerOverflowFlags flags{}; + if (setNSW) + flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw); + auto iofAttr = + mlir::arith::IntegerOverflowFlagsAttr::get(rewriter.getContext(), flags); + + // Create the start and end blocks that will wrap the DoLoopOp with an + // initalizer and an end point + auto *initBlock = rewriter.getInsertionBlock(); + auto initPos = rewriter.getInsertionPoint(); + auto *endBlock = rewriter.splitBlock(initBlock, initPos); + + // Split the first DoLoopOp block in two parts. The part before will be the + // conditional block since it already has the induction variable and + // loop-carried values as arguments. 
+ auto *conditionalBlock = &loop.getRegion().front(); + conditionalBlock->addArgument(rewriter.getIndexType(), loc); + auto *firstBlock = + rewriter.splitBlock(conditionalBlock, conditionalBlock->begin()); + auto *lastBlock = &loop.getRegion().back(); + + // Move the blocks from the DoLoopOp between initBlock and endBlock + rewriter.inlineRegionBefore(loop.getRegion(), endBlock); + + // Get loop values from the DoLoopOp + auto low = loop.getLowerBound(); + auto high = loop.getUpperBound(); + assert(low && high && "must be a Value"); + auto step = loop.getStep(); + + // Initalization block + rewriter.setInsertionPointToEnd(initBlock); + auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); + auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); + mlir::Value iters = + mlir::arith::DivSIOp::create(rewriter, loc, distance, step); + + if (forceLoopToExecuteOnce) { + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto cond = mlir::arith::CmpIOp::create( + rewriter, loc, mlir::arith::CmpIPredicate::sle, iters, zero); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); + } + + llvm::SmallVector loopOperands; + loopOperands.push_back(low); + auto operands = loop.getIterOperands(); + loopOperands.append(operands.begin(), operands.end()); + loopOperands.push_back(iters); + + mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); + + // Last loop block + auto *terminator = lastBlock->getTerminator(); + rewriter.setInsertionPointToEnd(lastBlock); + auto iv = conditionalBlock->getArgument(0); + mlir::Value steppedIndex = + mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); + assert(steppedIndex && "must be a Value"); + auto lastArg = conditionalBlock->getNumArguments() - 1; + auto itersLeft = conditionalBlock->getArgument(lastArg); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + 
mlir::Value itersMinusOne = + mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); + + llvm::SmallVector loopCarried; + loopCarried.push_back(steppedIndex); + auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin()) + : terminator->operand_begin(); + loopCarried.append(begin, terminator->operand_end()); + loopCarried.push_back(itersMinusOne); + auto backEdge = + mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopCarried); + rewriter.eraseOp(terminator); + + // Copy loop annotations from the do loop to the loop back edge. + if (auto ann = loop.getLoopAnnotation()) + backEdge->setAttr("loop_annotation", *ann); + + // Conditional block + rewriter.setInsertionPointToEnd(conditionalBlock); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto comparison = mlir::arith::CmpIOp::create( + rewriter, loc, mlir::arith::CmpIPredicate::sgt, itersLeft, zero); + + mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, + llvm::ArrayRef(), endBlock, + llvm::ArrayRef()); + + // The result of the loop operation is the values of the condition block + // arguments except the induction variable on the last iteration. + auto args = loop.getFinalValue() + ? conditionalBlock->getArguments() + : conditionalBlock->getArguments().drop_front(); + rewriter.replaceOp(loop, args.drop_back()); + + return std::make_pair(conditionalBlock, lastBlock); +} diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir index c3efce13c4414..8c0f8af7914d0 100644 --- a/flang/test/Transforms/OpenMP/simd-only.mlir +++ b/flang/test/Transforms/OpenMP/simd-only.mlir @@ -1,4 +1,4 @@ -// RUN: fir-opt --split-input-file --omp-simd-only %s | FileCheck %s +// RUN: fir-opt --split-input-file --verify-diagnostics --omp-simd-only %s | FileCheck %s // Check that simd operations are not removed and rewritten, but all the other OpenMP ops are. 
From 21374b4d948df1ec773b7262093c960629dfe172 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Fri, 25 Jul 2025 14:41:00 +0000 Subject: [PATCH 3/9] Add help message for -fno-openmp-simd, expand tests --- clang/include/clang/Driver/Options.td | 3 ++- flang/test/Transforms/OpenMP/simd-only.mlir | 24 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8a2bf40cc6a32..51a3062ad37fd 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3743,7 +3743,8 @@ def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group, Flags<[NoArgumentUnused]>, - Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, + HelpText<"Do not emit code for any OpenMP constructs.">; def fopenmp_cuda_mode : Flag<["-"], "fopenmp-cuda-mode">, Group, Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>; def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group, diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir index 8c0f8af7914d0..278527d448864 100644 --- a/flang/test/Transforms/OpenMP/simd-only.mlir +++ b/flang/test/Transforms/OpenMP/simd-only.mlir @@ -2,6 +2,7 @@ // Check that simd operations are not removed and rewritten, but all the other OpenMP ops are. 
+// CHECK: omp.private // CHECK-LABEL: func.func @simd omp.private {type = private} @_QFEi_private_i32 : i32 func.func @simd(%arg0: i32, %arg1: !fir.ref, %arg2: !fir.ref) { @@ -49,6 +50,7 @@ func.func @simd_composite(%arg0: i32, %arg1: !fir.ref) { // ----- +// CHECK-NOT: omp.private // CHECK-LABEL: func.func @parallel omp.private {type = private} @_QFEi_private_i32 : i32 func.func @parallel(%arg0: i32, %arg1: !fir.ref) { @@ -326,6 +328,7 @@ func.func @sections(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.re // ----- +// CHECK-NOT: omp.declare_reduction omp.declare_reduction @add_reduction_i32 : i32 init { ^bb0(%arg0: i32): %c0_i32 = arith.constant 0 : i32 @@ -620,3 +623,24 @@ func.func @do_multi_block(%funcArg0: i32, %funcArg1: !fir.ref, %6: i1) { } return } + +// ----- + +// CHECK-LABEL: func.func @simd_nested_atomic( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref +func.func @simd_nested_atomic(%arg0: i32, %arg1: !fir.ref, %arg2: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + %c100000_i32 = arith.constant 100000 : i32 + // CHECK: omp.simd + omp.simd { + // CHECK: omp.loop_nest + omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) { + // CHECK-NOT: omp.atomic.write + // CHECK: fir.store %[[ARG_0]] to %[[ARG_2]] + omp.atomic.write %arg2 = %arg0 : !fir.ref, i32 + // CHECK: omp.yield + omp.yield + } + } + return +} From dfc696431f9720df5756ae27a854da6c093b6ce2 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Mon, 4 Aug 2025 17:00:28 +0000 Subject: [PATCH 4/9] Support omp.canonical_loop and related new ops --- flang/lib/Optimizer/OpenMP/SimdOnly.cpp | 131 +++++++++++++++++++- flang/test/Transforms/OpenMP/simd-only.mlir | 28 +++++ 2 files changed, 155 insertions(+), 4 deletions(-) diff --git a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp index 1e832f8744c51..37a8053601d34 100644 --- a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp +++ 
b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp @@ -54,10 +54,6 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { if (op->getParentOfType() && (mlir::isa(op) || mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || mlir::isa(op))) return rewriter.notifyMatchFailure(op, "Op is part of a simd construct"); @@ -67,6 +63,10 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return rewriter.notifyMatchFailure(op, "Non top-level yield or terminator"); + if (mlir::isa(op)) + return rewriter.notifyMatchFailure( + op, "UnrollHeuristic has special handling"); + // SectionOp overrides its BlockArgInterface based on the parent SectionsOp. // We need to make sure we only rewrite omp.sections once all omp.section // ops inside it have been rewritten, otherwise the individual omp.section @@ -291,6 +291,129 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return mlir::success(); } + if (auto cLoopOp = mlir::dyn_cast(op)) { + assert("CanonicalLoopOp has one region" && cLoopOp->getNumRegions() == 1); + auto cli = cLoopOp.getCli(); + auto tripCount = cLoopOp.getTripCount(); + + builder.setInsertionPoint(cLoopOp); + mlir::Type indexType = builder.getIndexType(); + mlir::Type oldIndexType = tripCount.getType(); + auto one = mlir::arith::ConstantIndexOp::create(builder, loc, 1); + auto ub = builder.createConvert(loc, indexType, tripCount); + + llvm::SmallVector loopIndArgs; + auto doLoop = fir::DoLoopOp::create(builder, loc, one, ub, one, false); + builder.setInsertionPointToStart(doLoop.getBody()); + if (oldIndexType != indexType) { + auto convertedIndVar = + builder.createConvert(loc, oldIndexType, doLoop.getInductionVar()); + loopIndArgs.push_back(convertedIndVar); + } else { + loopIndArgs.push_back(doLoop.getInductionVar()); + } + + if (cLoopOp.getRegion().getBlocks().size() == 1) { + auto &block = *cLoopOp.getRegion().getBlocks().begin(); + // DoLoopOp will handle incrementing the induction variable 
+ if (auto addIOp = mlir::dyn_cast(block.front())) { + rewriter.replaceOpUsesWithinBlock(addIOp, addIOp.getLhs(), &block); + rewriter.eraseOp(addIOp); + } + + rewriter.mergeBlocks(&block, doLoop.getBody(), loopIndArgs); + + // Find the new loop block terminator and move it before the end of the + // block + for (auto &loopBodyOp : doLoop.getBody()->getOperations()) { + if (auto resultOp = mlir::dyn_cast(loopBodyOp)) { + rewriter.moveOpBefore(resultOp.getOperation(), + &doLoop.getBody()->back()); + break; + } + } + + // Remove omp.terminator at the end of the loop body + if (auto terminatorOp = mlir::dyn_cast( + doLoop.getBody()->back())) { + rewriter.eraseOp(terminatorOp); + } + } else { + rewriter.inlineRegionBefore(cLoopOp->getRegion(0), doLoop.getBody()); + auto indVarArg = doLoop.getBody()->getArgument(0); + // fir::convertDoLoopToCFG expects the induction variable to be of type + // index while the OpenMP CanonicalLoopOp can have indices of different + // types. We need to work around it. + if (indVarArg.getType() != indexType) + indVarArg.setType(indexType); + + // fir.do_loop, unlike omp.canonical_loop does not support multi-block + // regions. If we're dealing with multiple blocks inside omp.loop_nest, + // we need to convert it into basic control-flow operations instead. 
+ auto loopBlocks = + fir::convertDoLoopToCFG(doLoop, rewriter, false, false); + auto *conditionalBlock = loopBlocks.first; + auto *firstBlock = + conditionalBlock->getNextNode(); // Start of the loop body + auto *lastBlock = loopBlocks.second; // Incrementing induction variables + + // Incrementing the induction variable is handled elsewhere + if (auto addIOp = + mlir::dyn_cast(firstBlock->front())) { + rewriter.replaceOpUsesWithinBlock(addIOp, addIOp.getLhs(), + firstBlock); + rewriter.eraseOp(addIOp); + } + + // If the induction variable is used within the loop and was originally + // not of type index, then we need to add a convert to the original type + // and replace its uses inside the loop body. + if (oldIndexType != indexType) { + indVarArg = conditionalBlock->getArgument(0); + builder.setInsertionPointToStart(firstBlock); + auto convertedIndVar = + builder.createConvert(loc, oldIndexType, indVarArg); + rewriter.replaceUsesWithIf( + indVarArg, convertedIndVar, [&](auto &use) -> bool { + return use.getOwner() != convertedIndVar.getDefiningOp() && + use.getOwner()->getBlock() != lastBlock; + }); + } + + // There might be an unused convert and an unused argument to the block. + // If so, remove them. + if (lastBlock->front().getUses().empty()) + lastBlock->front().erase(); + for (auto arg : lastBlock->getArguments()) { + if (arg.getUses().empty()) + lastBlock->eraseArgument(arg.getArgNumber()); + } + + // Any loop blocks that end in omp.terminator should just branch to + // lastBlock. 
+ for (auto *loopBlock = conditionalBlock; loopBlock != lastBlock; + loopBlock = loopBlock->getNextNode()) { + if (auto terminatorOp = + mlir::dyn_cast(loopBlock->back())) { + builder.setInsertionPointToEnd(loopBlock); + mlir::cf::BranchOp::create(builder, loc, lastBlock); + rewriter.eraseOp(terminatorOp); + } + } + } + + rewriter.eraseOp(cLoopOp); + // Handle the optional omp.new_cli op + if (cli) { + // cli will be used by omp.unroll_heuristic ops + for (auto *user : cli.getUsers()) + rewriter.eraseOp(user); + rewriter.eraseOp(cli.getDefiningOp()); + } + + return mlir::success(); + } + auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool { if (!ompOp) return false; diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir index 278527d448864..d768823565677 100644 --- a/flang/test/Transforms/OpenMP/simd-only.mlir +++ b/flang/test/Transforms/OpenMP/simd-only.mlir @@ -644,3 +644,31 @@ func.func @simd_nested_atomic(%arg0: i32, %arg1: !fir.ref, %arg2: !fir.ref< } return } + +// ----- + +// CHECK-LABEL: func.func @unroll( +// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref +func.func @unroll(%arg0: i32, %arg1: !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + // CHECK: %[[RANGE_i32:.*]] = arith.constant 16 : i32 + %c16_i32 = arith.constant 16 : i32 + // CHECK: %[[C1_IDX:.*]] = arith.constant 1 : index + // CHECK: %[[RANGE:.*]] = fir.convert %[[RANGE_i32]] + // CHECK-NOT: omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NOT: omp.canonical_loop + // CHECK: fir.do_loop %[[IVAR:.*]] = %[[C1_IDX]] to %[[RANGE]] step %[[C1_IDX]] + omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%c16_i32) { + // CHECK: %[[IVAR_CVT:.*]] = fir.convert %[[IVAR]] : (index) -> i32 + // CHECK-NOT: arith.addi + %3 = arith.addi %iv, %c1_i32 : i32 + // CHECK: fir.store %[[IVAR_CVT]] to %[[ARG_1]] + fir.store %3 to %arg1 : !fir.ref + // CHECK-NOT: omp.terminator + omp.terminator + } + // CHECK-NOT: omp.unroll_heuristic + 
omp.unroll_heuristic(%canonloop_s0) + return +} From 773fb19214f8d25dd40a545edd54caef2b660a37 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Mon, 11 Aug 2025 13:04:42 +0000 Subject: [PATCH 5/9] Redesign with parse tree rewriting --- flang/lib/Optimizer/OpenMP/SimdOnly.cpp | 345 +------------ flang/lib/Semantics/rewrite-parse-tree.cpp | 287 +++++++++++ flang/test/Semantics/OpenMP/simd-only.f90 | 416 +++++++++++++++ flang/test/Transforms/OpenMP/simd-only.mlir | 538 ++------------------ 4 files changed, 750 insertions(+), 836 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/simd-only.f90 diff --git a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp index 37a8053601d34..c9d9dbe03dac2 100644 --- a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp +++ b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp @@ -29,6 +29,11 @@ namespace { #define DEBUG_TYPE "omp-simd-only-pass" +/// Rewrite and remove OpenMP operations left after the parse tree rewriting for +/// -fopenmp-simd is done. If possible, OpenMP constructs should be rewritten at +/// the parse tree stage. This pass is supposed to only handle complexities +/// around untangling composite simd constructs, and perform the necessary +/// cleanup. class SimdOnlyConversionPattern : public mlir::RewritePattern { public: SimdOnlyConversionPattern(mlir::MLIRContext *ctx) @@ -53,6 +58,7 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { if (op->getParentOfType() && (mlir::isa(op) || + mlir::isa(op) || mlir::isa(op) || mlir::isa(op))) return rewriter.notifyMatchFailure(op, "Op is part of a simd construct"); @@ -63,21 +69,6 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return rewriter.notifyMatchFailure(op, "Non top-level yield or terminator"); - if (mlir::isa(op)) - return rewriter.notifyMatchFailure( - op, "UnrollHeuristic has special handling"); - - // SectionOp overrides its BlockArgInterface based on the parent SectionsOp. 
- // We need to make sure we only rewrite omp.sections once all omp.section - // ops inside it have been rewritten, otherwise the individual omp.section - // ops will not be able to access their argument values. - if (auto sectionsOp = mlir::dyn_cast(op)) { - for (auto &opInSections : sectionsOp.getRegion().getOps()) - if (mlir::isa(opInSections)) - return rewriter.notifyMatchFailure( - op, "SectionsOp still contains individual sections"); - } - LLVM_DEBUG(llvm::dbgs() << "SimdOnlyPass matched OpenMP op:\n"); LLVM_DEBUG(op->dump()); @@ -98,192 +89,19 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); if (auto ompOp = mlir::dyn_cast(op)) return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); - if (auto ompOp = mlir::dyn_cast(op)) - return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); - if (auto ompOp = mlir::dyn_cast(op)) - return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr()); - // Erase ops that don't need any special handling - if (mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op) || - mlir::isa(op)) { + // Might be left over from rewriting composite simd with target map + if (mlir::isa(op)) { rewriter.eraseOp(op); return mlir::success(); } - - fir::FirOpBuilder builder(rewriter, op); - mlir::Location loc = op->getLoc(); - - if (auto ompOp = mlir::dyn_cast(op)) { - mlir::Type indexType = builder.getIndexType(); - mlir::Type oldIndexType = ompOp.getIVs().begin()->getType(); - builder.setInsertionPoint(op); - auto one = mlir::arith::ConstantIndexOp::create(builder, loc, 1); - - // Generate the new loop nest - mlir::Block *nestBody = nullptr; - fir::DoLoopOp outerLoop = nullptr; - llvm::SmallVector loopIndArgs; - for (auto extent : ompOp.getLoopUpperBounds()) { - auto ub = builder.createConvert(loc, indexType, extent); 
- auto doLoop = fir::DoLoopOp::create(builder, loc, one, ub, one, false); - nestBody = doLoop.getBody(); - builder.setInsertionPointToStart(nestBody); - // Convert the indices to the type used inside the loop if needed - if (oldIndexType != indexType) { - auto convertedIndVar = builder.createConvert( - loc, oldIndexType, doLoop.getInductionVar()); - loopIndArgs.push_back(convertedIndVar); - } else { - loopIndArgs.push_back(doLoop.getInductionVar()); - } - if (!outerLoop) - outerLoop = doLoop; - } - - // Move the omp loop body into the new loop body - if (ompOp->getRegion(0).getBlocks().size() == 1) { - auto &block = *ompOp->getRegion(0).getBlocks().begin(); - rewriter.mergeBlocks(&block, nestBody, loopIndArgs); - - // Find the new loop block terminator and move it before the end of the - // block - for (auto &loopBodyOp : nestBody->getOperations()) { - if (auto resultOp = mlir::dyn_cast(loopBodyOp)) { - rewriter.moveOpBefore(resultOp.getOperation(), &nestBody->back()); - break; - } - } - - // Remove omp.yield at the end of the loop body - if (auto yieldOp = - mlir::dyn_cast(nestBody->back())) { - assert("omp.loop_nests's omp.yield has no operands" && - yieldOp->getNumOperands() == 0); - rewriter.eraseOp(yieldOp); - } - } else { - rewriter.inlineRegionBefore(ompOp->getRegion(0), nestBody); - auto indVarArg = outerLoop->getRegion(0).front().getArgument(0); - // fir::convertDoLoopToCFG expects the induction variable to be of type - // index while the OpenMP LoopNestOp can have indices of different - // types. We need to work around it. - if (indVarArg.getType() != indexType) - indVarArg.setType(indexType); - - // fir.do_loop, unlike omp.loop_nest does not support multi-block - // regions. If we're dealing with multiple blocks inside omp.loop_nest, - // we need to convert it into basic control-flow operations instead. 
- auto loopBlocks = - fir::convertDoLoopToCFG(outerLoop, rewriter, false, false); - auto *conditionalBlock = loopBlocks.first; - auto *firstBlock = - conditionalBlock->getNextNode(); // Start of the loop body - auto *lastBlock = loopBlocks.second; // Incrementing induction variables - - // If the induction variable is used within the loop and was originally - // not of type index, then we need to add a convert to the original type - // and replace its uses inside the loop body. - if (oldIndexType != indexType) { - indVarArg = conditionalBlock->getArgument(0); - builder.setInsertionPointToStart(firstBlock); - auto convertedIndVar = - builder.createConvert(loc, oldIndexType, indVarArg); - rewriter.replaceUsesWithIf( - indVarArg, convertedIndVar, [&](auto &use) -> bool { - return use.getOwner() != convertedIndVar.getDefiningOp() && - use.getOwner()->getBlock() != lastBlock; - }); - } - - // There might be an unused convert and an unused argument to the block. - // If so, remove them. - if (lastBlock->front().getUses().empty()) - lastBlock->front().erase(); - for (auto arg : lastBlock->getArguments()) { - if (arg.getUses().empty()) - lastBlock->eraseArgument(arg.getArgNumber()); - } - - // Any loop blocks that end in omp.yield should just branch to - // lastBlock. 
- for (auto *loopBlock = conditionalBlock; loopBlock != lastBlock; - loopBlock = loopBlock->getNextNode()) { - if (auto yieldOp = - mlir::dyn_cast(loopBlock->back())) { - builder.setInsertionPointToEnd(loopBlock); - mlir::cf::BranchOp::create(builder, loc, lastBlock); - assert("omp.loop_nests's omp.yield has no operands" && - yieldOp->getNumOperands() == 0); - rewriter.eraseOp(yieldOp); - } - } - } - - rewriter.eraseOp(ompOp); - return mlir::success(); - } - if (auto mapInfoOp = mlir::dyn_cast(op)) { mapInfoOp.getResult().replaceAllUsesWith(mapInfoOp.getVarPtr()); rewriter.eraseOp(mapInfoOp); return mlir::success(); } - if (auto atomicReadOp = mlir::dyn_cast(op)) { - builder.setInsertionPoint(op); - auto loadOp = fir::LoadOp::create(builder, loc, atomicReadOp.getX()); - auto storeOp = fir::StoreOp::create(builder, loc, loadOp.getResult(), - atomicReadOp.getV()); - rewriter.replaceOp(op, storeOp); - return mlir::success(); - } - - if (auto atomicWriteOp = mlir::dyn_cast(op)) { - auto storeOp = fir::StoreOp::create(builder, loc, atomicWriteOp.getExpr(), - atomicWriteOp.getX()); - rewriter.replaceOp(op, storeOp); - return mlir::success(); - } - - if (auto atomicUpdateOp = mlir::dyn_cast(op)) { - assert("one block in region" && - atomicUpdateOp.getRegion().getBlocks().size() == 1); - auto &block = *atomicUpdateOp.getRegion().getBlocks().begin(); - builder.setInsertionPointToStart(&block); - - // Load the update `x` operand and replace its uses within the block - auto loadOp = fir::LoadOp::create(builder, loc, atomicUpdateOp.getX()); - rewriter.replaceUsesWithIf( - block.getArgument(0), loadOp.getResult(), - [&](auto &op) { return op.get().getParentBlock() == █ }); - - // Store the result back into `x` in line with omp.yield semantics for - // this op - auto yieldOp = mlir::cast(block.back()); - assert("only one yield operand" && yieldOp->getNumOperands() == 1); - builder.setInsertionPointAfter(yieldOp); - fir::StoreOp::create(builder, loc, yieldOp->getOperand(0), - 
atomicUpdateOp.getX()); - rewriter.eraseOp(yieldOp); - - // Inline the final block and remove the now-empty op - assert("only one block argument" && block.getNumArguments() == 1); - block.eraseArguments(0, block.getNumArguments()); - rewriter.inlineBlockBefore(&block, atomicUpdateOp, {}); - rewriter.eraseOp(op); - return mlir::success(); - } - + // Might be leftover after parse tree rewriting if (auto threadPrivateOp = mlir::dyn_cast(op)) { threadPrivateOp.getTlsAddr().replaceAllUsesWith( threadPrivateOp.getSymAddr()); @@ -291,128 +109,8 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return mlir::success(); } - if (auto cLoopOp = mlir::dyn_cast(op)) { - assert("CanonicalLoopOp has one region" && cLoopOp->getNumRegions() == 1); - auto cli = cLoopOp.getCli(); - auto tripCount = cLoopOp.getTripCount(); - - builder.setInsertionPoint(cLoopOp); - mlir::Type indexType = builder.getIndexType(); - mlir::Type oldIndexType = tripCount.getType(); - auto one = mlir::arith::ConstantIndexOp::create(builder, loc, 1); - auto ub = builder.createConvert(loc, indexType, tripCount); - - llvm::SmallVector loopIndArgs; - auto doLoop = fir::DoLoopOp::create(builder, loc, one, ub, one, false); - builder.setInsertionPointToStart(doLoop.getBody()); - if (oldIndexType != indexType) { - auto convertedIndVar = - builder.createConvert(loc, oldIndexType, doLoop.getInductionVar()); - loopIndArgs.push_back(convertedIndVar); - } else { - loopIndArgs.push_back(doLoop.getInductionVar()); - } - - if (cLoopOp.getRegion().getBlocks().size() == 1) { - auto &block = *cLoopOp.getRegion().getBlocks().begin(); - // DoLoopOp will handle incrementing the induction variable - if (auto addIOp = mlir::dyn_cast(block.front())) { - rewriter.replaceOpUsesWithinBlock(addIOp, addIOp.getLhs(), &block); - rewriter.eraseOp(addIOp); - } - - rewriter.mergeBlocks(&block, doLoop.getBody(), loopIndArgs); - - // Find the new loop block terminator and move it before the end of the - // block - for (auto 
&loopBodyOp : doLoop.getBody()->getOperations()) { - if (auto resultOp = mlir::dyn_cast(loopBodyOp)) { - rewriter.moveOpBefore(resultOp.getOperation(), - &doLoop.getBody()->back()); - break; - } - } - - // Remove omp.terminator at the end of the loop body - if (auto terminatorOp = mlir::dyn_cast( - doLoop.getBody()->back())) { - rewriter.eraseOp(terminatorOp); - } - } else { - rewriter.inlineRegionBefore(cLoopOp->getRegion(0), doLoop.getBody()); - auto indVarArg = doLoop.getBody()->getArgument(0); - // fir::convertDoLoopToCFG expects the induction variable to be of type - // index while the OpenMP CanonicalLoopOp can have indices of different - // types. We need to work around it. - if (indVarArg.getType() != indexType) - indVarArg.setType(indexType); - - // fir.do_loop, unlike omp.canonical_loop does not support multi-block - // regions. If we're dealing with multiple blocks inside omp.loop_nest, - // we need to convert it into basic control-flow operations instead. - auto loopBlocks = - fir::convertDoLoopToCFG(doLoop, rewriter, false, false); - auto *conditionalBlock = loopBlocks.first; - auto *firstBlock = - conditionalBlock->getNextNode(); // Start of the loop body - auto *lastBlock = loopBlocks.second; // Incrementing induction variables - - // Incrementing the induction variable is handled elsewhere - if (auto addIOp = - mlir::dyn_cast(firstBlock->front())) { - rewriter.replaceOpUsesWithinBlock(addIOp, addIOp.getLhs(), - firstBlock); - rewriter.eraseOp(addIOp); - } - - // If the induction variable is used within the loop and was originally - // not of type index, then we need to add a convert to the original type - // and replace its uses inside the loop body. 
- if (oldIndexType != indexType) { - indVarArg = conditionalBlock->getArgument(0); - builder.setInsertionPointToStart(firstBlock); - auto convertedIndVar = - builder.createConvert(loc, oldIndexType, indVarArg); - rewriter.replaceUsesWithIf( - indVarArg, convertedIndVar, [&](auto &use) -> bool { - return use.getOwner() != convertedIndVar.getDefiningOp() && - use.getOwner()->getBlock() != lastBlock; - }); - } - - // There might be an unused convert and an unused argument to the block. - // If so, remove them. - if (lastBlock->front().getUses().empty()) - lastBlock->front().erase(); - for (auto arg : lastBlock->getArguments()) { - if (arg.getUses().empty()) - lastBlock->eraseArgument(arg.getArgNumber()); - } - - // Any loop blocks that end in omp.terminator should just branch to - // lastBlock. - for (auto *loopBlock = conditionalBlock; loopBlock != lastBlock; - loopBlock = loopBlock->getNextNode()) { - if (auto terminatorOp = - mlir::dyn_cast(loopBlock->back())) { - builder.setInsertionPointToEnd(loopBlock); - mlir::cf::BranchOp::create(builder, loc, lastBlock); - rewriter.eraseOp(terminatorOp); - } - } - } - - rewriter.eraseOp(cLoopOp); - // Handle the optional omp.new_cli op - if (cli) { - // cli will be used by omp.unroll_heuristic ops - for (auto *user : cli.getUsers()) - rewriter.eraseOp(user); - rewriter.eraseOp(cli.getDefiningOp()); - } - - return mlir::success(); - } + fir::FirOpBuilder builder(rewriter, op); + mlir::Location loc = op->getLoc(); auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool { if (!ompOp) return false; @@ -472,23 +170,14 @@ class SimdOnlyConversionPattern : public mlir::RewritePattern { return true; }; + // Remove ops that will be surrounding simd once a composite simd construct + // goes through the codegen stage. All of the other ones should have already + // been removed in the parse tree rewriting stage. 
if (inlineSimpleOp(mlir::dyn_cast(op)) || inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op)) || - inlineSimpleOp(mlir::dyn_cast(op))) + inlineSimpleOp(mlir::dyn_cast(op)) || + inlineSimpleOp(mlir::dyn_cast(op))) return mlir::success(); op->emitOpError("left unhandled after SimdOnly pass."); @@ -515,7 +204,7 @@ class SimdOnlyPass : public flangomp::impl::SimdOnlyPassBase { if (mlir::failed( mlir::applyPatternsGreedily(module, std::move(patterns), config))) { - mlir::emitError(module.getLoc(), "error in simd-only conversion pass"); + mlir::emitError(module.getLoc(), "Error in SimdOnly conversion pass"); signalPassFailure(); } } diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index 4eeb1b9ed3c1e..a1a127fa22121 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -41,11 +41,23 @@ class RewriteMutator { void Post(parser::Name &); bool Pre(parser::MainProgram &); + bool Pre(parser::Module &); bool Pre(parser::FunctionSubprogram &); bool Pre(parser::SubroutineSubprogram &); bool Pre(parser::SeparateModuleSubprogram &); bool Pre(parser::BlockConstruct &); + bool Pre(parser::Block &); + bool Pre(parser::DoConstruct &); + bool Pre(parser::IfConstruct &); bool Pre(parser::ActionStmt &); + void Post(parser::MainProgram &); + void Post(parser::FunctionSubprogram &); + void Post(parser::SubroutineSubprogram &); + void 
Post(parser::SeparateModuleSubprogram &); + void Post(parser::BlockConstruct &); + void Post(parser::Block &); + void Post(parser::DoConstruct &); + void Post(parser::IfConstruct &); void Post(parser::ReadStmt &); void Post(parser::WriteStmt &); @@ -67,8 +79,15 @@ class RewriteMutator { bool Pre(parser::EndSubroutineStmt &) { return false; } bool Pre(parser::EndTypeStmt &) { return false; } + bool Pre(parser::OpenMPBlockConstruct &); + bool Pre(parser::OpenMPLoopConstruct &); + void Post(parser::OpenMPBlockConstruct &); + void Post(parser::OpenMPLoopConstruct &); + private: void FixMisparsedStmtFuncs(parser::SpecificationPart &, parser::Block &); + void OpenMPSimdOnly(parser::Block &, bool); + void OpenMPSimdOnly(parser::SpecificationPart &); SemanticsContext &context_; bool errorOnUnresolvedName_{true}; @@ -96,6 +115,154 @@ static bool ReturnsDataPointer(const Symbol &symbol) { return false; } +static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) { + auto &begin = std::get(ompLoop->t); + auto directive = std::get(begin.t).v; + if (directive == llvm::omp::OMPD_simd || + directive == llvm::omp::OMPD_do_simd || + directive == llvm::omp::OMPD_target_simd || + directive == llvm::omp::OMPD_taskloop_simd || + directive == llvm::omp::OMPD_distribute_simd || + directive == llvm::omp::OMPD_teams_distribute_simd || + directive == llvm::omp::OMPD_teams_distribute_parallel_do_simd || + directive == llvm::omp::OMPD_target_teams_distribute_simd || + directive == llvm::omp::OMPD_target_teams_distribute_parallel_do_simd || + directive == llvm::omp::OMPD_parallel_do_simd) { + return true; + } + return false; +} + +// Remove non-SIMD OpenMPConstructs once they are parsed. +// This massively simplifies the logic inside the SimdOnlyPass for +// -fopenmp-simd. 
+void RewriteMutator::OpenMPSimdOnly(parser::SpecificationPart &specPart) { + auto &list{std::get>(specPart.t)}; + for (auto it{list.begin()}; it != list.end();) { + if (auto *specConstr{std::get_if(&it->u)}) { + if (auto *ompDecl{std::get_if< + common::Indirection>( + &specConstr->u)}) { + if (std::holds_alternative( + ompDecl->value().u) || + std::holds_alternative( + ompDecl->value().u)) { + it = list.erase(it); + continue; + } + } + } + ++it; + } +} + +// Remove non-SIMD OpenMPConstructs once they are parsed. +// This massively simplifies the logic inside the SimdOnlyPass for +// -fopenmp-simd. `isNonSimdLoopBody` should be set to true if `block` is the +// body of a non-simd OpenMP loop. This is to indicate that scan constructs +// should be removed from the body, where they would be kept if it were a simd +// loop. +void RewriteMutator::OpenMPSimdOnly( + parser::Block &block, bool isNonSimdLoopBody = false) { + using ExecutionListIterator = + std::_List_iterator; + auto replaceInlineBlock = + [&](std::list &block, + ExecutionListIterator it) -> ExecutionListIterator { + auto insertPos = std::next(it); + block.splice(insertPos, block); + block.erase(it); + return insertPos; + }; + + for (auto it{block.begin()}; it != block.end();) { + if (auto *stmt{std::get_if(&it->u)}) { + if (auto *omp{std::get_if>( + &stmt->u)}) { + if (auto *ompStandalone{std::get_if( + &omp->value().u)}) { + if (std::holds_alternative( + ompStandalone->u) || + std::holds_alternative( + ompStandalone->u) || + std::holds_alternative( + ompStandalone->u)) { + it = block.erase(it); + continue; + } + if (auto *constr{std::get_if( + &ompStandalone->u)}) { + auto directive = constr->v.DirId(); + // Scan should only be removed from non-simd loops + if (isNonSimdLoopBody && directive == llvm::omp::OMPD_scan) { + it = block.erase(it); + continue; + } + if (directive == llvm::omp::OMPD_taskyield || + directive == llvm::omp::OMPD_barrier || + directive == llvm::omp::OMPD_ordered || + directive == 
llvm::omp::OMPD_target_enter_data || + directive == llvm::omp::OMPD_target_exit_data || + directive == llvm::omp::OMPD_target_update || + directive == llvm::omp::OMPD_taskwait) { + it = block.erase(it); + continue; + } + } + } else if (auto *ompBlock{std::get_if( + &omp->value().u)}) { + it = replaceInlineBlock(std::get(ompBlock->t), it); + continue; + } else if (auto *ompLoop{std::get_if( + &omp->value().u)}) { + if (LoopConstructIsSIMD(ompLoop)) { + ++it; + continue; + } + auto &nest = + std::get>(ompLoop->t); + + if (auto *doConstruct = + std::get_if(&nest.value())) { + auto &loopBody = std::get(doConstruct->t); + // We can only remove some constructs from a loop when it's _not_ a + // OpenMP simd loop + OpenMPSimdOnly(loopBody, /*isNonSimdLoopBody=*/true); + auto newLoop = parser::ExecutionPartConstruct{ + parser::ExecutableConstruct{std::move(*doConstruct)}}; + it = block.erase(it); + block.insert(it, std::move(newLoop)); + continue; + } + } else if (auto *ompCon{std::get_if( + &omp->value().u)}) { + auto §ions = + std::get>(ompCon->t); + auto insertPos = std::next(it); + for (auto §ionCon : sections) { + auto §ion = + std::get(sectionCon.u); + auto &innerBlock = std::get(section.t); + block.splice(insertPos, innerBlock); + } + block.erase(it); + it = insertPos; + continue; + } else if (auto *atomic{std::get_if( + &omp->value().u)}) { + it = replaceInlineBlock(std::get(atomic->t), it); + continue; + } else if (auto *critical{std::get_if( + &omp->value().u)}) { + it = replaceInlineBlock(std::get(critical->t), it); + continue; + } + } + } + ++it; + } +} + // Finds misparsed statement functions in a specification part, rewrites // them into array element assignment statements, and moves them into the // beginning of the corresponding (execution part's) block. 
@@ -133,33 +300,153 @@ void RewriteMutator::FixMisparsedStmtFuncs( bool RewriteMutator::Pre(parser::MainProgram &program) { FixMisparsedStmtFuncs(std::get(program.t), std::get(program.t).v); + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(program.t).v); + OpenMPSimdOnly(std::get(program.t)); + } + return true; +} + +void RewriteMutator::Post(parser::MainProgram &program) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(program.t).v); + } +} + +bool RewriteMutator::Pre(parser::Module &module) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(module.t)); + } return true; } bool RewriteMutator::Pre(parser::FunctionSubprogram &func) { FixMisparsedStmtFuncs(std::get(func.t), std::get(func.t).v); + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(func.t).v); + } return true; } +void RewriteMutator::Post(parser::FunctionSubprogram &func) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(func.t).v); + } +} + bool RewriteMutator::Pre(parser::SubroutineSubprogram &subr) { FixMisparsedStmtFuncs(std::get(subr.t), std::get(subr.t).v); + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(subr.t).v); + } return true; } +void RewriteMutator::Post(parser::SubroutineSubprogram &subr) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(subr.t).v); + } +} + bool RewriteMutator::Pre(parser::SeparateModuleSubprogram &subp) { FixMisparsedStmtFuncs(std::get(subp.t), std::get(subp.t).v); + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(subp.t).v); + } return true; } +void RewriteMutator::Post(parser::SeparateModuleSubprogram &subp) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(subp.t).v); + } +} + bool RewriteMutator::Pre(parser::BlockConstruct &block) { FixMisparsedStmtFuncs(std::get(block.t).v, std::get(block.t)); + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(block.t)); + } + return 
true; +} + +void RewriteMutator::Post(parser::BlockConstruct &block) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(std::get(block.t)); + } +} + +bool RewriteMutator::Pre(parser::Block &block) { + if (context_.langOptions().OpenMPSimd) { + OpenMPSimdOnly(block); + } + return true; +} + +void RewriteMutator::Post(parser::Block &block) { this->Pre(block); } + +bool RewriteMutator::Pre(parser::OpenMPBlockConstruct &block) { + if (context_.langOptions().OpenMPSimd) { + auto &innerBlock = std::get(block.t); + OpenMPSimdOnly(innerBlock); + } + return true; +} + +void RewriteMutator::Post(parser::OpenMPBlockConstruct &block) { + this->Pre(block); +} + +bool RewriteMutator::Pre(parser::OpenMPLoopConstruct &ompLoop) { + if (context_.langOptions().OpenMPSimd) { + if (LoopConstructIsSIMD(&ompLoop)) { + return true; + } + // If we're looking at a non-simd OpenMP loop, we need to explicitly + // call OpenMPSimdOnly on the nested loop block while indicating where + // the block comes from. 
+ auto &nest = std::get>(ompLoop.t); + if (nest.has_value()) { + auto &doConstruct = std::get(nest.value()); + auto &innerBlock = std::get(doConstruct.t); + OpenMPSimdOnly(innerBlock, /*isNonSimdLoopBody=*/true); + } + } return true; } +void RewriteMutator::Post(parser::OpenMPLoopConstruct &ompLoop) { + this->Pre(ompLoop); +} + +bool RewriteMutator::Pre(parser::DoConstruct &doConstruct) { + if (context_.langOptions().OpenMPSimd) { + auto &innerBlock = std::get(doConstruct.t); + OpenMPSimdOnly(innerBlock); + } + return true; +} + +void RewriteMutator::Post(parser::DoConstruct &doConstruct) { + this->Pre(doConstruct); +} + +bool RewriteMutator::Pre(parser::IfConstruct &ifConstruct) { + if (context_.langOptions().OpenMPSimd) { + auto &innerBlock = std::get(ifConstruct.t); + OpenMPSimdOnly(innerBlock); + } + return true; +} + +void RewriteMutator::Post(parser::IfConstruct &ifConstruct) { + this->Pre(ifConstruct); +} + // Rewrite PRINT NML -> WRITE(*,NML=NML) bool RewriteMutator::Pre(parser::ActionStmt &x) { if (auto *print{std::get_if>(&x.u)}; diff --git a/flang/test/Semantics/OpenMP/simd-only.f90 b/flang/test/Semantics/OpenMP/simd-only.f90 new file mode 100644 index 0000000000000..da42b10d73bed --- /dev/null +++ b/flang/test/Semantics/OpenMP/simd-only.f90 @@ -0,0 +1,416 @@ +! RUN: %flang_fc1 -fopenmp-simd -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s + +! Test that non-SIMD OpenMPConstructs are removed on the parse tree level +! when -fopenmp-simd is specified. +! Tests the logic in lib/Semantics/rewrite-parse-tree.cpp + +! CHECK-LABEL: Name = 'test_simd' +subroutine test_simd() + integer :: i + + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp simd + do i = 1, 100 + end do +end subroutine + +! CHECK-LABEL: Name = 'test_do_simd' +subroutine test_do_simd() + integer :: i + + ! 
CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK: OmpLoopDirective -> llvm::omp::Directive = do simd + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp do simd + do i = 1, 100 + end do +end subroutine + + +! CHECK-LABEL: Name = 'test_parallel_do_simd' +subroutine test_parallel_do_simd() + integer :: i + + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK: OmpLoopDirective -> llvm::omp::Directive = parallel do simd + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp parallel do simd + do i = 1, 100 + end do +end subroutine + +! CHECK-LABEL: Name = 'test_simd_scan' +subroutine test_simd_scan() + integer :: i + real :: sum + + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd + !$omp simd reduction(inscan,+:sum) + do i = 1, N + sum = sum + a(i) + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = scan + !$omp scan inclusive(sum) + sum = sum + a(i) + end do + +end subroutine + +! CHECK-LABEL: Name = 'test_simd_atomic' +subroutine test_simd_atomic() + integer :: i, x + + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp simd + do i = 1, 100 + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=i' + !$omp atomic write + x = i + end do +end subroutine + +! 
CHECK-LABEL: Name = 'test_do' +subroutine test_do() + integer :: i + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = do + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp parallel do + do i = 1, 100 + end do +end subroutine + +! CHECK-LABEL: Name = 'test_do_nested' +subroutine test_do_nested() + integer :: i + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp parallel do + do i = 1, 100 + do j = 1, 100 + end do + end do +end subroutine + +! CHECK-LABEL: Name = 'test_target' +subroutine test_target() + integer :: i + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp target + do i = 1, 100 + end do + !$omp end target +end subroutine + +! CHECK-LABEL: Name = 'test_target_teams_distribute' +subroutine test_target_teams_distribute() + integer :: i + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp target teams distribute + do i = 1, 100 + end do + !$omp end target teams distribute +end subroutine + + +! CHECK-LABEL: Name = 'test_target_data' +subroutine test_target_data() + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target data + ! 
CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp target data map(to: A) map(tofrom: B) + do i = 1, 100 + end do + !$omp end target data +end subroutine + +! CHECK-LABEL: Name = 'test_loop' +subroutine test_loop() + integer :: i + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = loop + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp loop bind(thread) + do i = 1, 100 + end do +end subroutine + +! CHECK-LABEL: Name = 'test_unroll' +subroutine test_unroll() + integer :: i + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = unroll + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp unroll + do i = 1, 100 + end do +end subroutine + +! CHECK-LABEL: Name = 'test_do_ordered' +subroutine test_do_ordered() + integer :: i, x + x = 0 + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = do + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp do ordered + do i = 1, 100 + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = ordered + !$omp ordered + x = x + 1 + !$omp end ordered + end do +end subroutine + +! CHECK-LABEL: Name = 'test_cancel' +subroutine test_cancel() + integer :: i, x + x = 0 + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp parallel do + do i = 1, 100 + if (i == 10) then + ! 
CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPCancelConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = cancel + !$omp cancel do + end if + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPCancellationPointConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = cancellation point + !$omp cancellation point do + end do +end subroutine + +! CHECK-LABEL: Name = 'test_scan' +subroutine test_scan() + integer :: i, sum + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + !$omp parallel do reduction(inscan, +: sum) + do i = 1, n + sum = sum + i + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = scan + !$omp scan inclusive(sum) + end do + !$omp end parallel do +end subroutine + +! CHECK-LABEL: Name = 'test_target_map' +subroutine test_target_map() + integer :: array(10) + + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target + !$omp target map(tofrom: array(2:10)) + array(2) = array(2) * 2 + !$omp end target +end subroutine + +! CHECK-LABEL: Name = 'test_sections' +subroutine test_sections() + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPSectionsConstruct + !$omp sections + ! CHECK-NOT: OpenMPConstruct -> OpenMPSectionConstruct + !$omp section + ! 
CHECK-NOT: OpenMPConstruct -> OpenMPSectionConstruct + !$omp section + !$omp end sections +end subroutine + +! CHECK-LABEL: Name = 'test_threadprivate_mod' +module test_threadprivate_mod + implicit none + ! CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt + ! CHECK: Name = 'x' + ! CHECK: Name = 'y' + integer :: x, y + ! CHECK: DeclarationConstruct -> SpecificationConstruct -> OtherSpecificationStmt -> CommonStmt + ! CHECK: Name = 'x' + ! CHECK: Name = 'y' + common /vars/ x, y + ! CHECK-NOT: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPThreadprivate + !$omp threadprivate(/vars/) +end module + +! CHECK-LABEL: Name = 'test_atomic' +subroutine test_atomic() + real :: z, x, y + !$omp parallel private(tid, z) + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y' + !$omp atomic write + x = y + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'z=x' + !$omp atomic read + z = x + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1._4' + !$omp atomic update + x = x + 1 + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'z=x' + !$omp atomic read + z = x + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct + ! 
CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+y' + !$omp atomic capture + x = x + y + !$omp end atomic + !$omp end parallel +end subroutine + +! CHECK-LABEL: Name = 'test_task_single_taskwait' +subroutine test_task_single_taskwait() + integer :: x + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel + !$omp parallel + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = single + !$omp single + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + do i = 1, 5 + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = task + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=i' + !$omp task + x = i + !$omp end task + end do + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = taskwait + !$omp taskwait + !$omp end single + !$omp end parallel +end subroutine + +! CHECK-LABEL: Name = 'test_task_taskyield_flush_barrier' +subroutine test_task_taskyield_flush_barrier() + integer :: x, i + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel + !$omp parallel + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = barrier + !$omp barrier + ! 
CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = single + !$omp single + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = task + !$omp task + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = taskyield + !$omp taskyield + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=i' + x = i + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPFlushConstruct -> OmpDirectiveSpecification + !$omp flush + !$omp end task + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = task + !$omp task + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPFlushConstruct -> OmpDirectiveSpecification + !$omp flush + !$omp end task + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = taskwait + !$omp taskwait + !$omp end single + !$omp end parallel +end subroutine + +! CHECK-LABEL: Name = 'test_master_masked' +subroutine test_master_masked() + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel + !$omp parallel private(tid) + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! 
CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = masked + !$omp masked + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y' + x = y + !$omp end masked + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = master + !$omp master + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'y=x' + y = x + !$omp end master + !$omp end parallel +end subroutine + +! CHECK-LABEL: Name = 'test_critical' +subroutine test_critical() + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel + !$omp parallel do private(i) + do i = 1, 4 + !$omp critical(mylock) + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y' + x = y + !$omp end critical(mylock) + end do + !$omp end parallel do +end subroutine + +! CHECK-LABEL: Name = 'test_target_enter_exit_update_data' +subroutine test_target_enter_exit_update_data() + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target enter data + !$omp target enter data map(to: A) + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target teams distribute parallel do + !$omp target teams distribute parallel do + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + do i = 1, n + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y' + x = y + end do + !$omp end target teams distribute parallel do + ! 
CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target update + !$omp target update from(A) + ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target exit data + !$omp target exit data map(from: A) +end subroutine + +! CHECK-LABEL: Name = 'test_declare_mapper' +module test_declare_mapper + implicit none + + type :: myvec_t + integer :: len + real, allocatable :: data(:) + end type myvec_t + + ! CHECK-NOT: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareMapperConstruct + !$omp declare mapper(myvec_t :: v) map(v, v%data(1:v%len)) +end module diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir index d768823565677..0025d10fbd21a 100644 --- a/flang/test/Transforms/OpenMP/simd-only.mlir +++ b/flang/test/Transforms/OpenMP/simd-only.mlir @@ -1,6 +1,7 @@ // RUN: fir-opt --split-input-file --verify-diagnostics --omp-simd-only %s | FileCheck %s // Check that simd operations are not removed and rewritten, but all the other OpenMP ops are. 
+// Tests the logic in flang/lib/Optimizer/OpenMP/SimdOnly.cpp // CHECK: omp.private // CHECK-LABEL: func.func @simd @@ -67,8 +68,6 @@ func.func @parallel(%arg0: i32, %arg1: !fir.ref) { %18:2 = fir.do_loop %arg4 = %15 to %16 step %c1 iter_args(%arg2 = %arg0) -> (index, i32) { // CHECK: fir.store fir.store %arg0 to %arg1 : !fir.ref - // CHECK-NOT: omp.barrier - omp.barrier fir.result %arg4, %arg2 : index, i32 } // CHECK-NOT: omp.terminator @@ -79,67 +78,6 @@ func.func @parallel(%arg0: i32, %arg1: !fir.ref) { // ----- -// CHECK-LABEL: func.func @do -func.func @do(%arg5: i32, %arg6: !fir.ref) { - // CHECK: %[[C1:.*]] = arith.constant 1 : index - %c1_i32 = arith.constant 1 : i32 - // CHECK: %[[C100:.*]] = fir.convert %c100_i32 : (i32) -> index - %c100_i32 = arith.constant 100 : i32 - // CHECK-NOT: omp.wsloop - omp.wsloop { - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop %[[IVAR:.*]] = %[[C1]] to %[[C100]] step %[[C1]] - omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) { - // CHECK: fir.store - fir.store %arg5 to %arg6 : !fir.ref - // CHECK-NOT: omp.yield - omp.yield - } - } - return -} - -// ----- - -// CHECK-LABEL: func.func @do_nested -func.func @do_nested(%arg5: i32, %arg6: !fir.ref) { - // CHECK: %[[C1:.*]] = arith.constant 1 : index - %c1_i32 = arith.constant 1 : i32 - %c100_i32 = arith.constant 100 : i32 - %c200_i32 = arith.constant 200 : i32 - // CHECK-NOT: omp.wsloop - omp.wsloop { - // CHECK: %[[C200:.*]] = fir.convert %c200_i32 : (i32) -> index - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop %[[IVAR_1:.*]] = %[[C1]] to %[[C200]] step %[[C1]] - // CHECK: %[[C100:.*]] = fir.convert %c100_i32 : (i32) -> index - // CHECK: fir.do_loop %[[IVAR_2:.*]] = %[[C1]] to %[[C100]] step %[[C1]] - omp.loop_nest (%arg2, %arg3) : i32 = (%c1_i32, %c1_i32) to (%c200_i32, %c100_i32) inclusive step (%c1_i32, %c1_i32) { - // CHECK: fir.store - fir.store %arg5 to %arg6 : !fir.ref - // CHECK-NOT: omp.yield - omp.yield - } - } - return -} - 
-// ----- - -// CHECK-LABEL: func.func @single -func.func @single(%arg0: i32, %arg1: !fir.ref) { - // CHECK-NOT: omp.single - omp.single { - // CHECK: fir.store - fir.store %arg0 to %arg1 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - // CHECK-LABEL: func.func @target_map( // CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref func.func @target_map(%arg5: i32, %arg6: !fir.ref) { @@ -159,26 +97,6 @@ func.func @target_map(%arg5: i32, %arg6: !fir.ref) { // ----- -// CHECK-LABEL: func.func @task( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -omp.private {type = private} @_QFEi_private_i32 : i32 -func.func @task(%arg5: i32, %arg6: !fir.ref) { - // CHECK-NOT: omp.task - omp.task private(@_QFEi_private_i32 %arg6 -> %arg2 : !fir.ref) { - // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] - fir.store %arg5 to %arg2 : !fir.ref - // CHECK-NOT: omp.flush - omp.flush - // CHECK-NOT: omp.taskyield - omp.taskyield - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - // CHECK-LABEL: func.func @teams func.func @teams(%arg0: i32, %arg1: !fir.ref) { // CHECK-NOT: omp.teams @@ -193,25 +111,25 @@ func.func @teams(%arg0: i32, %arg1: !fir.ref) { // ----- -// CHECK-LABEL: func.func @distribute -func.func @distribute(%arg0: i32, %arg1: i32, %arg2: !fir.ref) { +// CHECK-LABEL: func.func @distribute_simd +func.func @distribute_simd(%arg0: i32, %arg1: !fir.ref) { %c1_i32 = arith.constant 1 : i32 - // CHECK-NOT: omp.teams - omp.teams { - // CHECK-NOT: omp.distribute - omp.distribute { - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop - omp.loop_nest (%arg5) : i32 = (%arg0) to (%arg1) inclusive step (%c1_i32) { + %c100000_i32 = arith.constant 100000 : i32 + // CHECK-NOT: omp.distribute + omp.distribute { + // CHECK: omp.simd + omp.simd { + // CHECK: omp.loop_nest + omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) { // CHECK: fir.store - fir.store %arg0 to %arg2 : !fir.ref - // 
CHECK-NOT: omp.yield + fir.store %arg0 to %arg1 : !fir.ref + // CHECK: omp.yield omp.yield } - } - // CHECK-NOT: omp.terminator - omp.terminator - } + // CHECK-NOT: {omp.composite} + } {omp.composite} + // CHECK-NOT: {omp.composite} + } {omp.composite} return } @@ -229,328 +147,6 @@ func.func @threadprivate(%arg0: i32, %arg1: !fir.ref) { // ----- -// CHECK-LABEL: func.func @taskloop( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -func.func @taskloop(%funcArg0: i32, %funcArg1: !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c10_i32 = arith.constant 10 : i32 - // CHECK-NOT: omp.taskloop - omp.taskloop grainsize(%c2_i32: i32) { - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop - omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) { - // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: omp.yield - omp.yield - } - } - return -} - -// ----- - -// CHECK-LABEL: func.func @target_update_enter_data_map_info( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -func.func @target_update_enter_data_map_info(%funcArg0: i32, %funcArg1: !fir.ref) { - %c1 = arith.constant 1 : index - // CHECK-NOT: omp.map.bounds - %1 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index) - // CHECK-NOT: omp.map.info - %13 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(to) capture(ByRef) bounds(%1) -> !fir.ref - // CHECK-NOT: omp.target_enter_data - omp.target_enter_data map_entries(%13 : !fir.ref) - // CHECK-NOT: omp.target - omp.target map_entries(%13 -> %arg3 : !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - // CHECK: fir.store %c1_i32 to %[[ARG_1]] - fir.store %c1_i32 to %arg3 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: omp.map.info - %18 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(from) capture(ByRef) 
bounds(%1) -> !fir.ref - // CHECK-NOT: omp.target_update - omp.target_update map_entries(%18 : !fir.ref) - // CHECK-NOT: omp.target_exit_data - omp.target_exit_data map_entries(%18 : !fir.ref) - return -} - -// ----- - -// CHECK-LABEL: func.func @target_data( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -func.func @target_data(%funcArg0: i32, %funcArg1: !fir.ref) { - %c1 = arith.constant 1 : index - // CHECK-NOT: omp.map.bounds - %3 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index) - // CHECK-NOT: omp.map.info - %4 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) bounds(%3) -> !fir.ref - // CHECK-NOT: omp.target_data - omp.target_data map_entries(%4 : !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - // CHECK: fir.store %c1_i32 to %[[ARG_1]] - fir.store %c1_i32 to %4 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-LABEL: func.func @sections( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref -func.func @sections(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { - // CHECK-NOT: omp.sections - omp.sections { - // CHECK-NOT: omp.section - omp.section { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: omp.section - omp.section { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg2 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-NOT: omp.declare_reduction -omp.declare_reduction @add_reduction_i32 : i32 init { -^bb0(%arg0: i32): - %c0_i32 = arith.constant 0 : i32 - omp.yield(%c0_i32 : i32) -} combiner { -^bb0(%arg0: i32, %arg1: i32): - %0 = arith.addi %arg0, %arg1 : i32 - omp.yield(%0 : i32) -} -// CHECK-LABEL: func.func @reduction_scan( -// CHECK-SAME: 
%[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -func.func @reduction_scan(%funcArg0: i32, %funcArg1: !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - // CHECK-NOT: omp.wsloop - omp.wsloop reduction(mod: inscan, @add_reduction_i32 %funcArg1 -> %arg3 : !fir.ref) { - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop - omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c8_i32) inclusive step (%c1_i32) { - // CHECK: fir.declare %[[ARG_1]] - %1 = fir.declare %arg3 {uniq_name = "a"} : (!fir.ref) -> !fir.ref - // CHECK-NOT: omp.scan - omp.scan inclusive(%1 : !fir.ref) - // CHECK: fir.store - fir.store %funcArg0 to %1 : !fir.ref - // CHECK-NOT: omp.yield - omp.yield - } - } - return -} - -// ----- - -// CHECK-LABEL: func.func @ordered( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -func.func @ordered(%funcArg0: i32, %funcArg1: !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - %c10_i32 = arith.constant 10 : i32 - // CHECK-NOT: omp.parallel - omp.parallel { - // CHECK-NOT: omp.wsloop - omp.wsloop ordered(0) { - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop - omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) { - // CHECK-NOT: omp.ordered.region - omp.ordered.region { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: omp.yield - omp.yield - } - } - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-LABEL: func.func @master( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref -func.func @master(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { - // CHECK-NOT: omp.parallel - omp.parallel { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: omp.master - omp.master { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg2 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: 
omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-LABEL: func.func @masked( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref -func.func @masked(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { - // CHECK-NOT: omp.parallel - omp.parallel { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: omp.masked - omp.masked { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg2 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-LABEL: func.func @critical( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref -omp.critical.declare @mylock -func.func @critical(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref) { - // CHECK-NOT: omp.parallel - omp.parallel { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: omp.critical - omp.critical(@mylock) { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg2 : !fir.ref - // CHECK-NOT: omp.terminator - omp.terminator - } - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-LABEL: func.func @cancel( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref, %[[ARG_3:.*]]: i1 -func.func @cancel(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref, %funcArg3: i1) { - %c1_i32 = arith.constant 1 : i32 - %c10_i32 = arith.constant 10 : i32 - // CHECK-NOT: omp.parallel - omp.parallel { - // CHECK-NOT: omp.wsloop - omp.wsloop { - // CHECK-NOT: omp.loop_nest - // CHECK: fir.do_loop - omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) { - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK-NOT: fir.if - fir.if %funcArg3 { - // CHECK-NOT: omp.cancel - omp.cancel cancellation_construct_type(loop) - } - // CHECK-NOT: omp.cancellation_point - 
omp.cancellation_point cancellation_construct_type(loop) - // CHECK: fir.store - fir.store %funcArg0 to %funcArg2 : !fir.ref - // CHECK-NOT: omp.yield - omp.yield - } - } - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - -// CHECK-LABEL: func.func @atomic( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_2:.*]]: !fir.ref, %[[ARG_3:.*]]: i32 -func.func @atomic(%funcArg0: i32, %funcArg1: !fir.ref, %funcArg2: !fir.ref, %funcArg3: i32) { - %c1_i32 = arith.constant 1 : i32 - %5 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} - // CHECK: %[[VAL_0:.*]] = fir.declare - %6 = fir.declare %5 {uniq_name = "_QFEx"} : (!fir.ref) -> !fir.ref - // CHECK-NOT: omp.parallel - omp.parallel { - // CHECK-NOT: omp.atomic.write - // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]] - omp.atomic.write %funcArg1 = %funcArg0 : !fir.ref, i32 - // CHECK-NOT: omp.atomic.read - // CHECK: %[[VAL_1:.*]] = fir.load %[[ARG_1]] - // CHECK-NEXT: fir.store %[[VAL_1]] to %[[ARG_2]] - omp.atomic.read %funcArg2 = %funcArg1 : !fir.ref, !fir.ref, i32 - // CHECK-NOT: omp.atomic.update - // CHECK: fir.load %[[VAL_0]] - // CHECK-NEXT: %[[ADD_VAL:.*]] = arith.addi - // CHECK-NOT: omp.yield - // CHECK-NEXT: fir.store %[[ADD_VAL]] to %[[VAL_0]] - omp.atomic.update %6 : !fir.ref { - ^bb0(%arg3: i32): - %88 = arith.addi %arg3, %c1_i32 : i32 - omp.yield(%88 : i32) - } - // CHECK-NOT: omp.atomic.read - // CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] - // CHECK-NEXT: fir.store %[[VAL_2]] to %[[ARG_1]] - omp.atomic.read %funcArg1 = %6 : !fir.ref, !fir.ref, i32 - // CHECK-NOT: omp.atomic.capture - omp.atomic.capture { - // CHECK-NOT: omp.atomic.read - // CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] - // CHECK-NEXT: fir.store %[[VAL_3]] to %[[ARG_2]] - omp.atomic.read %funcArg2 = %6 : !fir.ref, !fir.ref, i32 - // CHECK-NOT: omp.atomic.update - // CHECK: fir.load %[[VAL_0]] - // CHECK-NEXT: %[[ADD_VAL_2:.*]] = arith.addi - // CHECK-NOT: omp.yield - // CHECK-NEXT: fir.store 
%[[ADD_VAL_2]] to %[[VAL_0]] - omp.atomic.update %6 : !fir.ref { - ^bb0(%arg3: i32): - %88 = arith.addi %arg3, %c1_i32 : i32 - omp.yield(%88 : i32) - } - } - // CHECK-NOT: omp.terminator - omp.terminator - } - return -} - -// ----- - // CHECK-LABEL: func.func @multi_block( // CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_3:.*]]: i1 func.func @multi_block(%funcArg0: i32, %funcArg1: !fir.ref, %6: i1) { @@ -578,97 +174,23 @@ func.func @multi_block(%funcArg0: i32, %funcArg1: !fir.ref, %6: i1) { // ----- -// CHECK-LABEL: func.func @do_multi_block( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref, %[[ARG_3:.*]]: i1 -func.func @do_multi_block(%funcArg0: i32, %funcArg1: !fir.ref, %6: i1) { - %false = arith.constant false - %c1_i32 = arith.constant 1 : i32 - %c100_i32 = arith.constant 100 : i32 - // CHECK-NOT: omp.wsloop - omp.wsloop { - // CHECK-NOT: omp.loop_nest - // CHECK: cf.br ^[[CBB:.*]]( - // CHECK: ^[[CBB]] - // CHECK: %[[CMP_VAL:.*]] = arith.cmpi - // CHECK: cf.cond_br %[[CMP_VAL]], ^[[FBB:.*]], ^[[LBB:.*]] - omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) { - // CHECK: ^[[FBB]] - // CHECK: fir.store - fir.store %funcArg0 to %funcArg1 : !fir.ref - // CHECK: cf.br ^[[BBB:.*]] - cf.br ^bb1 - // CHECK: ^[[BBB]] - ^bb1: // pred: ^bb0 - // CHECK: fir.store - fir.store %c1_i32 to %funcArg1 : !fir.ref - // CHECK: cf.cond_br - cf.cond_br %6, ^bb2, ^bb3 - // CHECK: ^[[SBB:.*]] - ^bb2: // pred: ^bb1 - // CHECK: fir.call - fir.call @_FortranAStopStatement(%c1_i32, %false, %false) fastmath : (i32, i1, i1) -> () - // CHECK-NOT: omp.yield - omp.yield - // CHECK: cf.br ^[[LBB:.*]] - // CHECK: ^[[OBB:.*]] - // CHECK: cf.br ^[[LBB]] - // CHECK: ^[[LBB]] - // CHECK: arith.subi - // CHECK: cf.br ^[[CBB]] - // CHECK: ^[[EBB:.*]] - ^bb3: // pred: ^bb1 - // CHECK-NOT: omp.yield - omp.yield - } - } - return -} - -// ----- - -// CHECK-LABEL: func.func @simd_nested_atomic( -// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: 
!fir.ref, %[[ARG_2:.*]]: !fir.ref -func.func @simd_nested_atomic(%arg0: i32, %arg1: !fir.ref, %arg2: !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - %c100000_i32 = arith.constant 100000 : i32 - // CHECK: omp.simd - omp.simd { - // CHECK: omp.loop_nest - omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) { - // CHECK-NOT: omp.atomic.write - // CHECK: fir.store %[[ARG_0]] to %[[ARG_2]] - omp.atomic.write %arg2 = %arg0 : !fir.ref, i32 - // CHECK: omp.yield - omp.yield - } - } - return -} - -// ----- - -// CHECK-LABEL: func.func @unroll( +// CHECK-LABEL: func.func @map_info( // CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref -func.func @unroll(%arg0: i32, %arg1: !fir.ref) { - %c1_i32 = arith.constant 1 : i32 - // CHECK: %[[RANGE_i32:.*]] = arith.constant 16 : i32 - %c16_i32 = arith.constant 16 : i32 - // CHECK: %[[C1_IDX:.*]] = arith.constant 1 : index - // CHECK: %[[RANGE:.*]] = fir.convert %[[RANGE_i32]] - // CHECK-NOT: omp.new_cli - %canonloop_s0 = omp.new_cli - // CHECK-NOT: omp.canonical_loop - // CHECK: fir.do_loop %[[IVAR:.*]] = %[[C1_IDX]] to %[[RANGE]] step %[[C1_IDX]] - omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%c16_i32) { - // CHECK: %[[IVAR_CVT:.*]] = fir.convert %[[IVAR]] : (index) -> i32 - // CHECK-NOT: arith.addi - %3 = arith.addi %iv, %c1_i32 : i32 - // CHECK: fir.store %[[IVAR_CVT]] to %[[ARG_1]] - fir.store %3 to %arg1 : !fir.ref +func.func @map_info(%funcArg0: i32, %funcArg1: !fir.ref) { + %c1 = arith.constant 1 : index + // CHECK-NOT: omp.map.bounds + %1 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index) + // CHECK-NOT: omp.map.info + %13 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(to) capture(ByRef) bounds(%1) -> !fir.ref + // CHECK-NOT: omp.target + omp.target map_entries(%13 -> %arg3 : !fir.ref) { + %c1_i32 = arith.constant 1 : i32 + // CHECK: fir.store %c1_i32 to %[[ARG_1]] + fir.store %c1_i32 to 
%arg3 : !fir.ref // CHECK-NOT: omp.terminator omp.terminator } - // CHECK-NOT: omp.unroll_heuristic - omp.unroll_heuristic(%canonloop_s0) + // CHECK-NOT: omp.map.info + %18 = omp.map.info var_ptr(%funcArg1 : !fir.ref, i32) map_clauses(from) capture(ByRef) bounds(%1) -> !fir.ref return } From 0e7c0afc09f2ada3f4b45f0b39e37279f6041c8a Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Mon, 11 Aug 2025 13:59:24 +0000 Subject: [PATCH 6/9] Cleanup no longer needed changes --- .../flang/Optimizer/Transforms/Utils.h | 6 - flang/lib/Optimizer/OpenMP/SimdOnly.cpp | 1 - flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 - .../Transforms/ControlFlowConverter.cpp | 99 +++++++++++++- flang/lib/Optimizer/Transforms/Utils.cpp | 121 ------------------ 5 files changed, 97 insertions(+), 131 deletions(-) delete mode 100644 flang/lib/Optimizer/Transforms/Utils.cpp diff --git a/flang/include/flang/Optimizer/Transforms/Utils.h b/flang/include/flang/Optimizer/Transforms/Utils.h index 116a4eefdc794..49a616fb40fd5 100644 --- a/flang/include/flang/Optimizer/Transforms/Utils.h +++ b/flang/include/flang/Optimizer/Transforms/Utils.h @@ -13,8 +13,6 @@ #ifndef FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H #define FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H -#include "flang/Optimizer/Dialect/FIROps.h" - namespace fir { using MinlocBodyOpGeneratorTy = llvm::function_ref -convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, bool setNSW, - bool forceLoopToExecuteOnce); - } // namespace fir #endif // FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H diff --git a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp index c9d9dbe03dac2..7d332faf9b299 100644 --- a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp +++ b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/FIRBuilder.h" -#include "flang/Optimizer/Transforms/Utils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include 
"mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 302776a14b9f1..a8812e08c1ccd 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -35,7 +35,6 @@ add_flang_library(FIRTransforms GenRuntimeCallsForTest.cpp SimplifyFIROperations.cpp OptimizeArrayRepacking.cpp - Utils.cpp DEPENDS CUFAttrs diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp index cdb194f5a68c9..e466aed753e63 100644 --- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp +++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp @@ -14,7 +14,6 @@ #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Optimizer/Support/TypeCode.h" #include "flang/Optimizer/Transforms/Passes.h" -#include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/derived-api.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" @@ -51,7 +50,103 @@ class CfgLoopConv : public mlir::OpRewritePattern { llvm::LogicalResult matchAndRewrite(DoLoopOp loop, mlir::PatternRewriter &rewriter) const override { - convertDoLoopToCFG(loop, rewriter, setNSW, forceLoopToExecuteOnce); + auto loc = loop.getLoc(); + mlir::arith::IntegerOverflowFlags flags{}; + if (setNSW) + flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw); + auto iofAttr = mlir::arith::IntegerOverflowFlagsAttr::get( + rewriter.getContext(), flags); + + // Create the start and end blocks that will wrap the DoLoopOp with an + // initalizer and an end point + auto *initBlock = rewriter.getInsertionBlock(); + auto initPos = rewriter.getInsertionPoint(); + auto *endBlock = rewriter.splitBlock(initBlock, initPos); + + // Split the first DoLoopOp block in two parts. 
The part before will be the + // conditional block since it already has the induction variable and + // loop-carried values as arguments. + auto *conditionalBlock = &loop.getRegion().front(); + conditionalBlock->addArgument(rewriter.getIndexType(), loc); + auto *firstBlock = + rewriter.splitBlock(conditionalBlock, conditionalBlock->begin()); + auto *lastBlock = &loop.getRegion().back(); + + // Move the blocks from the DoLoopOp between initBlock and endBlock + rewriter.inlineRegionBefore(loop.getRegion(), endBlock); + + // Get loop values from the DoLoopOp + auto low = loop.getLowerBound(); + auto high = loop.getUpperBound(); + assert(low && high && "must be a Value"); + auto step = loop.getStep(); + + // Initialization block + rewriter.setInsertionPointToEnd(initBlock); + auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); + auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); + mlir::Value iters = + mlir::arith::DivSIOp::create(rewriter, loc, distance, step); + + if (forceLoopToExecuteOnce) { + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto cond = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sle, iters, zero); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); + } + + llvm::SmallVector loopOperands; + loopOperands.push_back(low); + auto operands = loop.getIterOperands(); + loopOperands.append(operands.begin(), operands.end()); + loopOperands.push_back(iters); + + mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); + + // Last loop block + auto *terminator = lastBlock->getTerminator(); + rewriter.setInsertionPointToEnd(lastBlock); + auto iv = conditionalBlock->getArgument(0); + mlir::Value steppedIndex = + mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); + assert(steppedIndex && "must be a Value"); + auto lastArg = conditionalBlock->getNumArguments() - 1; 
+ auto itersLeft = conditionalBlock->getArgument(lastArg); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + mlir::Value itersMinusOne = + mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); + + llvm::SmallVector loopCarried; + loopCarried.push_back(steppedIndex); + auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin()) + : terminator->operand_begin(); + loopCarried.append(begin, terminator->operand_end()); + loopCarried.push_back(itersMinusOne); + auto backEdge = mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, + loopCarried); + rewriter.eraseOp(terminator); + + // Copy loop annotations from the do loop to the loop back edge. + if (auto ann = loop.getLoopAnnotation()) + backEdge->setAttr("loop_annotation", *ann); + + // Conditional block + rewriter.setInsertionPointToEnd(conditionalBlock); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto comparison = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero); + + mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, + llvm::ArrayRef(), endBlock, + llvm::ArrayRef()); + + // The result of the loop operation is the values of the condition block + // arguments except the induction variable on the last iteration. + auto args = loop.getFinalValue() + ? conditionalBlock->getArguments() + : conditionalBlock->getArguments().drop_front(); + rewriter.replaceOp(loop, args.drop_back()); return success(); } diff --git a/flang/lib/Optimizer/Transforms/Utils.cpp b/flang/lib/Optimizer/Transforms/Utils.cpp deleted file mode 100644 index 2b2a2159e2501..0000000000000 --- a/flang/lib/Optimizer/Transforms/Utils.cpp +++ /dev/null @@ -1,121 +0,0 @@ -//===-- Utils.cpp ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ -// -//===----------------------------------------------------------------------===// - -#include "flang/Optimizer/Transforms/Utils.h" -#include "flang/Optimizer/Dialect/FIRType.h" -#include "flang/Optimizer/Dialect/Support/FIRContext.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" - -/// Convert fir::DoLoopOp to control-flow operations -std::pair -fir::convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, - bool setNSW, bool forceLoopToExecuteOnce) { - auto loc = loop.getLoc(); - mlir::arith::IntegerOverflowFlags flags{}; - if (setNSW) - flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw); - auto iofAttr = - mlir::arith::IntegerOverflowFlagsAttr::get(rewriter.getContext(), flags); - - // Create the start and end blocks that will wrap the DoLoopOp with an - // initalizer and an end point - auto *initBlock = rewriter.getInsertionBlock(); - auto initPos = rewriter.getInsertionPoint(); - auto *endBlock = rewriter.splitBlock(initBlock, initPos); - - // Split the first DoLoopOp block in two parts. The part before will be the - // conditional block since it already has the induction variable and - // loop-carried values as arguments. 
- auto *conditionalBlock = &loop.getRegion().front(); - conditionalBlock->addArgument(rewriter.getIndexType(), loc); - auto *firstBlock = - rewriter.splitBlock(conditionalBlock, conditionalBlock->begin()); - auto *lastBlock = &loop.getRegion().back(); - - // Move the blocks from the DoLoopOp between initBlock and endBlock - rewriter.inlineRegionBefore(loop.getRegion(), endBlock); - - // Get loop values from the DoLoopOp - auto low = loop.getLowerBound(); - auto high = loop.getUpperBound(); - assert(low && high && "must be a Value"); - auto step = loop.getStep(); - - // Initalization block - rewriter.setInsertionPointToEnd(initBlock); - auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); - auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); - mlir::Value iters = - mlir::arith::DivSIOp::create(rewriter, loc, distance, step); - - if (forceLoopToExecuteOnce) { - auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); - auto cond = mlir::arith::CmpIOp::create( - rewriter, loc, mlir::arith::CmpIPredicate::sle, iters, zero); - auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); - } - - llvm::SmallVector loopOperands; - loopOperands.push_back(low); - auto operands = loop.getIterOperands(); - loopOperands.append(operands.begin(), operands.end()); - loopOperands.push_back(iters); - - mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); - - // Last loop block - auto *terminator = lastBlock->getTerminator(); - rewriter.setInsertionPointToEnd(lastBlock); - auto iv = conditionalBlock->getArgument(0); - mlir::Value steppedIndex = - mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); - assert(steppedIndex && "must be a Value"); - auto lastArg = conditionalBlock->getNumArguments() - 1; - auto itersLeft = conditionalBlock->getArgument(lastArg); - auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - 
mlir::Value itersMinusOne = - mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); - - llvm::SmallVector loopCarried; - loopCarried.push_back(steppedIndex); - auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin()) - : terminator->operand_begin(); - loopCarried.append(begin, terminator->operand_end()); - loopCarried.push_back(itersMinusOne); - auto backEdge = - mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopCarried); - rewriter.eraseOp(terminator); - - // Copy loop annotations from the do loop to the loop back edge. - if (auto ann = loop.getLoopAnnotation()) - backEdge->setAttr("loop_annotation", *ann); - - // Conditional block - rewriter.setInsertionPointToEnd(conditionalBlock); - auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); - auto comparison = mlir::arith::CmpIOp::create( - rewriter, loc, mlir::arith::CmpIPredicate::sgt, itersLeft, zero); - - mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, - llvm::ArrayRef(), endBlock, - llvm::ArrayRef()); - - // The result of the loop operation is the values of the condition block - // arguments except the induction variable on the last iteration. - auto args = loop.getFinalValue() - ? 
conditionalBlock->getArguments() - : conditionalBlock->getArguments().drop_front(); - rewriter.replaceOp(loop, args.drop_back()); - - return std::make_pair(conditionalBlock, lastBlock); -} From de02e17756535586e5999b5a08f3997d5da19742 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Mon, 11 Aug 2025 15:54:09 +0000 Subject: [PATCH 7/9] Fixes to make Windows happy --- flang/lib/Semantics/rewrite-parse-tree.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index a1a127fa22121..14d2a49d9fc13 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -164,11 +164,8 @@ void RewriteMutator::OpenMPSimdOnly(parser::SpecificationPart &specPart) { // loop. void RewriteMutator::OpenMPSimdOnly( parser::Block &block, bool isNonSimdLoopBody = false) { - using ExecutionListIterator = - std::_List_iterator; auto replaceInlineBlock = - [&](std::list &block, - ExecutionListIterator it) -> ExecutionListIterator { + [&](std::list &block, auto it) -> auto { auto insertPos = std::next(it); block.splice(insertPos, block); block.erase(it); @@ -410,9 +407,11 @@ bool RewriteMutator::Pre(parser::OpenMPLoopConstruct &ompLoop) { // call OpenMPSimdOnly on the nested loop block while indicating where // the block comes from. 
auto &nest = std::get>(ompLoop.t); - if (nest.has_value()) { - auto &doConstruct = std::get(nest.value()); - auto &innerBlock = std::get(doConstruct.t); + if (!nest.has_value()) { + return true; + } + if (auto *doConstruct = std::get_if(&*nest)) { + auto &innerBlock = std::get(doConstruct->t); OpenMPSimdOnly(innerBlock, /*isNonSimdLoopBody=*/true); } } From 7df131677a0d91c96cbfed314e010b643f5df848 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Tue, 12 Aug 2025 12:51:32 +0000 Subject: [PATCH 8/9] Use OpenMP directive sets --- .../flang/Semantics/openmp-directive-sets.h | 16 ++++++++++ flang/lib/Semantics/rewrite-parse-tree.cpp | 31 ++++--------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h index dd610c9702c28..cc66cc833e8b7 100644 --- a/flang/include/flang/Semantics/openmp-directive-sets.h +++ b/flang/include/flang/Semantics/openmp-directive-sets.h @@ -401,6 +401,22 @@ static const OmpDirectiveSet nestedWorkshareErrSet{ Directive::OMPD_taskloop, } | workShareSet, }; + +//===----------------------------------------------------------------------===// +// Misc directive sets +//===----------------------------------------------------------------------===// + +// Simple standalone directives that can be erased by -fopenmp-simd. 
+static const OmpDirectiveSet simpleStandaloneNonSimdOnlySet{ + Directive::OMPD_taskyield, + Directive::OMPD_barrier, + Directive::OMPD_ordered, + Directive::OMPD_target_enter_data, + Directive::OMPD_target_exit_data, + Directive::OMPD_target_update, + Directive::OMPD_taskwait, +}; + } // namespace llvm::omp #endif // FORTRAN_SEMANTICS_OPENMP_DIRECTIVE_SETS_H_ diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index 14d2a49d9fc13..d33b193b3d489 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -12,6 +12,7 @@ #include "flang/Parser/parse-tree-visitor.h" #include "flang/Parser/parse-tree.h" #include "flang/Parser/tools.h" +#include "flang/Semantics/openmp-directive-sets.h" #include "flang/Semantics/scope.h" #include "flang/Semantics/semantics.h" #include "flang/Semantics/symbol.h" @@ -118,19 +119,7 @@ static bool ReturnsDataPointer(const Symbol &symbol) { static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) { auto &begin = std::get(ompLoop->t); auto directive = std::get(begin.t).v; - if (directive == llvm::omp::OMPD_simd || - directive == llvm::omp::OMPD_do_simd || - directive == llvm::omp::OMPD_target_simd || - directive == llvm::omp::OMPD_taskloop_simd || - directive == llvm::omp::OMPD_distribute_simd || - directive == llvm::omp::OMPD_teams_distribute_simd || - directive == llvm::omp::OMPD_teams_distribute_parallel_do_simd || - directive == llvm::omp::OMPD_target_teams_distribute_simd || - directive == llvm::omp::OMPD_target_teams_distribute_parallel_do_simd || - directive == llvm::omp::OMPD_parallel_do_simd) { - return true; - } - return false; + return llvm::omp::allSimdSet.test(directive); } // Remove non-SIMD OpenMPConstructs once they are parsed. 
@@ -191,17 +180,8 @@ void RewriteMutator::OpenMPSimdOnly( &ompStandalone->u)}) { auto directive = constr->v.DirId(); // Scan should only be removed from non-simd loops - if (isNonSimdLoopBody && directive == llvm::omp::OMPD_scan) { - it = block.erase(it); - continue; - } - if (directive == llvm::omp::OMPD_taskyield || - directive == llvm::omp::OMPD_barrier || - directive == llvm::omp::OMPD_ordered || - directive == llvm::omp::OMPD_target_enter_data || - directive == llvm::omp::OMPD_target_exit_data || - directive == llvm::omp::OMPD_target_update || - directive == llvm::omp::OMPD_taskwait) { + if (llvm::omp::simpleStandaloneNonSimdOnlySet.test(directive) || + (isNonSimdLoopBody && directive == llvm::omp::OMPD_scan)) { it = block.erase(it); continue; } @@ -225,8 +205,9 @@ void RewriteMutator::OpenMPSimdOnly( // We can only remove some constructs from a loop when it's _not_ a // OpenMP simd loop OpenMPSimdOnly(loopBody, /*isNonSimdLoopBody=*/true); + auto newDoConstruct = std::move(*doConstruct); auto newLoop = parser::ExecutionPartConstruct{ - parser::ExecutableConstruct{std::move(*doConstruct)}}; + parser::ExecutableConstruct{std::move(newDoConstruct)}}; it = block.erase(it); block.insert(it, std::move(newLoop)); continue; From 5a2a9f0cc5e4356f1cd81d7bdd3a95101ab16e3c Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Wed, 13 Aug 2025 17:12:22 +0000 Subject: [PATCH 9/9] Fix overlapping lambda argument name --- flang/lib/Semantics/rewrite-parse-tree.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index d33b193b3d489..b3019762ead1f 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -154,9 +154,10 @@ void RewriteMutator::OpenMPSimdOnly(parser::SpecificationPart &specPart) { void RewriteMutator::OpenMPSimdOnly( parser::Block &block, bool isNonSimdLoopBody = false) { auto replaceInlineBlock = - 
[&](std::list &block, auto it) -> auto { + [&](std::list &innerBlock, + auto it) -> auto { auto insertPos = std::next(it); - block.splice(insertPos, block); + block.splice(insertPos, innerBlock); block.erase(it); return insertPos; };