intel
diff --git a/‎include/triton/Dialect/Gluon/Transforms/Passes.td‎
Lines changed: 10 additions & 0 deletions b/‎include/triton/Dialect/Gluon/Transforms/Passes.td‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h‎
Lines changed: 5 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 1 addition & 2 deletions b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td‎
Lines changed: 19 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎lib/Analysis/Membar.cpp‎
Lines changed: 3 additions & 3 deletions b/‎lib/Analysis/Membar.cpp‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp‎
Lines changed: 21 additions & 0 deletions b/‎lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎lib/Dialect/Gluon/Transforms/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎lib/Dialect/Gluon/Transforms/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/Dialect/Gluon/Transforms/Inline.cpp‎
Lines changed: 1 addition & 1 deletion b/‎lib/Dialect/Gluon/Transforms/Inline.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp‎
Lines changed: 49 additions & 0 deletions b/‎lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 4 additions & 5 deletions b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 4 additions & 5 deletions
@@ -35,4 +35,14 @@ def GluonInline: Pass<"gluon-inline"> {
   let dependentDialects = [];
 }
 
+def GluonSimplifyControlFlow: Pass<"gluon-slimplify-control-flow"> {
+  let summary = "simplifications for control flow ops";
+
+  let description = [{
+    The `GluonSimplifyControlFlow` pass applies a reduced set of simplification
+    and canonicalization patterns for control flow ops to the module.
+  }];
+  let dependentDialects = [];
+}
+
 #endif
@@ -135,6 +135,11 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<unsigned> tilesPerWarp,
                                          ArrayRef<unsigned> warpsPerCTA);
 
+LinearLayout chooseScaledWmmaScaleLayout(
+    MLIRContext *ctx, int dotOperandIdx,
+    const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
+    ArrayRef<int64_t> dotOperandShape);
+
 LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                           ArrayRef<int64_t> dotOperandShape,
                                           ArrayRef<unsigned> tilesPerWarp,
 
@@ -1307,8 +1307,7 @@ Row |
   let hasCustomAssemblyFormat = 1;
 
   let extraClassDeclaration = extraDistributedDeclaration # [{
-    SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape,
-                                          Type elemType, int opIdx) const;
+    SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape, int kDim, int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
 
     static SmallVector<unsigned, 3> getDefaultInstrShape() {
 
@@ -574,4 +574,23 @@ def TTG_WarpReturnOp : TTG_Op<"warp_return", [
   let assemblyFormat = "attr-dict";
 }
 
+def TTG_LocalBarrierOp : TTG_Op<"local_barrier"> {
+  let summary = "Synchronizes execution and shared memory reads/writes for all threads in a CTA.";
+  let description = [{
+    The `local_barrier` op synchronizes the execution and all operations
+    between shared memory and registers for all threads in a CTA.
+    It is used to coordinate communication between the threads of the CTA.
+
+    This operation waits until all threads in the CTA have reached a `local_barrier`
+    and operations between shared memory and registers made by these threads prior
+    to the op are visible to all threads in the CTA.
+
+    Data hazards between threads accessing the same memory can be avoided by synchronizing the
+    CTA in-between these accesses with a `local_barrier`.
+
+    A `local_barrier` operation does not provide synchronization guarantees on global memory.
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
 #endif // TRITONGPU_OPS
@@ -159,20 +159,20 @@ void MembarOrFenceAnalysis::visitTerminator(
 
 void MembarAnalysis::insertBarrier(Operation *op, OpBuilder *builder) {
+  // Create a `triton::gpu::LocalBarrierOp` at `builder`'s current insertion
+  // point, using `op`'s location. The created op is not needed afterwards, so
+  // it is not bound to a local (avoids an unused-variable warning).
   OpBuilder::InsertionGuard g(*builder);
-  auto barrierOp = builder->create<gpu::BarrierOp>(op->getLoc());
+  builder->create<triton::gpu::LocalBarrierOp>(op->getLoc());
 }
 
 void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
                             FuncBlockInfoMapT *funcBlockInfoMap,
                             OpBuilder *builder) {
-  if (isa<gpu::BarrierOp>(op)) {
+  if (isa<gpu::BarrierOp, triton::gpu::LocalBarrierOp>(op)) {
     // If the current op is a barrier, we sync previous reads and writes
     blockInfo->sync();
     return;
   }
 
   if (isa<triton::gpu::AsyncWaitOp, triton::nvidia_gpu::TMAStoreWaitOp>(op) &&
-      !isa<gpu::BarrierOp>(op->getNextNode())) {
+      !isa<gpu::BarrierOp, triton::gpu::LocalBarrierOp>(op->getNextNode())) {
     // If the current op is an async wait and the next op is not a barrier we
     // insert a barrier op and sync
     builder->setInsertionPointAfter(op);
 
@@ -1,5 +1,6 @@
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/PatternMatch.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"
@@ -232,6 +233,25 @@ struct LocalStoreOpConversion
   const TargetInfoBase &targetInfo;
 };
 
+/// Conversion pattern that replaces every `triton::gpu::LocalBarrierOp` with
+/// an `mlir::gpu::BarrierOp` created at the same location.
+struct LocalBarrierOpConversion
+    : public ConvertOpToLLVMPattern<triton::gpu::LocalBarrierOp> {
+  using OpAdaptor = triton::gpu::LocalBarrierOp::Adaptor;
+
+  /// Forwards the type converter and pattern benefit to the base pattern.
+  LocalBarrierOpConversion(const LLVMTypeConverter &converter,
+                           PatternBenefit benefit)
+      : ConvertOpToLLVMPattern<triton::gpu::LocalBarrierOp>(converter,
+                                                            benefit) {}
+
+  /// Always matches: the barrier is swapped one-for-one and the adaptor is
+  /// not consulted.
+  LogicalResult
+  matchAndRewrite(triton::gpu::LocalBarrierOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<mlir::gpu::BarrierOp>(op);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::triton::populateMemoryOpToLLVMPatterns(
@@ -243,4 +263,5 @@ void mlir::triton::populateMemoryOpToLLVMPatterns(
   patterns.add<LocalDeallocOpConversion>(typeConverter, benefit);
   patterns.add<LocalLoadOpConversion>(typeConverter, targetInfo, benefit);
   patterns.add<LocalStoreOpConversion>(typeConverter, targetInfo, benefit);
+  patterns.add<LocalBarrierOpConversion>(typeConverter, benefit);
 }
@@ -2,6 +2,7 @@ add_triton_library(GluonTransforms
   Canonicalize.cpp
   Inline.cpp
   ResolveAutoEncodings.cpp
+  SimplifyControlFlow.cpp
 
   DEPENDS
   GluonTransformsIncGen
 
@@ -22,7 +22,7 @@ struct Inline : public gluon::impl::GluonInlineBase<Inline> {
 void Inline::runOnOperation() {
   mlir::PassManager pm(&getContext());
   pm.addPass(createInlinerPass(/*opPipelines=*/{}, [](OpPassManager &pm) {
-    pm.addPass(gluon::createGluonCanonicalize());
+    pm.addPass(gluon::createGluonSimplifyControlFlow());
   }));
   if (failed(pm.run(getOperation())))
     return signalPassFailure();
 
@@ -0,0 +1,49 @@
+#include "mlir/IR/OperationSupport.h"
+#include "triton/Dialect/Gluon/Transforms/Passes.h"
+
+#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace triton;
+
+namespace mlir::triton::gluon {
+#define GEN_PASS_DEF_GLUONSIMPLIFYCONTROLFLOW
+#include "triton/Dialect/Gluon/Transforms/Passes.h.inc"
+} // namespace mlir::triton::gluon
+
+namespace {
+/// Pass that greedily applies the `scf` and `cf` canonicalization patterns,
+/// plus the patterns added by `populateForOpDeadArgumentElimination`, to the
+/// operation it runs on.
+struct SimplifyControlFlow
+    : public gluon::impl::GluonSimplifyControlFlowBase<SimplifyControlFlow> {
+  void runOnOperation() override;
+};
+} // namespace
+
+void SimplifyControlFlow::runOnOperation() {
+  MLIRContext *ctx = &getContext();
+  RewritePatternSet patterns(ctx);
+
+  // Populate `scf` and `cf` canonicalizers. `getLoadedDialect` returns null
+  // when the dialect has not been loaded into the context, so guard each
+  // lookup instead of dereferencing it unconditionally.
+  if (auto *scfDialect = ctx->getLoadedDialect<scf::SCFDialect>())
+    scfDialect->getCanonicalizationPatterns(patterns);
+  if (auto *cfDialect = ctx->getLoadedDialect<cf::ControlFlowDialect>())
+    cfDialect->getCanonicalizationPatterns(patterns);
+
+  // Collect the per-op canonicalization patterns of every operation that is
+  // registered under the given dialect namespace.
+  auto addOpCanonicalizers = [&](StringRef dialectNamespace) {
+    for (mlir::RegisteredOperationName op :
+         ctx->getRegisteredOperationsByDialect(dialectNamespace))
+      op.getCanonicalizationPatterns(patterns, ctx);
+  };
+  addOpCanonicalizers(scf::SCFDialect::getDialectNamespace());
+  addOpCanonicalizers(cf::ControlFlowDialect::getDialectNamespace());
+  populateForOpDeadArgumentElimination(patterns);
+
+  GreedyRewriteConfig config;
+  // This is intended to run before AutoLayouts are resolved, in which case
+  // CSEing constants can lead to additional layout conflicts.
+  config.enableConstantCSE(false);
+  // The driver's result is deliberately discarded, as in the original code;
+  // presumably a non-converging rewrite should not fail the pass (confirm).
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+}
@@ -2300,12 +2300,11 @@ AMDWmmaEncodingAttr::getRepOrderForOperand(int opIdx) const {
 }
 
 SmallVector<int64_t>
-AMDWmmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> operandShape,
-                                      Type elemType, int opIdx) const {
+AMDWmmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> operandShape, int kDim,
+                                      int opIdx) const {
   auto mnkDim = getInstrShape();
-  auto operandTileShape = opIdx == 0
-                              ? SmallVector<int64_t>{mnkDim[0], mnkDim[2]}
-                              : SmallVector<int64_t>{mnkDim[2], mnkDim[1]};
+  SmallVector<int64_t, 2> operandTileShape{opIdx == 0 ? mnkDim[0] : kDim,
+                                           opIdx == 0 ? kDim : mnkDim[1]};
 
   assert(operandTileShape.size() == 2);
   auto warpsPerCTA = getWarpsPerCTA();