diff --git a/include/triton/Dialect/Gluon/Transforms/Passes.td b/include/triton/Dialect/Gluon/Transforms/Passes.td index a9e7019944..bf227c2a22 100644 --- a/include/triton/Dialect/Gluon/Transforms/Passes.td +++ b/include/triton/Dialect/Gluon/Transforms/Passes.td @@ -35,4 +35,14 @@ def GluonInline: Pass<"gluon-inline"> { let dependentDialects = []; } +def GluonSimplifyControlFlow: Pass<"gluon-simplify-control-flow"> { + let summary = "simplifications for control flow ops"; + + let description = [{ + The `gluon-simplify-control-flow` pass applies a reduced set of simplification + and canonicalization patterns for control flow ops to the module. + }]; + let dependentDialects = []; +} + #endif diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index 0cfb42b451..6528955ea0 100644 --- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -135,6 +135,11 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx, ArrayRef tilesPerWarp, ArrayRef warpsPerCTA); +LinearLayout chooseScaledWmmaScaleLayout( + MLIRContext *ctx, int dotOperandIdx, + const std::vector> &dotOperandWarpBasis, + ArrayRef dotOperandShape); + LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx, ArrayRef dotOperandShape, ArrayRef tilesPerWarp, diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index 7d4ad81c2e..aec8f77153 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -1307,8 +1307,7 @@ Row | let hasCustomAssemblyFormat = 1; let extraClassDeclaration = extraDistributedDeclaration # [{ - SmallVector getRepForOperand(ArrayRef operandShape, - Type elemType, int opIdx) const; + SmallVector getRepForOperand(ArrayRef operandShape, int kDim, int opIdx) const; SmallVector getRepOrderForOperand(int opIdx) const; static SmallVector getDefaultInstrShape() { diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td index ee0924a2f1..2fbd4a55c8 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td @@ -574,4 +574,23 @@ def TTG_WarpReturnOp : TTG_Op<"warp_return", [ let assemblyFormat = "attr-dict"; } +def TTG_LocalBarrierOp : TTG_Op<"local_barrier"> { + let summary = "Synchronizes execution and shared memory reads/writes for all threads in a CTA."; + let description = [{ + The `local_barrier` op synchronizes execution and all operations + between shared memory and registers for all threads in a CTA. + It is used to coordinate communication between the threads of the CTA. + + This operation waits until all threads in the CTA have reached a `local_barrier` + and operations between shared memory and registers made by these threads prior + to the op are visible to all threads in the CTA. + + Data hazards between threads accessing the same memory can be avoided by synchronizing the + CTA in between these accesses with a `local_barrier`. + + A `local_barrier` operation does not provide synchronization guarantees on global memory.
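+
+    For example, placing a `local_barrier` between a `local_store` and a later
+    `local_load` of the same shared memory buffer guarantees that the stored
+    values are visible to every thread in the CTA that performs the load.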
+ }]; + let assemblyFormat = "attr-dict"; +} + #endif // TRITONGPU_OPS diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp index 94a4e0b7ff..41250dbeaa 100644 --- a/lib/Analysis/Membar.cpp +++ b/lib/Analysis/Membar.cpp @@ -159,20 +159,20 @@ void MembarOrFenceAnalysis::visitTerminator( void MembarAnalysis::insertBarrier(Operation *op, OpBuilder *builder) { OpBuilder::InsertionGuard g(*builder); - auto barrierOp = builder->create(op->getLoc()); + auto barrierOp = builder->create(op->getLoc()); } void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo, FuncBlockInfoMapT *funcBlockInfoMap, OpBuilder *builder) { - if (isa(op)) { + if (isa(op)) { // If the current op is a barrier, we sync previous reads and writes blockInfo->sync(); return; } if (isa(op) && - !isa(op->getNextNode())) { + !isa(op->getNextNode())) { // If the current op is an async wait and the next op is not a barrier we // insert a barrier op and sync builder->setInsertionPointAfter(op); diff --git a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp index 2c47118344..77e3a80dba 100644 --- a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp @@ -1,5 +1,6 @@ #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/IR/PatternMatch.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h" @@ -232,6 +233,25 @@ struct LocalStoreOpConversion const TargetInfoBase &targetInfo; }; +class LocalBarrierOpConversion + : public ConvertOpToLLVMPattern { +public: + LocalBarrierOpConversion(const LLVMTypeConverter &converter, + PatternBenefit benefit) + : ConvertOpToLLVMPattern(converter, + benefit) {} + using OpAdaptor = typename triton::gpu::LocalBarrierOp::Adaptor; + + LogicalResult + matchAndRewrite(triton::gpu::LocalBarrierOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.replaceOpWithNewOp(op); + + return success(); + } +}; + } // namespace void mlir::triton::populateMemoryOpToLLVMPatterns( @@ -243,4 +263,5 @@ void mlir::triton::populateMemoryOpToLLVMPatterns( patterns.add(typeConverter, benefit); patterns.add(typeConverter, targetInfo, benefit); patterns.add(typeConverter, targetInfo, benefit); + patterns.add(typeConverter, benefit); } diff --git a/lib/Dialect/Gluon/Transforms/CMakeLists.txt b/lib/Dialect/Gluon/Transforms/CMakeLists.txt index 0d8d6ff9f0..81a9cb36b5 100644 --- a/lib/Dialect/Gluon/Transforms/CMakeLists.txt +++ b/lib/Dialect/Gluon/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ add_triton_library(GluonTransforms Canonicalize.cpp Inline.cpp ResolveAutoEncodings.cpp + SimplifyControlFlow.cpp DEPENDS GluonTransformsIncGen diff --git a/lib/Dialect/Gluon/Transforms/Inline.cpp b/lib/Dialect/Gluon/Transforms/Inline.cpp index 329edaee1c..0dd7d26c73 100644 --- a/lib/Dialect/Gluon/Transforms/Inline.cpp +++ b/lib/Dialect/Gluon/Transforms/Inline.cpp @@ -22,7 +22,7 @@ struct Inline : public gluon::impl::GluonInlineBase { void Inline::runOnOperation() { mlir::PassManager pm(&getContext()); pm.addPass(createInlinerPass(/*opPipelines=*/{}, [](OpPassManager &pm) { - pm.addPass(gluon::createGluonCanonicalize()); + pm.addPass(gluon::createGluonSimplifyControlFlow()); })); if (failed(pm.run(getOperation()))) return signalPassFailure(); diff --git a/lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp 
b/lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp new file mode 100644 index 0000000000..c0a6b40f68 --- /dev/null +++ b/lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp @@ -0,0 +1,49 @@ +#include "mlir/IR/OperationSupport.h" +#include "triton/Dialect/Gluon/Transforms/Passes.h" + +#include "triton/Dialect/TritonGPU/Transforms/Utility.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; +using namespace triton; + +namespace mlir::triton::gluon { +#define GEN_PASS_DEF_GLUONSIMPLIFYCONTROLFLOW +#include "triton/Dialect/Gluon/Transforms/Passes.h.inc" +} // namespace mlir::triton::gluon + +namespace { +struct SimplifyControlFlow + : public gluon::impl::GluonSimplifyControlFlowBase { + void runOnOperation() override; +}; +} // namespace + +void SimplifyControlFlow::runOnOperation() { + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(&getContext()); + + // Populate `scf` and `cf` canonicalizers. + ctx->getLoadedDialect()->getCanonicalizationPatterns( + patterns); + ctx->getLoadedDialect()->getCanonicalizationPatterns( + patterns); + for (mlir::RegisteredOperationName op : ctx->getRegisteredOperationsByDialect( + scf::SCFDialect::getDialectNamespace())) + op.getCanonicalizationPatterns(patterns, ctx); + for (mlir::RegisteredOperationName op : ctx->getRegisteredOperationsByDialect( + cf::ControlFlowDialect::getDialectNamespace())) + op.getCanonicalizationPatterns(patterns, ctx); + populateForOpDeadArgumentElimination(patterns); + + GreedyRewriteConfig config; + // This is intended to run before AutoLayouts are resolved, in which case + // CSEing constants can lead to additional layout conflicts. + config.enableConstantCSE(false); + (void)applyPatternsGreedily(getOperation(), std::move(patterns), config); +} diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index afef3d6c61..d449fbe817 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -2300,12 +2300,11 @@ AMDWmmaEncodingAttr::getRepOrderForOperand(int opIdx) const { } SmallVector -AMDWmmaEncodingAttr::getRepForOperand(ArrayRef operandShape, - Type elemType, int opIdx) const { +AMDWmmaEncodingAttr::getRepForOperand(ArrayRef operandShape, int kDim, + int opIdx) const { auto mnkDim = getInstrShape(); - auto operandTileShape = opIdx == 0 - ? SmallVector{mnkDim[0], mnkDim[2]} - : SmallVector{mnkDim[2], mnkDim[1]}; + SmallVector operandTileShape{opIdx == 0 ? mnkDim[0] : kDim, + opIdx == 0 ? 
kDim : mnkDim[1]}; assert(operandTileShape.size() == 2); auto warpsPerCTA = getWarpsPerCTA(); diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 146034669c..1e83c0d307 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -1448,6 +1448,66 @@ LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef shape, return chooseDotDsReadB64TrLayout(dot, shape, elemBitWidth); } +LinearLayout chooseScaledWmmaScaleLayout( + MLIRContext *ctx, int dotOperandIdx, + const std::vector> &dotOperandWarpBasis, + ArrayRef dotOperandShape) { + using basisT = std::vector>; + unsigned rank = dotOperandShape.size(); + auto order = mlir::triton::gpu::getMatrixOrder(rank, /*rowMajor=*/true); + auto standardOutDims = standardOutDimNames(ctx, rank); + StringAttr kRegister = StringAttr::get(ctx, "register"); + StringAttr kLane = StringAttr::get(ctx, "lane"); + StringAttr kWarp = StringAttr::get(ctx, "warp"); + StringAttr kBlock = StringAttr::get(ctx, "block"); + unsigned int scaleKWidth = dotOperandShape[1]; + // Init register layout. Will be adjusted later. + auto regs = + mlir::triton::identityStandardND(kRegister, {1, scaleKWidth}, order); + LinearLayout lanes = LinearLayout::empty(); + // In scaled dot, the shapes of the operands (without batch dimension) are, + // respectively: + // - A: [M, K] + // - B: [K, N] + // - aScale: [M, K / 32 or 16] + // - bScale: [N, K / 32 or 16] + // + // To correctly feed A/B and their scales into the instruction, we need to + // distribute aScale/bScale among warps in the same way as A/B. But bScale + // is not transposed like B. So we need to transpose the warp layout of + // bScale. + // + // The tricky part is that our desired outputs are [dim0, dim1], but + // at this point the layouts are transposed to [dim1, dim0]. So + // instead of reversing bScale's layout, we need to reverse aScale's. There + // will be a transpose at the end to correct everything. + basisT warps = dotOperandWarpBasis; + if (dotOperandIdx == 0) { + for (auto &basis : warps) { + std::reverse(basis.begin(), basis.end()); + } + } + + lanes = LinearLayout({{kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 0}}}, + {kWarp, warps}, + {kBlock, {}}}, + {standardOutDims[order[0]], standardOutDims[order[1]]}); + LinearLayout newLL = regs * lanes; + + // Adjust the register-level layout to fill the shape; at this level, both + // aScale and bScale should align with the A operand.
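+  // The loop below multiplies in identity register bases along each output
+  // dimension (walked in [dim1, dim0] order) until the layout's out-dim sizes
+  // match dotOperandShape; the final transposeOuts then restores the standard
+  // [dim0, dim1] output order expected by callers.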
+ SmallVector repOrder = {1, 0}; + for (auto d : repOrder) { + auto outDim = standardOutDims[d]; + auto dimSize = newLL.getOutDimSize(outDim); + newLL *= LinearLayout::identity1D(dotOperandShape[d] / dimSize, kRegister, + outDim); + } + newLL = newLL.transposeOuts(standardOutDims); + + return newLL; +} + // Warp-level block scaling (sm_120, m16n8k32) // Reference: NVIDIA PTX ISA "Warp-level block scaling" // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling diff --git a/python/test/gluon/test_core.py b/python/test/gluon/test_core.py index 66c9da9bd3..bc534c3372 100644 --- a/python/test/gluon/test_core.py +++ b/python/test/gluon/test_core.py @@ -1119,3 +1119,28 @@ def kernel(a_ptr, b_ptr, c_ptr, out_ptr): out = torch.empty((B, B), dtype=torch.float32, device=device) kernel[(1, )](a, b, c, out) torch.testing.assert_close(out, torch.addmm(c, a, b), atol=1e-2, rtol=1e-2) + + +@gluon.jit +def kernel_auto_layout_constant(threads_per_warp: ttgl.constexpr): + BLOCK: ttgl.constexpr = 16 + SIZE: ttgl.constexpr = 10 + + mask = ttgl.full( + (BLOCK, BLOCK), + True, + ttgl.int1, + ttgl.BlockedLayout( + size_per_thread=[1, 1], + threads_per_warp=[1, threads_per_warp], + warps_per_cta=[1, 4], + order=[1, 0], + ), + ) + + mask &= (ttgl.arange(0, BLOCK, ttgl.AutoLayout()) < SIZE).expand_dims(0) + mask &= (ttgl.arange(0, BLOCK, ttgl.AutoLayout()) < SIZE).expand_dims(1) + + +def test_auto_layout_constant(): + kernel_auto_layout_constant.warmup(THREADS_PER_WARP, grid=(1, )) diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py index edb11f0449..1066e475ce 100644 --- a/python/test/gluon/test_frontend.py +++ b/python/test/gluon/test_frontend.py @@ -30,6 +30,7 @@ HIP_TARGET_RDNA4 = GPUTarget("hip", "gfx1200", 32) HIP_TARGET_CDNA3 = GPUTarget("hip", "gfx942", 64) HIP_TARGET_CDNA4 = GPUTarget("hip", "gfx950", 64) +HIP_TARGET_GFX1250 = GPUTarget("hip", "gfx1250", 32) ALL_TARGETS = [AMPERE_TARGET, HOPPER_TARGET, BLACKWELL_TARGET, HIP_TARGET_RDNA4] @@ -2358,6 +2359,58 @@ def kernel(): """) +@pytest.mark.parametrize("target", [HIP_TARGET_GFX1250]) +def test_amd_wmma_scaled(target): + + @gluon.jit + def kernel(): + wmma_layout: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=3, transposed=True, warps_per_cta=[2, 2], + instr_shape=[16, 16, 128]) + wmma_layout_packed: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=3, transposed=True, warps_per_cta=[2, 2], + instr_shape=[16, 16, 64]) + a_scale_linear_layout: ttgl.constexpr = ttgl.DistributedLinearLayout( + reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], + warp_bases=[[0, 0], [16, 0]], block_bases=[], shape=[32, 4]) + b_scale_linear_layout: ttgl.constexpr = ttgl.DistributedLinearLayout( + reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], + warp_bases=[[16, 0], [0, 0]], block_bases=[], shape=[32, 4]) + + a = ttgl.full([32, 64], 0x11, ttgl.uint8, + ttgl.DotOperandLayout(operand_index=0, parent=wmma_layout_packed, k_width=16)) + b = ttgl.full([64, 32], 0x22, ttgl.uint8, + ttgl.DotOperandLayout(operand_index=1, parent=wmma_layout_packed, k_width=16)) + a_scale = ttgl.full([32, 4], 0x02, ttgl.uint8, a_scale_linear_layout) + b_scale = ttgl.full([32, 4], 0x01, ttgl.uint8, b_scale_linear_layout) + acc = ttgl.full([32, 32], 0, ttgl.float32, wmma_layout) + ttgl.amd.gfx1250.wmma_scaled(a, a_scale, 'e2m1', b, b_scale, 'e2m1', acc) + + module = run_parser(kernel, *make_args(num_warps=4), target=target) + expecttest.assert_expected_inline( + 
anonymize_ir(module.str_nodebug()), """\ +#linear = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[0, 0], [16, 0]], block = []}> +#linear1 = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[16, 0], [0, 0]], block = []}> +#mma = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape = [16, 16, 64]}> +#mma1 = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape = [16, 16, 128]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @kernel() attributes {noinline = false} { + %c17_i8 = arith.constant 17 : i8 + %cst = arith.constant dense<17> : tensor<32x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> + %c34_i8 = arith.constant 34 : i8 + %cst_0 = arith.constant dense<34> : tensor<64x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>> + %c2_i8 = arith.constant 2 : i8 + %cst_1 = arith.constant dense<2> : tensor<32x4xi8, #linear> + %c1_i8 = arith.constant 1 : i8 + %cst_2 = arith.constant dense<1> : tensor<32x4xi8, #linear1> + %cst_3 = arith.constant 0.000000e+00 : f32 + %cst_4 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma1> + %cst_5 = arith.constant 0.000000e+00 : f32 + %0 = tt.dot_scaled %cst scale %cst_1, %cst_0 scale %cst_2, %cst_4 lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<32x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, tensor<32x4xi8, #linear> * tensor<64x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, tensor<32x4xi8, #linear1> -> tensor<32x32xf32, #mma1> + tt.return + } +} +""") + + @gluon.jit def padded_shared_layout_kernel(): shape: ttgl.constexpr = [64, 64] diff --git a/python/triton/_internal_testing.py b/python/triton/_internal_testing.py index 97ed2cda37..ba86218120 100644 --- a/python/triton/_internal_testing.py +++ b/python/triton/_internal_testing.py @@ -84,6 +84,11 @@ def is_hip_gfx12(): return target is not None and target.backend == 'hip' and 'gfx12' in target.arch +def is_hip_gfx1250(): + target = get_current_target() + return target is not None and target.backend == 'hip' and 'gfx1250' in target.arch + + def is_hip_cdna(): return is_hip_cdna2() or is_hip_cdna3() or is_hip_cdna4() diff --git a/python/triton/experimental/gluon/language/amd/_ops.py b/python/triton/experimental/gluon/language/amd/_ops.py index af64cd1442..ab23105772 100644 --- a/python/triton/experimental/gluon/language/amd/_ops.py +++ b/python/triton/experimental/gluon/language/amd/_ops.py @@ -6,21 +6,31 @@ from ._layouts import AMDWMMALayout -def _wmma(version, a, b, acc, semantic): - """ Shared implementation for AMD WMMA operations for Gluon builtins """ - +def _verify_wmma(version, a, b, acc): _check(acc is not None, lambda: "acc is required") + layout = acc.type.layout _check( isinstance(layout, AMDWMMALayout) and layout.version == version, lambda: f"Expected layout to be an instance of AMDWMMALayout with version {version}") + + a_layout = a.type.layout _check( - isinstance(a.type.layout, DotOperandLayout) and a.type.layout.parent == layout, + isinstance(a_layout, DotOperandLayout) and isinstance(a_layout.parent, AMDWMMALayout) + and a_layout.parent.version == version, lambda: "Expected a's layout to be a DotOperandLayout with parent matching AMDWMMALayout") + + b_layout = b.type.layout _check( - isinstance(b.type.layout, DotOperandLayout) and b.type.layout.parent == layout, + 
isinstance(b_layout, DotOperandLayout) and isinstance(b_layout.parent, AMDWMMALayout) + and b_layout.parent.version == version, lambda: "Expected b's layout to be a DotOperandLayout with parent matching AMDWMMALayout") + +def _wmma(version, a, b, acc, semantic): + """ Shared implementation for AMD WMMA operations for Gluon builtins """ + _verify_wmma(version, a, b, acc) + handle = semantic.dot(a, b, acc, input_precision=knobs.language.fp32_default, max_num_imprecise_acc=None, out_dtype=acc.dtype).handle return ttgl.tensor(handle, acc.type) diff --git a/python/triton/experimental/gluon/language/amd/cdna4/__init__.py b/python/triton/experimental/gluon/language/amd/cdna4/__init__.py index db352a4810..edeb3506f5 100644 --- a/python/triton/experimental/gluon/language/amd/cdna4/__init__.py +++ b/python/triton/experimental/gluon/language/amd/cdna4/__init__.py @@ -43,6 +43,8 @@ def mfma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None) assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}" assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}" + assert a_scale is not None and b_scale is not None, "Scales must not be None" + tensor = _semantic.dot_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, False, True, True, float32) ret_ty = ttgl.distributed_type(tensor.dtype, tensor.shape, layout) diff --git a/python/triton/experimental/gluon/language/amd/gfx1250/__init__.py b/python/triton/experimental/gluon/language/amd/gfx1250/__init__.py index ea95c48eb0..dc604bccec 100644 --- a/python/triton/experimental/gluon/language/amd/gfx1250/__init__.py +++ b/python/triton/experimental/gluon/language/amd/gfx1250/__init__.py @@ -1,7 +1,11 @@ from ..._core import builtin -from .._ops import _wmma +from .._ops import _wmma, _verify_wmma +from triton.experimental.gluon.language import _core as ttgl +from triton.experimental.gluon.language._semantic import _check +from ..._layouts import DotOperandLayout +from .._layouts import AMDWMMALayout -__all__ = ["wmma"] +__all__ = ["wmma", "wmma_scaled"] @builtin @@ -15,3 +19,50 @@ def wmma(a, b, acc, _semantic=None): acc (tensor): The accumulator tensor. """ return _wmma(3, a, b, acc, _semantic) + + +@builtin +def wmma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None): + """ + AMD Scaled WMMA operation. + + ``` + c = a * a_scale @ b * b_scale + acc + ``` + + `a` and `b` use microscaling formats described in + "OCP Microscaling Formats (MX) Specification": + https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf. + + Args: + a (tensor): The operand A to be multiplied. + a_scale (tensor): Scale factor for operand A. + a_format (str): Format of the operand A. Available formats: `e2m1'. + b (tensor): The operand B to be multiplied. + b_scale (tensor): Scale factor for operand B. + b_format (str): Format of the operand B. Available formats: `e2m1'. + acc (tensor): Accumulator tensor. 
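+
+    Example (a minimal sketch mirroring the shapes and layouts used in the
+    gfx1250 frontend test; the layouts shown are illustrative, not the only
+    valid configuration, and would normally be written inside a `@gluon.jit`
+    kernel):
+
+        wmma_packed: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=3, transposed=True,
+                                                             warps_per_cta=[2, 2], instr_shape=[16, 16, 64])
+        wmma_acc: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=3, transposed=True,
+                                                          warps_per_cta=[2, 2], instr_shape=[16, 16, 128])
+        a_scale_layout: ttgl.constexpr = ttgl.DistributedLinearLayout(
+            reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]],
+            warp_bases=[[0, 0], [16, 0]], block_bases=[], shape=[32, 4])
+        b_scale_layout: ttgl.constexpr = ttgl.DistributedLinearLayout(
+            reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]],
+            warp_bases=[[16, 0], [0, 0]], block_bases=[], shape=[32, 4])
+        a = ttgl.full([32, 64], 0x11, ttgl.uint8,
+                      ttgl.DotOperandLayout(operand_index=0, parent=wmma_packed, k_width=16))
+        b = ttgl.full([64, 32], 0x22, ttgl.uint8,
+                      ttgl.DotOperandLayout(operand_index=1, parent=wmma_packed, k_width=16))
+        a_scale = ttgl.full([32, 4], 0x02, ttgl.uint8, a_scale_layout)
+        b_scale = ttgl.full([32, 4], 0x01, ttgl.uint8, b_scale_layout)
+        acc = ttgl.full([32, 32], 0, ttgl.float32, wmma_acc)
+        acc = wmma_scaled(a, a_scale, 'e2m1', b, b_scale, 'e2m1', acc)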
+ """ + _verify_wmma(3, a, b, acc) + if a_format.value == "e2m1": + wmma_layout = a.type.layout.parent + assert isinstance(wmma_layout, AMDWMMALayout) and wmma_layout.instr_shape == (16, 16, 64), \ + "e2m1 format expects instr_shape to be (16, 16, 64)" + if b_format.value == "e2m1": + wmma_layout = b.type.layout.parent + assert isinstance(wmma_layout, AMDWMMALayout) and wmma_layout.instr_shape == (16, 16, 64), \ + "e2m1 format expects instr_shape to be (16, 16, 64)" + + acc_layout = acc.type.layout + assert isinstance(acc_layout, AMDWMMALayout) and acc_layout.instr_shape == (16, 16, 128), \ + "accumulator tensor's layout must be (16, 16, 128)" + + # TODO: Add more formats + assert a_format.value in {"e2m1"}, f"Unsupported lhs_format: {a_format.value}" + assert b_format.value in {"e2m1"}, f"Unsupported rhs_format: {b_format.value}" + + assert a_scale is not None and b_scale is not None, "Scales must not be None" + + handle = _semantic.dot_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, fast_math=False, lhs_k_pack=True, + rhs_k_pack=True, out_dtype=acc.dtype).handle + return ttgl.tensor(handle, acc.type) diff --git a/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py b/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py index 93f27b1ea3..8ae7fe98e1 100644 --- a/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py +++ b/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py @@ -23,10 +23,10 @@ "fence_async_shared", "get_tmem_32x32b_reg_layout", "mbarrier", + "mma_v2", "tensor_memory_descriptor", "TensorMemoryLayout", "tma", - "mma_v2", ] diff --git a/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py b/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py index 796fe06177..2855730368 100644 --- a/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py +++ b/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: from triton._C.libtriton import ir -__all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma", "warpgroup_mma_wait", "mma_v2"] +__all__ = ["async_copy", "fence_async_shared", "mbarrier", "mma_v2", "tma", "warpgroup_mma", "warpgroup_mma_wait"] @_core.builtin diff --git a/python/triton_kernels/bench/distributed.py b/python/triton_kernels/bench/distributed.py index a7ae39f77a..67fb6d574d 100644 --- a/python/triton_kernels/bench/distributed.py +++ b/python/triton_kernels/bench/distributed.py @@ -18,12 +18,10 @@ ScatterIndx, compute_expt_data_torch, topk_torch, - prune_routing, routing_from_bitmatrix, ) from triton_kernels.topk import topk from triton_kernels.matmul_ogs import matmul_ogs, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation -from triton_kernels.routing_details._routing_compute import _routing_clear_bitmatrix from triton_kernels.target_info import get_cdna_version, is_hip, is_cuda, cuda_capability_geq from triton_kernels.tensor_details import layout from triton_kernels.tensor import Bitmatrix @@ -291,6 +289,46 @@ def pack_bitmatrix( tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows) +@triton.jit +def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff, BLOCK_N: tl.constexpr): + pid_m = tl.program_id(0) + cutoff_word = cutoff // 32 + cutoff_bit = cutoff % 32 + cutoff_mask = (1 << (cutoff_bit)) - 1 + for start_n in range(0, shape_bn, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + values = tl.load(Bitmatrix + pid_m * stride_bm + offs_n * 
stride_bn, mask=offs_n < shape_bn) + values = tl.where(offs_n == cutoff_word, values & cutoff_mask, values) + values = tl.where(offs_n > cutoff_word, 0, values) + tl.store(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, values, mask=offs_n < shape_bn) + + +class PruneRouting(torch.autograd.Function): + + @staticmethod + def forward(ctx, expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep): + from triton_kernels.compaction import compaction + n_tokens_pad = expt_scal.shape[0] + assert n_expts_tot % simulated_ep == 0 + _routing_clear_bitmatrix[(n_tokens_pad, )]( + bitmatrix.storage.data, + bitmatrix.storage.data.stride(0), + bitmatrix.storage.data.stride(1), + bitmatrix.storage.data.shape[1], + n_expts_tot // simulated_ep, + BLOCK_N=512, + ) + # perform compaction to update expt_scal / expt_indx + expt_scal, expt_indx = compaction(expt_scal, expt_indx, bitmatrix) + n_expts_tot = n_expts_tot // simulated_ep + bitmatrix.shape[-1] = n_expts_tot + return expt_scal, expt_indx, bitmatrix + + +def prune_routing(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep): + return PruneRouting.apply(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep) + + def routing_triton(x, logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None, EP=1, TP=1): _, n_expts_tot = logits.shape @@ -354,7 +392,7 @@ def routing(x, logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None, else: raise ValueError(f"Unknown backend: {backend}") else: - return x, *triton_kernels.routing.routing(logits, n_expts_act, sm_first, expt_indx, EP, n_rows), None + return x, *triton_kernels.routing.routing(logits, n_expts_act, sm_first, expt_indx, n_rows), None # The following dummy methods simulate the behavior of distributed operations diff --git a/python/triton_kernels/tests/conftest.py b/python/triton_kernels/tests/conftest.py index 177bd34d0a..b9ca578ec6 100644 --- a/python/triton_kernels/tests/conftest.py +++ b/python/triton_kernels/tests/conftest.py @@ -1,5 +1,6 @@ import pytest import tempfile +import os def pytest_addoption(parser): @@ -29,3 +30,12 @@ def fresh_triton_cache(): with knobs.cache.scope(), knobs.runtime.scope(): knobs.cache.dir = tmpdir yield tmpdir + + +def pytest_configure(config): + worker_id = os.environ.get("PYTEST_XDIST_WORKER") + if worker_id is not None and worker_id.startswith("gw"): + import torch + gpu_id = int(worker_id[2:]) # map gw0 → 0, gw1 → 1, ... 
+ if torch.cuda.is_available(): + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id % torch.cuda.device_count()) diff --git a/python/triton_kernels/tests/test_matmul.py b/python/triton_kernels/tests/test_matmul.py index 9e8b19ec72..bc8cefdbf4 100644 --- a/python/triton_kernels/tests/test_matmul.py +++ b/python/triton_kernels/tests/test_matmul.py @@ -45,19 +45,16 @@ def mask_indx(idx, n_expts_act): return idx -def init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter, device="cuda"): +def init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter, device="cuda"): logits = torch.randn((m, n_expts_tot), dtype=torch.float16, device=device, requires_grad=True) - routing_data, gather_idx, scatter_idx = routing(logits, n_expts_act, simulated_ep=n_expt_shards) + routing_data, gather_idx, scatter_idx = routing(logits, n_expts_act) routing_data.gate_scal = None gather_idx = gather_idx if do_gather else None scatter_idx = scatter_idx if do_scatter else None - # TODO: re-enable - # if do_gather and do_scatter and n_expts_act == 1 and n_expt_shards == 1: - # scatter_idx = mask_indx(scatter_idx, n_expts_act) return m, routing_data, gather_idx, scatter_idx -def init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode, act_dtype, weight_dtype, +def init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, mode, act_dtype, weight_dtype, has_y_gammas, requires_grad=True, device="cuda", inner_expt_opt=None, padding_block_k=None): torch.manual_seed(0) @@ -70,7 +67,7 @@ def init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, n_ else: in_m = m * (n_expts_act if gindx is None else 1) shape_x = (n_expts_tot, in_m, k) if mode == 'batched' else (in_m, k) - shape_batch = tuple() if (mode == "plain" or inner_expt_opt is not None) else (n_expts_tot // n_expt_shards, ) + shape_batch = tuple() if (mode == "plain" or inner_expt_opt is not None) else (n_expts_tot, ) x = alloc_rand(shape_x, device=device, dtype=act_dtype, requires_grad=requires_grad) w = alloc_rand(shape_batch + (k, n), device=device, dtype=weight_dtype, requires_grad=requires_grad) bias = alloc_rand(shape_batch + (n, ), device=device, dtype=torch.float32, requires_grad=requires_grad) @@ -194,7 +191,6 @@ class Case: weight_dtype_str: str n_expts_tot: int = 1 n_expts_act: int = 1 - n_expt_shards: int = 1 split_k: int = 1 hbm_swizzling: bool = False epilogue_subtile: Union[int, None] = None @@ -216,10 +212,6 @@ class Case: Case(5, 7, 0, "batched", "float16", "float16"), # Non-mx types: Case(16, 256, 256, "ragged", "float16", "float16", 128, 4), - Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, n_expt_shards=2), - Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, n_expt_shards=4), - Case(400, 300, 500, "ragged", "float16", "float16", 32, 4, n_expt_shards=4), - Case(16, 256, 256, "ragged", "float16", "float16", 4, 1, n_expt_shards=2), Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, split_k=3), Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, split_k=3), Case(300, 400, 400, "batched", "float8_e5m2", "float8_e5m2", 5, 1), @@ -235,8 +227,6 @@ class Case: Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=2), Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=4), Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2), - Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, n_expt_shards=2), - Case(600, 400, 400, "ragged", 
"float8_e5m2", "float8_e5m2", 4, 1, n_expt_shards=2), Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, split_k=2), Case(1000, 400, 400, "ragged", "float16", "float16", 3, 1), Case(1000, 700, 700, "ragged", "float16", "float16", 8, 2), @@ -291,19 +281,17 @@ class Case: Case(300, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz"), Case(1000, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 3, 1), Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2), - Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2, n_expt_shards=2), Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2, split_k=2), Case(300, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn"), Case(1000, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 3, 1), Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2), - Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2, n_expt_shards=2), ] + [ - Case(320, 400, 400, mode, dtype, dtype, n_expts_tot, n_expts_act, n_expt_shards=n_expt_shards, + Case(320, 400, 400, mode, dtype, dtype, n_expts_tot, n_expts_act, x_transpose=x_transpose, w_transpose=w_transpose, y_transpose=y_transpose) - for (mode, n_expts_tot, n_expts_act, n_expt_shards) in ( - ("batched", 1, 1, 1), - ("ragged", 8, 4, 1), - ("ragged", 32, 4, 4), + for (mode, n_expts_tot, n_expts_act) in ( + ("batched", 1, 1), + ("ragged", 8, 4), + ("ragged", 32, 4), ) for dtype in ("float16", "float8_e5m2") for x_transpose in (False, True) @@ -326,7 +314,7 @@ class Case: @pytest.mark.parametrize("has_y_gammas", [False, True]) @pytest.mark.parametrize("is_persistent", [False, True]) def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot, - n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile, + n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile, x_transpose, w_transpose, y_transpose, device, opt_flags_scope, fresh_knobs): # TODO: remove when Triton FP8 supports proper RTNE @@ -425,17 +413,17 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o weight_dtype = dtype_str_to_torch(weight_dtype_str) act_dtype = dtype_str_to_torch(act_dtype_str) precision_opt = init_precision(act_dtype, act_is_float8, weight_dtype, weight_mxfp, - n_expts_tot // n_expt_shards, expt_is_inner, device=device) + n_expts_tot, expt_is_inner, device=device) # precision_opt.x_pad_trans_requires_flexpoint = False if mode == "ragged": - m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter, + m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter, device=device) else: rdata = gindx = sindx = None padding_block_k = 32 x_tri, w_tri, bias_tri, gs0_tri, gs1_tri = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, - n_expt_shards, mode, torch.bfloat16 if act_mxfp8 else act_dtype, # + mode, torch.bfloat16 if act_mxfp8 else act_dtype, # torch.bfloat16 if weight_mxfp else weight_dtype, has_y_gammas, requires_grad=test_bwd, device=device, inner_expt_opt=inner_expt_opt, padding_block_k=padding_block_k) @@ -447,9 +435,9 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o w_tri = w_tri.detach().transpose(-1, -2).contiguous().transpose(-1, -2).requires_grad_(test_bwd) if y_transpose: if mode == "batched": - yT_shape = 
(n_expts_tot // n_expt_shards, n, x_tri.shape[-2]) + yT_shape = (n_expts_tot, n, x_tri.shape[-2]) elif expt_is_inner: - yT_shape = (n_expts_tot // n_expt_shards, n, k) + yT_shape = (n_expts_tot, n, k) elif sindx is not None: yT_shape = (n, m) else: @@ -550,20 +538,6 @@ def scale(val, scal): assert val.ndim == 3 return val / scal[:, None, None] - if n_expt_shards > 1: - if do_scatter: - indx = sindx.dst_indx[sindx.dst_indx != -1] - ref_y = ref_y[indx // n_expts_act, :] - if act_is_float8: - tri_y = tri_y.view(torch.int8) - tri_y = tri_y[indx // n_expts_act, :] - if act_is_float8: - tri_y = tri_y.view(act_dtype) - elif not expt_is_inner: - n_rows = rdata.expt_hist.sum() - assert n_rows > 0 - ref_y = ref_y[:n_rows] - tri_y = tri_y[:n_rows] if act_mxfp8: tri_y = upcast_from_mxfp(tri_y, precision_opt.out_scale, target_dtype=torch.bfloat16, axis=-1).to(ref_y.dtype) ref_y_quant, ref_y_scale = downcast_to_mxfp_torch(ref_y, act_dtype, axis=-1) @@ -684,18 +658,18 @@ def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter, "split_k": split_k, "fused_scatter": fused_scatter, } - n_expts_tot, n_expts_act, n_expt_shards = 1, 1, 1 + n_expts_tot, n_expts_act = 1, 1 opt_flags.update_opt_flags_constraints(constraints) weight_dtype, act_dtype = torch.float16, torch.float16 if mode == "ragged": - m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter, + m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter, device=device) else: rdata = gindx = sindx = None - precision_opt = init_precision(act_dtype, str(act_dtype).startswith("torch.float8"), weight_dtype, False, n_expts_tot // n_expt_shards, device=device) - x, w, bias, _, _ = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode, + precision_opt = init_precision(act_dtype, str(act_dtype).startswith("torch.float8"), weight_dtype, False, n_expts_tot, device=device) + x, w, bias, _, _ = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, mode, act_dtype, weight_dtype, False, requires_grad=False, device=device) if mode == "batched": diff --git a/python/triton_kernels/tests/test_routing.py b/python/triton_kernels/tests/test_routing.py index 82f625a5cb..a59ab7b4d5 100644 --- a/python/triton_kernels/tests/test_routing.py +++ b/python/triton_kernels/tests/test_routing.py @@ -55,11 +55,8 @@ def _assert_indx_equal(ref, tri): tri_expt_data = tri_routing_data.expt_data assert_equal(ref_expt_data.hist, tri_expt_data.hist) assert_equal(ref_expt_data.token_offs_raw, tri_expt_data.token_offs_raw) - assert len(ref_expt_data.token_offs_pad) == len(tri_expt_data.token_offs_pad) - assert len(ref_expt_data.block_pid_map) == len(tri_expt_data.block_pid_map) - for block_m in ref_expt_data.token_offs_pad.keys(): - assert_equal(ref_expt_data.token_offs_pad[block_m], tri_expt_data.token_offs_pad[block_m]) - assert_equal(ref_expt_data.block_pid_map[block_m], tri_expt_data.block_pid_map[block_m]) + assert_equal(ref_expt_data.token_offs_pad_data, tri_expt_data.token_offs_pad_data) + assert_equal(ref_expt_data.block_pid_map_data, tri_expt_data.block_pid_map_data) assert ref_routing_data.n_expts_tot == ref_routing_data.n_expts_tot assert ref_routing_data.n_expts_act == ref_routing_data.n_expts_act diff --git a/python/triton_kernels/triton_kernels/matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs.py index 4b50ecb2b0..a58b7811fb 100644 --- a/python/triton_kernels/triton_kernels/matmul_ogs.py +++ 
b/python/triton_kernels/triton_kernels/matmul_ogs.py @@ -169,8 +169,8 @@ def make_kernel_args(data, block_m): return ( expt_data.hist, expt_data.token_offs_raw, - expt_data.token_offs_pad[block], - expt_data.block_pid_map[block], + expt_data.token_offs_pad(block), + expt_data.block_pid_map(block), ) + args @@ -700,7 +700,7 @@ def matmul_ogs_torch(x, w, bias, if k > 0: out[expt] = matmul_ogs_torch( x[:, start_x:start_x+k], w[start_w:start_w+k, :], None, - None, None, None, None, betas, gammas, None, round_x, round_y + None, None, None, None, betas, gammas, None, round_x, round_y, device ) padded_k = triton.cdiv(k, block_k) * block_k start_x += padded_k if inner_routing_data.x_is_padded else k diff --git a/python/triton_kernels/triton_kernels/routing.py b/python/triton_kernels/triton_kernels/routing.py index 0758406df9..e04a487dc1 100644 --- a/python/triton_kernels/triton_kernels/routing.py +++ b/python/triton_kernels/triton_kernels/routing.py @@ -3,10 +3,10 @@ from dataclasses import dataclass, field from .routing_details._routing_compute import _combined_routing_compute from .routing_details._routing_compute import _combined_routing_memset -from .routing_details._routing_compute import _routing_clear_bitmatrix from .routing_details._expt_data import _expt_data_memset from .routing_details._expt_data import _expt_data_compute from .target_info import is_hip +from .topk import topk, topk_torch @dataclass @@ -38,31 +38,43 @@ class ExptData: # token_offs_raw[i] is the offset of the first token routed # to expert i in an expert-sorted array token_offs_raw: torch.Tensor - # token_offs_pad[block][i] is the offset of the first token routed + # token_offs_pad_data[i, :] is the offset of the first token routed + # to expert i in an expert-sorted array, assuming the histogram is - # rounded to the next multiple of `block` - token_offs_pad: dict[int, torch.Tensor] - # block_id_map[block] contain one value for each `pid`` launched by - # the matrix multiplication kernel launched with BLOCK_M=block: + # rounded up to the next multiple of `block = ExptData.block_ms()[i]` (i.e. 2**(4 + i)) + token_offs_pad_data: torch.Tensor + # block_pid_map_data[i] contains one value for each `pid` launched by + # the matrix multiplication kernel launched with BLOCK_M=ExptData.block_ms()[i]: # - the value is -1 if the `pid` has no work to do # - otherwise, the value is two int16 (packed as an int32) that # correspond respectively to (1) the expert assigned to # the tokens processed by this pid; (2) the block assigned to the # tokens processed by this pid (think `pid_m` in a regular matmul) # see `test_routing.py` for a reference implementation and more details - block_pid_map: dict[int, torch.Tensor] + block_pid_map_data: torch.Tensor def __post_init__(self): + assert self.token_offs_pad_data.shape[0] == len(ExptData.block_ms()) + assert self.block_pid_map_data.shape[0] == len(ExptData.block_ms()) + assert self.token_offs_pad_data.dtype == torch.int32 + assert self.block_pid_map_data.dtype == torch.int32 if self.hist is not None: assert self.hist.dtype == torch.int32 if self.token_offs_raw is not None: assert self.token_offs_raw.dtype == torch.int32 - if self.token_offs_pad is not None: - for v in self.token_offs_pad.values(): - assert v.dtype == torch.int32 - if self.block_pid_map is not None: - for v in self.block_pid_map.values(): - assert v.dtype == torch.int32 + + def token_offs_pad(self, block_m): + return self.token_offs_pad_data[ExptData.block_ms().index(block_m)] + + def block_pid_map(self, block_m): + return self.block_pid_map_data[ExptData.block_ms().index(block_m)] + +
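+    # For example, on non-HIP targets ExptData.block_ms() == [16, 32, 64, 128]
+    # (HIP additionally includes 256), so token_offs_pad(32) selects row 1 of
+    # token_offs_pad_data and block_pid_map(128) selects row 3 of block_pid_map_data.
+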
@staticmethod + def block_ms_log2(): + return range(4, 9) if is_hip() else range(4, 8) + + @staticmethod + def block_ms(): + return [2**x for x in ExptData.block_ms_log2()] @dataclass @@ -97,6 +109,7 @@ def forward(ctx, expt_scal, expt_indx, n_expts_tot, bitmatrix): HIST_BLOCK_M = 32 INDX_OFFS_BLOCK_M = 512 MEMSET_BLOCK = 1024 + MEMSET_BLOCK_A = 512 cdiv = triton.cdiv device = expt_scal.device @@ -104,22 +117,30 @@ def forward(ctx, expt_scal, expt_indx, n_expts_tot, bitmatrix): n_tokens_raw, _ = bitmatrix.shape n_tokens_pad, n_expts_act = expt_scal.shape n_gates_pad = n_tokens_pad * n_expts_act + block_ms_log2 = ExptData.block_ms_log2() + block_m_num = len(block_ms_log2) hist, partial_hist = bitmatrix.sum(partials_block_size=HIST_BLOCK_M) hist = hist[:n_expts_tot] assert hist.dtype == torch.int32 - # scratchpad + + # allocate memory expt_offs = torch.empty(n_expts_tot, dtype=torch.int32, device=device) combined_indx = torch.empty(n_gates_pad * 2, dtype=torch.int32, device=device) - # output - topk_indx = combined_indx[:n_gates_pad] - gate_indx = combined_indx[n_gates_pad:] gate_scal = torch.empty(n_gates_pad, dtype=dtype, device=device) - - token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1a, blocks2a, MEMSET_BLOCK_A, HIST2_BLOCK_M, block_m_log2_start, block_m_num = _compute_expt_data_internal( - hist, n_expts_tot, n_gates_pad) - + token_offs_combined = empty_aligned((block_m_num + 1, n_expts_tot + 1), torch.int32, device, MEMSET_BLOCK_A) + block_pid_map = empty_aligned((block_m_num, max_n_tiles(n_expts_tot, n_gates_pad)), torch.int32, device, + MEMSET_BLOCK_A) + # slice padded allocations + combine_indx = combined_indx[:n_gates_pad] + dispatch_indx = combined_indx[n_gates_pad:] + token_offs_raw, token_offs_pad = token_offs_combined[0], token_offs_combined[1:] + + # grid sizes + block_pid_map_n_elts = block_pid_map.untyped_storage().size() // block_pid_map.dtype.itemsize + blocks1a = exact_div(block_pid_map_n_elts, MEMSET_BLOCK_A) + token_offs_combined.shape[0] blocks1b = cdiv(n_gates_pad * 2, MEMSET_BLOCK) + n_expts_tot + 1 + blocks2a = n_expts_tot * token_offs_pad.shape[0] blocks2b = cdiv(n_tokens_pad, HIST_BLOCK_M) _combined_routing_memset[(blocks1a + blocks1b, )]( @@ -128,31 +149,32 @@ def forward(ctx, expt_scal, expt_indx, n_expts_tot, bitmatrix): partial_hist.shape[0], partial_hist.stride(0), partial_hist.stride(1), # outputs token_offs_combined, token_offs_combined.stride(0), # blocks1a, block_pid_map, # - block_m_log2_start, SIZES=block_m_num, BLOCK_A=MEMSET_BLOCK_A, # optimization parameters + block_ms_log2[0], SIZES=len(block_ms_log2), BLOCK_A=MEMSET_BLOCK_A, # optimization parameters BLOCK_N=512, BLOCK_M=INDX_OFFS_BLOCK_M, # tunable parameters ) indx_offs = partial_hist _combined_routing_compute[(blocks2a + blocks2b, )]( - topk_indx, gate_indx, gate_scal, # outputs + combine_indx, dispatch_indx, gate_scal, # outputs expt_scal, expt_indx, indx_offs, indx_offs.stride(0), indx_offs.stride(1), # inputs expt_offs, n_tokens_raw, # input shape HIST_BLOCK_M, n_expts_act, # constants hist, token_offs_pad, token_offs_pad.stride(0), block_pid_map, block_pid_map.stride(0), # outputs - block_m_log2_start, block_m_num, HIST2_BLOCK_M, blocks2a, # etc. + block_ms_log2[0], len(block_ms_log2), 512, blocks2a, # etc. 
) ctx.n_tokens_raw = n_tokens_raw ctx.n_tokens_pad = n_tokens_pad ctx.n_expts_act = n_expts_act - ctx.save_for_backward(gate_indx) - return hist, topk_indx, gate_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map + ctx.save_for_backward(dispatch_indx) + + return hist, combine_indx, dispatch_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map @staticmethod def backward(ctx, _0, _1, _2, dgate_scal, _3, _4, _5): - (gate_indx, ) = ctx.saved_tensors - dgate_scal = dgate_scal[gate_indx] + (dispatch_indx, ) = ctx.saved_tensors + dgate_scal = dgate_scal[dispatch_indx] dgate_scal = dgate_scal.reshape(ctx.n_tokens_pad, ctx.n_expts_act) return dgate_scal, None, None, None @@ -161,89 +183,28 @@ def sort_tokens(expt_scal, expt_indx, n_expts_tot, bitmatrix): return SortTokens.apply(expt_scal, expt_indx, n_expts_tot, bitmatrix) -# -------------------------- -# prune routing -# -------------------------- - - -class PruneRouting(torch.autograd.Function): - - @staticmethod - def forward(ctx, expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep): - from .compaction import compaction - n_tokens_pad = expt_scal.shape[0] - assert n_expts_tot % simulated_ep == 0 - _routing_clear_bitmatrix[(n_tokens_pad, )]( - bitmatrix.storage.data, - bitmatrix.storage.data.stride(0), - bitmatrix.storage.data.stride(1), - bitmatrix.storage.data.shape[1], - n_expts_tot // simulated_ep, - BLOCK_N=512, - ) - # perform compaction to update expt_scal / expt_indx - expt_scal, expt_indx = compaction(expt_scal, expt_indx, bitmatrix) - n_expts_tot = n_expts_tot // simulated_ep - bitmatrix.shape[-1] = n_expts_tot - return expt_scal, expt_indx, bitmatrix - - -def prune_routing(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep): - return PruneRouting.apply(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep) - - # -------------------------- # expt_data # -------------------------- -def log2_power_of_two(x): - assert x > 0 and (x & (x - 1)) == 0, "x must be a power of two" - return x.bit_length() - 1 +def exact_div(x, y): + assert x % y == 0 + return x // y -block_m_log2_start = 4 +def empty_aligned(shape, dtype, device, pad_size): + cdiv = lambda x, y: (x + y - 1) // y + pad = lambda x: cdiv(x, pad_size) * pad_size + ret = torch.empty((*shape[:-1], pad(shape[-1])), dtype=dtype, device=device) + ret_slices = (*[slice(None)] * (len(shape) - 1), slice(0, shape[-1])) + return ret[ret_slices] -def _compute_expt_data_internal(expt_hist, n_expts_tot, n_gates): - - MEMSET_BLOCK = 512 - HIST2_BLOCK_M = 512 - device = expt_hist.device - n_expts_tot = n_expts_tot - cdiv = triton.cdiv - # block_ms are all powers-of-two between 16 and 128 (inclusive) - block_m_log2_end = 9 if is_hip() else 8 - block_m_num = block_m_log2_end - block_m_log2_start +def max_n_tiles(n_expts_tot, n_gates): if n_gates <= n_expts_tot: - max_n_tiles = n_gates - else: - max_n_tiles = n_expts_tot - 1 - ((n_expts_tot - n_gates - 1) // 2**block_m_log2_start) - # allocate memory - pad = lambda x: cdiv(x, MEMSET_BLOCK) * MEMSET_BLOCK - dtype = torch.int32 - - token_offs_combined = torch.empty((block_m_num + 1, pad(n_expts_tot + 1)), dtype=dtype, device=device) - - token_offs_raw = token_offs_combined[0][:n_expts_tot + 1] - token_offs_pad = token_offs_combined[1:] - - block_pid_map = torch.empty((block_m_num, pad(max_n_tiles)), dtype=dtype, device=device) - memset_grid = torch.numel(block_pid_map) // MEMSET_BLOCK # exact division - # compute outputs - token_offs_pad = token_offs_pad[:, :n_expts_tot + 1] - block_pid_map = block_pid_map[:, 
:max_n_tiles] - - blocks1 = memset_grid + block_m_num + 1 - blocks2 = n_expts_tot * block_m_num - return token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1, blocks2, MEMSET_BLOCK, HIST2_BLOCK_M, block_m_log2_start, block_m_num - - -def _unpack_into_dict(x): - - block_m_log2_end = block_m_log2_start + x.shape[0] - x = {2**j: x[i, :] for i, j in enumerate(range(block_m_log2_start, block_m_log2_end))} - return x + return n_gates + return n_expts_tot - 1 - ((n_expts_tot - n_gates - 1) // ExptData.block_ms()[0]) def compute_expt_data(expt_hist, n_expts_tot, n_gates): @@ -251,23 +212,28 @@ def compute_expt_data(expt_hist, n_expts_tot, n_gates): if expt_hist is None: return ExptData(None, None, None, None) - # this just computes the kernel arguments: - token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1, blocks2, MEMSET_BLOCK, HIST2_BLOCK_M, block_m_log2_start, block_m_num = _compute_expt_data_internal( - expt_hist, n_expts_tot, n_gates) + block_ms_log2 = ExptData.block_ms_log2() + block_m_num = len(block_ms_log2) + MEMSET_BLOCK = 512 + dtype = torch.int32 + device = expt_hist.device + token_offs_combined = empty_aligned((block_m_num + 1, n_expts_tot + 1), dtype, device, MEMSET_BLOCK) + block_pid_map = empty_aligned((block_m_num, max_n_tiles(n_expts_tot, n_gates)), dtype, device, MEMSET_BLOCK) + token_offs_raw, token_offs_pad = token_offs_combined[0], token_offs_combined[1:] + n_memset_blocks = exact_div(block_pid_map.storage().size(), MEMSET_BLOCK) - _expt_data_memset[(blocks1, )]( + _expt_data_memset[(token_offs_combined.shape[0] + n_memset_blocks, )]( expt_hist, n_expts_tot, # token_offs_combined, token_offs_combined.stride(0), # block_pid_map, # - block_m_log2_start, SIZES=block_m_num, BLOCK=MEMSET_BLOCK, # optimization parameters + block_ms_log2[0], SIZES=len(block_ms_log2), BLOCK=MEMSET_BLOCK, # optimization parameters num_warps=4) - _expt_data_compute[(blocks2, )]( + + _expt_data_compute[(block_m_num * n_expts_tot, )]( expt_hist, token_offs_pad, token_offs_pad.stride(0), block_pid_map, block_pid_map.stride(0), # outputs - block_m_log2_start, SIZES=block_m_num, BLOCK=HIST2_BLOCK_M, # optimization parameters + block_ms_log2[0], SIZES=len(block_ms_log2), BLOCK=512, # optimization parameters num_warps=4) - token_offs_pad = _unpack_into_dict(token_offs_pad) - block_pid_map = _unpack_into_dict(block_pid_map) return ExptData(expt_hist, token_offs_raw, token_offs_pad, block_pid_map) @@ -277,30 +243,20 @@ def compute_expt_data(expt_hist, n_expts_tot, n_gates): def routing_from_bitmatrix(bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act): - hist, topk_indx, gate_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map = sort_tokens( + hist, combine_indx, dispatch_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map = sort_tokens( expt_scal, expt_indx, n_expts_tot, bitmatrix) - token_offs_pad = _unpack_into_dict(token_offs_pad) - block_pid_map = _unpack_into_dict(block_pid_map) expt_data = ExptData(hist, token_offs_raw, token_offs_pad, block_pid_map) - - # pack the matmul data structure - gather_indx = GatherIndx(src_indx=topk_indx, dst_indx=gate_indx) - scatter_indx = ScatterIndx(src_indx=gate_indx, dst_indx=topk_indx) + gather_indx = GatherIndx(src_indx=combine_indx, dst_indx=dispatch_indx) + scatter_indx = ScatterIndx(src_indx=dispatch_indx, dst_indx=combine_indx) return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act, expt_data), gather_indx, scatter_indx -def routing(logits, n_expts_act, sm_first=False, 
expt_indx=None, simulated_ep=1, n_rows=None): - from .topk import topk +def routing(logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None): if sm_first: logits = torch.softmax(logits, dim=-1) - expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, # - apply_softmax=not sm_first, y_indx=expt_indx, n_rows=n_rows) - n_expts_tot = logits.shape[-1] // simulated_ep - # mutate bitmatrix - if simulated_ep > 1: - expt_scal, expt_indx, bitmatrix = prune_routing(expt_scal, expt_indx, bitmatrix, logits.shape[-1], simulated_ep) - - return routing_from_bitmatrix(bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act) + expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, apply_softmax=not sm_first, y_indx=expt_indx, + n_rows=n_rows) + return routing_from_bitmatrix(bitmatrix, expt_scal, expt_indx, logits.shape[-1], n_expts_act) # -------------------------- @@ -315,51 +271,32 @@ def compute_expt_data_torch(hist, n_expts_tot, n_gates): token_offs_raw = torch.cat((torch.zeros(1, device=device), token_offs_raw)) token_offs_raw = token_offs_raw.int() # maximum number of tiles for all values of `block_m` considered - block_ms = [16, 32, 64, 128] - if is_hip(): - block_ms.append(256) if n_gates <= n_expts_tot: max_n_tiles = n_gates else: # ceil_div(n_gates - n_experts + 1, d_tile) + n_experts - 1 # ceil_div(x, y): -(-x // y) - max_n_tiles = n_expts_tot - 1 - ((n_expts_tot - n_gates - 1) // min(block_ms)) + max_n_tiles = n_expts_tot - 1 - ((n_expts_tot - n_gates - 1) // min(ExptData.block_ms())) # fill up tile offset/infos for each block token_offs_pad = dict() block_pid_map = dict() - for block_m in block_ms: + for block_m in ExptData.block_ms(): n_tiles = (hist + block_m - 1) // block_m # matmul blocks needed token_offs_pad[block_m] = torch.cumsum(n_tiles, dim=0) token_offs_pad[block_m] = torch.cat((torch.zeros(1, device=device), token_offs_pad[block_m])) token_offs_pad[block_m] = token_offs_pad[block_m].int() # compute data required to drive ragged batch matmul block_pid_map[block_m] = -torch.ones(max_n_tiles, dtype=torch.int32, device=device) - - # for e in range(n_expts_tot): - # offset = token_offs_pad[block_m][e] - # for b in range(n_tiles[e]): - # block_pid_map[block_m][offset + b] = (b << 16) + e - col = torch.arange(max_n_tiles, device=device) map_vals = torch.arange(n_expts_tot, device=device)[:, None] + (col << 16)[None, :] map_idxs = token_offs_pad[block_m][:-1, None] + col[None, :] mask = col[None, :] < n_tiles[:, None] block_pid_map[block_m].index_put_((map_idxs[mask], ), map_vals.int()[mask]) + token_offs_pad = torch.stack(list(token_offs_pad.values())) + block_pid_map = torch.stack(list(block_pid_map.values())) return ExptData(hist, token_offs_raw, token_offs_pad, block_pid_map) -def topk_torch(vals, k, expt_indx, has_user_provided_indx=False): - # topk of experts - if has_user_provided_indx: - tk_indx = expt_indx - else: - tk_indx = torch.argsort(-vals, dim=1, stable=True)[:, :k] - tk_indx = tk_indx.long() - tk_val = torch.take_along_dim(vals, tk_indx, dim=1) - tk_indx = tk_indx.int() - return tk_val, tk_indx - - def routing_torch(logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None): has_user_provided_indx = expt_indx is not None n_gates_pad = logits.shape[0] * n_expts_act @@ -369,7 +306,7 @@ def routing_torch(logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=No _, n_expts_tot = logits.shape if sm_first: logits = torch.softmax(logits, dim=-1) - expt_scal, expt_indx = topk_torch(logits, n_expts_act, expt_indx, 
has_user_provided_indx=has_user_provided_indx) + expt_scal, expt_indx = topk_torch(logits, n_expts_act, expt_indx) if not sm_first: expt_scal = torch.softmax(expt_scal, dim=-1) # sort each token's selections by expert @@ -380,13 +317,13 @@ def routing_torch(logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=No expt_scal = expt_scal.reshape(-1) expt_indx = expt_indx.reshape(-1).to(torch.int32) # sort by expert_id so experts are contiguous for the matmul - topk_indx = torch.argsort(expt_indx, stable=True) - gate_indx = torch.argsort(topk_indx, stable=True) - gate_scal = expt_scal[topk_indx] + combine_indx = torch.argsort(expt_indx, stable=True) + dispatch_indx = torch.argsort(combine_indx, stable=True) + gate_scal = expt_scal[combine_indx] hist = torch.histc(expt_indx, bins=n_expts_tot, max=n_expts_tot - 1).int() # histogram of tokens over experts # pack the matmul data structure - gather_indx = GatherIndx(src_indx=topk_indx.int(), dst_indx=gate_indx.int()) - scatter_indx = ScatterIndx(src_indx=gate_indx.int(), dst_indx=topk_indx.int()) + gather_indx = GatherIndx(src_indx=combine_indx.int(), dst_indx=dispatch_indx.int()) + scatter_indx = ScatterIndx(src_indx=dispatch_indx.int(), dst_indx=combine_indx.int()) # compute expt_data expt_data = compute_expt_data_torch(hist, n_expts_tot, n_gates_pad) return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act, expt_data), gather_indx, scatter_indx diff --git a/python/triton_kernels/triton_kernels/routing_details/_routing_compute.py b/python/triton_kernels/triton_kernels/routing_details/_routing_compute.py index a72900030f..b1afa9c9c9 100644 --- a/python/triton_kernels/triton_kernels/routing_details/_routing_compute.py +++ b/python/triton_kernels/triton_kernels/routing_details/_routing_compute.py @@ -97,20 +97,6 @@ def _combined_routing_compute(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptI stride_pn, TokensStart, n_tokens, BLOCK_M, N_EXPTS_ACT) -@triton.jit -def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff, BLOCK_N: tl.constexpr): - pid_m = tl.program_id(0) - cutoff_word = cutoff // 32 - cutoff_bit = cutoff % 32 - cutoff_mask = (1 << (cutoff_bit)) - 1 - for start_n in range(0, shape_bn, BLOCK_N): - offs_n = start_n + tl.arange(0, BLOCK_N) - values = tl.load(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, mask=offs_n < shape_bn) - values = tl.where(offs_n == cutoff_word, values & cutoff_mask, values) - values = tl.where(offs_n > cutoff_word, 0, values) - tl.store(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, values, mask=offs_n < shape_bn) - - @triton.jit def _combined_routing_memset(Indx, size, sentinel, BLOCK: tl.constexpr, ExpertHist, FinalExpertOffs, hist_size, n_expts_tot, PartialHist, shape_pm, stride_pm, stride_pn, MDStarts, tile_starts_stridem, diff --git a/python/triton_kernels/triton_kernels/topk.py b/python/triton_kernels/triton_kernels/topk.py index 95dc2c20b9..2cd68b4a29 100644 --- a/python/triton_kernels/triton_kernels/topk.py +++ b/python/triton_kernels/triton_kernels/topk.py @@ -122,3 +122,14 @@ def topk( """ ret = TopK.apply(x, k, apply_softmax, dim, return_bitmatrix, y_indx, n_rows) return ret + + +def topk_torch(x, k, y_indx): + if y_indx is not None: + tk_indx = y_indx + else: + tk_indx = torch.argsort(-x, dim=1, stable=True)[:, :k] + tk_indx = tk_indx.long() + tk_val = torch.take_along_dim(x, tk_indx, dim=1) + tk_indx = tk_indx.int() + return tk_val, tk_indx diff --git a/scripts/skiplist/default/triton_kernels.txt b/scripts/skiplist/default/triton_kernels.txt index 
5441879aff..f3a4d1078e 100644 --- a/scripts/skiplist/default/triton_kernels.txt +++ b/scripts/skiplist/default/triton_kernels.txt @@ -1,561 +1,73 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/5074 -tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] 
+tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] 
+tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] 
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
diff --git a/scripts/skiplist/lts/triton_kernels.txt b/scripts/skiplist/lts/triton_kernels.txt
index 5441879aff..f3a4d1078e 100644
--- a/scripts/skiplist/lts/triton_kernels.txt
+++ b/scripts/skiplist/lts/triton_kernels.txt
@@ -1,561 +1,73 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5074
-tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False]
+tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False]
diff --git a/scripts/skiplist/xe2/triton_kernels.txt b/scripts/skiplist/xe2/triton_kernels.txt
index fdf74e87f5..fbfa847500 100644
--- a/scripts/skiplist/xe2/triton_kernels.txt
+++ b/scripts/skiplist/xe2/triton_kernels.txt
@@ -1,564 +1,76 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5074
-tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False]
-tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False]
-tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_w-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-128-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-400-400-ragged-float8_e5m2-float8_e5m2-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-1000-700-700-ragged-float16-float16-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False0] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-1-3-False-None-False-False-False1] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-2-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-128-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-16-256-256-ragged-float16-float16-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-256-256-256-ragged-float16-float16-4-1-1-3-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float8_e4m3fn-float8_e4m3fn-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-300-400-400-ragged-float8_e5m2-float8_e5m2-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float16-float16-8-4-1-1-False-None-True-True-True] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-32-4-4-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-False-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-False-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-320-400-400-ragged-float8_e5m2-float8_e5m2-8-4-1-1-False-None-True-True-True] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-400-300-500-ragged-float16-float16-32-4-4-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-5-0-7-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-5-7-0-ragged-float16-float16-1-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e4m3fn-float8_e4m3fn-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-1-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-1-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-2-False-False-False] 
-tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-4-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-1-2-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-False-False-pad_x-16-600-400-400-ragged-float8_e5m2-float8_e5m2-4-2-2-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] 
-tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-1-9-True-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-1-None-False-None-False-False-False] -tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] 
+tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-False-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] 
+tests/test_matmul.py::test_op[False-True-False-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-False-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] 
+tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-False-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-128-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-128-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-16-1000-704-800-ragged-mxfloat8_e4m3fn-mxfloat4_e2m1-8-2-9-True-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float16-float16-5-1-None-False-None-False-False-False] +tests/test_matmul.py::test_op[False-True-True-True-False-None-16-16-16-1000-batched-float8_e5m2-float8_e5m2-5-1-None-False-None-False-False-False] # https://github.com/intel/intel-xpu-backend-for-triton/issues/5117 tests/test_routing.py::test_op[r"[^-]*-[^-]*-1500-8-[^-]*-[^-]*"]@regexp tests/test_routing.py::test_op[r"True\-True-.*"]@regexp diff --git a/test/Analysis/test-membar-ttng.mlir b/test/Analysis/test-membar-ttng.mlir index 0b0d0bb60e..8ff1632eb3 100644 --- a/test/Analysis/test-membar-ttng.mlir +++ b/test/Analysis/test-membar-ttng.mlir @@ -10,7 +10,7 @@ tt.func @async_store_wait(%arg: tensor<32x16xf16, #AL>) { %alloc = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> // CHECK: async_tma_store_wait ttng.async_tma_store_wait {pendings = 0 : i32} - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_store ttg.local_store %arg, %alloc : tensor<32x16xf16, #AL> -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> tt.return @@ -36,7 +36,7 @@ tt.func @tma_special_cases(%arg1: !tt.tensordesc>) - ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttng.barrier_expect // CHECK-NEXT: ttng.async_tma_copy_global_to_local // CHECK-NEXT: ttng.wait_barrier @@ -46,7 +46,7 @@ tt.func @tma_special_cases(%arg1: !tt.tensordesc>) - // CHECK-NEXT: ttng.async_tma_copy_global_to_local // CHECK-NEXT: ttng.barrier_expect - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttng.wait_barrier ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.tensordesc>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, 
#ttg.shared_memory, mutable> ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> @@ -56,7 +56,7 @@ tt.func @tma_special_cases(%arg1: !tt.tensordesc>) - %t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked> // CHECK-NEXT: ttng.barrier_expect - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttng.async_tma_copy_global_to_local // CHECK-NEXT: ttng.wait_barrier ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> @@ -66,14 +66,14 @@ tt.func @tma_special_cases(%arg1: !tt.tensordesc>) - // CHECK-NEXT: memdesc_subslice // CHECK-NEXT: ttng.barrier_expect // CHECK-NEXT: ttng.async_tma_gather - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttng.wait_barrier %view = ttg.memdesc_subslice %alloc [0, 0] : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable> ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> ttng.async_tma_gather %arg1[%cx, %c0] %view, %barrier, %true : !tt.tensordesc>, tensor<32xi32>, i32, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>, !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>, i1 ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttng.inval_barrier // CHECK-NEXT: ttng.inval_barrier ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> @@ -99,7 +99,7 @@ tt.func @tma_special_cases_cf(%arg1: !tt.tensordesc> // CF: cf.cond_br // SCF: scf.if scf.if %i1 { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttng.async_tma_copy_global_to_local // CHECK-NEXT: ttng.barrier_expect // CHECK-NEXT: ttng.wait_barrier @@ -109,13 +109,13 @@ tt.func @tma_special_cases_cf(%arg1: !tt.tensordesc> ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> } else { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.local_store // CF-NEXT: cf.br // SCF-NEXT: } ttg.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked> tt.return %t : tensor<256x64xf16, #blocked> @@ -137,7 +137,7 @@ module attributes {"ttg.num-warps" = 4 : i32} { %0 = ttg.local_alloc %arg0 {allocation.offset = 53248 : i32} : (tensor<128x16xf8E4M3FN, #blocked>) -> !ttg.memdesc<128x16xf8E4M3FN, #shared, #smem> // CHECK: tmem_alloc %1 = ttng.tmem_alloc {tensor_memory_col_offset = 256 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable> - // gpu.barrier + // ttg.local_barrier // CHECK: tmem_copy ttng.tmem_copy %0, %1 : !ttg.memdesc<128x16xf8E4M3FN, #shared, #smem>, !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable> tt.return diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir index ba96cafbc2..c256969f64 100644 --- 
a/test/Analysis/test-membar.mlir +++ b/test/Analysis/test-membar.mlir @@ -48,7 +48,7 @@ tt.func @raw_single_block(%A : !tt.ptr) { %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %3 = ttg.local_load %2 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> tt.return @@ -62,10 +62,10 @@ tt.func @war_single_block(%A : !tt.ptr) { %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> // CHECK: ttg.local_alloc - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %3 = ttg.local_load %2 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: %4 = ttg.local_alloc %4 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> tt.return @@ -79,10 +79,10 @@ tt.func @war_single_block_local_store(%A : !tt.ptr) { %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> // CHECK: ttg.local_alloc - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %3 = ttg.local_load %2 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<128x32xf16, #AL> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_store ttg.local_store %1, %2 : tensor<128x32xf16, #AL> -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> tt.return @@ -91,9 +91,9 @@ tt.func @war_single_block_local_store(%A : !tt.ptr) { // CHECK-LABEL: scratch tt.func @scratch(%arg: tensor<16x16xf16, #AL>) { %cst0 = ttg.local_alloc %arg : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK: tt.reduce %1 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %2 = "tt.reduce" (%1) ({ @@ -109,7 +109,7 @@ tt.func @async_wait(%arg: tensor<32x16xf16, #AL>) { %cst0 = ttg.local_alloc %arg : (tensor<32x16xf16, #AL>) -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory> // CHECK: ttg.async_wait ttg.async_wait {num = 4 : i32} - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_load %1 = ttg.local_load %cst0 : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<32x16xf16, #AL> tt.return @@ -120,10 +120,10 @@ tt.func @subview() { %cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL> %a = ttg.local_alloc %cst0 : (tensor<32x16xf16, #AL>) -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory> %0 = ttg.memdesc_subslice %a [0, 0] : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %1 = ttg.local_load %0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: 
ttg.local_alloc %2 = ttg.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> tt.return @@ -131,7 +131,7 @@ tt.func @subview() { // CHECK-LABEL: trans tt.func @trans(%a: !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier %b = ttg.memdesc_trans %a {order=array} : !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory> -> !ttg.memdesc<32x16xf16, #A_SHARED_T, #ttg.shared_memory> tt.return } @@ -145,7 +145,7 @@ tt.func @async_copy_global_to_local(%A : !tt.ptr, %i1 : i1) { %alloc = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %subview = ttg.memdesc_index %alloc[%index] : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %1 = ttg.async_copy_global_to_local %a_ptr, %subview : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %4 = ttg.local_load %subview : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return @@ -156,7 +156,7 @@ tt.func @multi_blocks(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> scf.if %i1 { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> scf.yield @@ -164,7 +164,7 @@ tt.func @multi_blocks(%i1 : i1) { %cst1 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> scf.yield } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %2 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -176,17 +176,17 @@ tt.func @multi_blocks_join_barrier(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> scf.if %i1 { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } else { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %1 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: tt.return %a_ = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -198,13 +198,13 @@ tt.func @multi_blocks_yield(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> %a = scf.if %i1 -> (!ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %1 = ttg.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, 
#ttg.shared_memory> scf.yield %1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> } else { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %2 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %3 = ttg.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> @@ -212,7 +212,7 @@ tt.func @multi_blocks_yield(%i1 : i1) { } %a_ = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: ttg.local_load - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_load %4 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -224,23 +224,23 @@ tt.func @multi_blocks_entry_no_shared(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> %a = scf.if %i1 -> (!ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_load - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %cst1 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> %0 = ttg.local_load %cst1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %1 = ttg.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> scf.yield %1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> } else { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.local_alloc %cst1 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> scf.yield %cst1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %2 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -252,12 +252,12 @@ tt.func @multi_blocks_noelse(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> scf.if %i1 { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %1 = ttg.local_load %cst0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -270,19 +270,19 @@ tt.func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) { %cst0 = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> scf.if %i1 { scf.if %i2 { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> scf.yield } scf.yield } else { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %1 = ttg.local_load %cst0 : !ttg.memdesc<128x32xf16, 
#A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> scf.yield } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %2 = ttg.local_load %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> tt.return @@ -295,7 +295,7 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t %b_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %c_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %a0 = ttg.local_load %a_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b0 = ttg.local_load %b_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> @@ -311,20 +311,20 @@ tt.func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr, % %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %a_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %b_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %a0 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b0 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %0 = ttg.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %c_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %a1 = ttg.local_load %a_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b1 = ttg.local_load %b_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> scf.yield %c_shared, %a_shared, %b_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %r = ttg.local_load %0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> tt.return @@ -337,26 +337,26 @@ tt.func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr, % %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %a_shared_init = ttg.local_alloc %cst : 
(tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %b_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %a0 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b0 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %0 = ttg.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %c_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %a1 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b1 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %1 = ttg.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %a2 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b2 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %2 = ttg.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> scf.yield %c_shared, %a_shared, %b_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %r = ttg.local_load %0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> tt.return @@ -367,20 +367,20 @@ tt.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr< %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %a_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %b_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %a0 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b0 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %0 = ttg.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %c_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = 
%c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %a1 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b1 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %1 = ttg.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %a_shared_next, %b_shared_next, %c_shared_next = scf.for %ivv = %lb to %ub step %step iter_args(%a_shared_nested = %a_shared_init, %b_shared_nested = %b_shared_init, %c_shared_nested = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %a2 = ttg.local_load %a_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %b2 = ttg.local_load %b_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> @@ -389,7 +389,7 @@ tt.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr< } scf.yield %c_shared, %a_shared, %b_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %r = ttg.local_load %0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> tt.return @@ -405,12 +405,12 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { %c_shared_next_next = scf.if %i1 -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %cst0 = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> scf.yield %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } else { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %cst0 = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> scf.yield %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> @@ -429,25 +429,25 @@ tt.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %a_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %b_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %c_shared_init = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, 
#ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier %c_blocked = ttg.local_load %c_shared_init : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { %c_shared_next_next = scf.if %i1 -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %cst0 = ttg.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> scf.yield %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } else { %c_shared_ = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>) { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %c_blocked_next = ttg.local_load %c_shared_next : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> scf.yield %c_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } scf.yield %c_shared_ : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier %b_blocked_next = ttg.local_load %b_shared: !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> scf.yield %a_shared, %b_shared, %c_shared_next_next : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> } @@ -460,12 +460,12 @@ tt.func @cf_if(%i1 : i1) { %a = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> cf.br ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %1 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -477,13 +477,13 @@ tt.func @cf_if_else(%i1 : i1) { %a = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %1 = ttg.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> cf.br ^bb3(%1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory>) ^bb2: // pred: ^bb0 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %2 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %3 = ttg.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> @@ -493,7 +493,7 @@ tt.func @cf_if_else(%i1 : i1) { ^bb4: // pred: ^bb3 // CHECK: ttg.local_load %4 = ttg.local_load %a : !ttg.memdesc<16x16xf16, 
#A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %5 = ttg.local_load %arg : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return @@ -506,13 +506,13 @@ tt.func @cf_if_else_return(%i1 : i1) { %b = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %0 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %1 = ttg.local_load %b : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> tt.return ^bb2: // pred: ^bb0 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %2 = ttg.local_load %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> %3 = ttg.local_load %b : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> -> tensor<16x16xf16, #AL> @@ -521,7 +521,7 @@ tt.func @cf_if_else_return(%i1 : i1) { // CHECK-LABEL: atomic_scalar tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> @@ -536,7 +536,7 @@ tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: ttg.local_load %3 = ttg.local_load %2 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL> tt.return @@ -548,7 +548,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} { // CHECK-LABEL: convert_layout1 tt.func @convert_layout1(%A : !tt.ptr) { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier %0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %1 = ttg.local_load %0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return @@ -560,7 +560,7 @@ tt.func @convert_layout2(%A : !tt.ptr) { %0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %1 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> // CHECK: ttg.local_load - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK: ttg.local_load %3 = ttg.local_load %0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> %4 = ttg.local_load %1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> @@ -572,12 +572,12 @@ tt.func @convert_layout3(%cond : i1) { scf.if %cond { %0 = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #A_SHARED, #ttg.shared_memory, mutable> // CHECK: ttg.local_load - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier %1 = ttg.local_load %0 : !ttg.memdesc<16x64xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x64xf16, #AL> } else { %0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, 
mutable> // CHECK: ttg.local_load - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc %1 = ttg.local_load %0 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> %2 = ttg.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> @@ -587,7 +587,7 @@ tt.func @convert_layout3(%cond : i1) { // CHEKC-LABEL: convert_layout4 tt.func @convert_layout4(%A : !tt.ptr, %cond : i1) { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier scf.if %cond { tt.call @convert_layout3(%cond) : (i1) -> () } else { @@ -602,7 +602,7 @@ tt.func @convert_layout5(%A : !tt.ptr) { %0 = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %1 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> // CHECK: ttg.local_load - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK: ttg.local_load %3 = ttg.local_load %0 : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<32x16xf16, #AL> %4 = ttg.local_load %1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> @@ -613,7 +613,7 @@ tt.func @convert_layout5(%A : !tt.ptr) { tt.func @single_call_sync(%A : !tt.ptr) { %0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK: tt.call - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier tt.call @convert_layout1(%A) : (!tt.ptr) -> () %1 = ttg.convert_layout %0 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> tt.return @@ -622,7 +622,7 @@ tt.func @single_call_sync(%A : !tt.ptr) { // CHECK-LABEL: single_call_no_sync // %1 can reuse %0 in convert_layout2, which has been synced tt.func @single_call_no_sync(%A : !tt.ptr) { - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier %0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> tt.call @convert_layout5(%A) : (!tt.ptr) -> () %1 = ttg.convert_layout %0 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #BL> @@ -645,15 +645,15 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst_ = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: tt.call - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier tt.call @convert_layout1(%A) : (!tt.ptr) -> () %cst1 = ttg.local_alloc %cst_ : (tensor<16x32xf16, #AL>) -> !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory> } else { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK: tt.call - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier tt.call @convert_layout2(%A) : (!tt.ptr) -> () } tt.return @@ -668,7 +668,7 @@ tt.func @for_calls(%A : !tt.ptr, %cond : i1) { %ub = arith.constant 10 : index %step = arith.constant 1 : index scf.for %iv = %lb to %ub step %step { - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK-NEXT: tt.call tt.call @convert_layout1(%A) : (!tt.ptr) -> () } @@ -678,7 +678,7 @@ tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_1 tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, 
#ttg.shared_memory> // CHECK: gpu.barrier + %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> // CHECK: ttg.local_barrier // CHECK-NEXT: tt.call tt.call @convert_layout3(%cond) : (i1) -> () tt.return @@ -689,7 +689,7 @@ tt.func @call_graph_2(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> tt.call @convert_layout4(%A, %cond) : (!tt.ptr, i1) -> () // CHECK: tt.call - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier %cst0 = ttg.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory> tt.return } @@ -710,7 +710,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} { ttg.local_store %c, %alloc : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> // CHECK-NEXT: ttg.convert_layout %cvt = ttg.convert_layout %src : tensor<32x!tt.ptr, #block0> -> tensor<32x!tt.ptr, #block1> - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_load %ld = ttg.local_load %alloc : !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> -> tensor<16x16xf16> tt.return @@ -759,7 +759,7 @@ tt.func @warp_specialize_isolated_regions(%arg0: tensor<1xi64>) { %0 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #layout, #smem, mutable> // CHECK-NEXT: local_store ttg.local_store %arg0, %0 : tensor<1xi64> -> !ttg.memdesc<1xi64, #layout, #smem, mutable> - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_load ttg.local_load %0 : !ttg.memdesc<1xi64, #layout, #smem, mutable> -> tensor<1xi64> @@ -775,7 +775,7 @@ tt.func @warp_specialize_isolated_regions(%arg0: tensor<1xi64>) { %1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #layout, #smem, mutable> // CHECK-NEXT: local_store ttg.local_store %cst, %1 : tensor<1xi64> -> !ttg.memdesc<1xi64, #layout, #smem, mutable> - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_load ttg.local_load %1 : !ttg.memdesc<1xi64, #layout, #smem, mutable> -> tensor<1xi64> // CHECK-NEXT: warp_return @@ -795,11 +795,11 @@ tt.func @warp_specialize_into_default(%arg0: tensor<1xi64>) { ttg.warp_specialize() // CHECK-NEXT: default default { - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_load ttg.local_load %0 : !ttg.memdesc<1xi64, #layout, #smem, mutable> -> tensor<1xi64> - // CHECK-NEXT: barrier - gpu.barrier + // CHECK-NEXT: ttg.local_barrier + ttg.local_barrier // CHECK-NEXT: warp_yield ttg.warp_yield // CHECK-NEXT: () -> () @@ -819,14 +819,14 @@ tt.func @default_region_cfg(%arg0: tensor<1xi64>, %arg1: i1) { ttg.warp_specialize() // CHECK-NEXT: default default { - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_load ttg.local_load %0 : !ttg.memdesc<1xi64, #layout, #smem, mutable> -> tensor<1xi64> cf.cond_br %arg1, ^bb1, ^bb2 // CHECK: ^bb1: ^bb1: - // CHECK-NEXT: barrier - gpu.barrier + // CHECK-NEXT: ttg.local_barrier + ttg.local_barrier cf.br ^bb3 ^bb2: cf.br ^bb3 @@ -836,7 +836,7 @@ tt.func @default_region_cfg(%arg0: tensor<1xi64>, %arg1: i1) { ttg.warp_yield // CHECK-NEXT: () -> () } : () -> () - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_store ttg.local_store %arg0, %0 : tensor<1xi64> -> !ttg.memdesc<1xi64, #layout, #smem, mutable> tt.return @@ -856,7 +856,7 @@ tt.func @direct_backedge_within_loop(%arg0: index, %arg1: index, %arg2: index, % %cst = arith.constant dense<0.000000e+00> : 
tensor<128x32xf16, #blocked> // CHECK-NEXT: local_alloc %0 = ttg.local_alloc %cst : (tensor<128x32xf16, #blocked>) -> !ttg.memdesc<128x32xf16, #shared, #smem> - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_load %1 = ttg.local_load %0 : !ttg.memdesc<128x32xf16, #shared, #smem> -> tensor<128x32xf16, #blocked> // CHECK-NEXT: br @@ -865,14 +865,14 @@ tt.func @direct_backedge_within_loop(%arg0: index, %arg1: index, %arg2: index, % cf.cond_br %arg5, ^bb2, ^bb3 // CHECK: ^bb2: ^bb2: - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_alloc %4 = ttg.local_alloc %cst : (tensor<128x32xf16, #blocked>) -> !ttg.memdesc<128x32xf16, #shared, #smem> // CHECK-NEXT: br cf.br ^bb1(%arg1, %4 : index, !ttg.memdesc<128x32xf16, #shared, #smem>) // CHECK: ^bb3 ^bb3: - // CHECK-NEXT: barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_load %5 = ttg.local_load %3 : !ttg.memdesc<128x32xf16, #shared, #smem> -> tensor<128x32xf16, #blocked> // CHECK-NEXT: cond_br @@ -903,7 +903,7 @@ tt.func @membar_alias_through_warp_specialize() { %c = arith.constant dense<0.0> : tensor<16x16xf16> // CHECK: local_store ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_store ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> ttg.warp_return @@ -915,7 +915,7 @@ tt.func @membar_alias_through_warp_specialize() { %c = arith.constant dense<0.0> : tensor<16x16xf16> // CHECK: local_store ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: local_store ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> ttg.warp_return @@ -924,3 +924,33 @@ tt.func @membar_alias_through_warp_specialize() { } } + +// ----- + +#layout = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [0]}> +#smem = #ttg.shared_memory + +// CHECK-LABEL: @check_barrier_no_duplication +tt.func @check_barrier_no_duplication(%arg0: tensor<1xi64>) { + // CHECK-NEXT: local_alloc + %0 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #layout, #smem, mutable> + // CHECK-NEXT: local_store + ttg.local_store %arg0, %0 : tensor<1xi64> -> !ttg.memdesc<1xi64, #layout, #smem, mutable> + // CHECK-NEXT: warp_specialize + ttg.warp_specialize() + // CHECK-NEXT: default + default { + // CHECK-NEXT: ttg.local_barrier + // CHECK-NEXT: local_load + ttg.local_load %0 : !ttg.memdesc<1xi64, #layout, #smem, mutable> -> tensor<1xi64> + // CHECK-NEXT: gpu.barrier + // CHECK-NOT: gpu.local_barrier + gpu.barrier + // CHECK-NEXT: warp_yield + ttg.warp_yield + // CHECK-NEXT: () -> () + } : () -> () + // CHECK-NEXT: local_store + ttg.local_store %arg0, %0 : tensor<1xi64> -> !ttg.memdesc<1xi64, #layout, #smem, mutable> + tt.return +} diff --git a/test/Conversion/amd/amdgpu_membar.mlir b/test/Conversion/amd/amdgpu_membar.mlir index 780e047000..9eb9b72f0b 100644 --- a/test/Conversion/amd/amdgpu_membar.mlir +++ b/test/Conversion/amd/amdgpu_membar.mlir @@ -22,10 +22,10 @@ tt.func @pipelined_async_copy_local_to_global(%A: !tt.ptr) { // Load into TileB %3 = ttg.async_copy_global_to_local %a_ptr, %tile_b : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> // There should be a single barrier after async_wait 
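+  // i.e. exactly one ttg.local_barrier, placed right after the ttg.async_wait; the CHECK-NOT lines below reject any additional barrier before the wait or before tt.return.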
- // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.async_wait - // CHECK-NEXT: gpu.barrier - // CHECK-NOT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier + // CHECK-NOT: ttg.local_barrier // CHECK: tt.return tt.return } @@ -47,10 +47,10 @@ tt.func @pipelined_async_copy_local_to_global_2(%A: !tt.ptr) { // Read TileA %4 = ttg.local_load %tile_a token %2 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> // There should be a single barrier after async_wait - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.async_wait - // CHECK-NEXT: gpu.barrier - // CHECK-NOT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier + // CHECK-NOT: ttg.local_barrier // CHECK: tt.return tt.return } @@ -86,10 +86,10 @@ tt.func @pipelined_async_copy_local_to_global_3(%A: !tt.ptr, %B: !tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> // There should be a single barrier after async_wait - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.async_wait - // CHECK-NEXT: gpu.barrier - // CHECK-NOT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier + // CHECK-NOT: ttg.local_barrier // CHECK: tt.return tt.return } @@ -109,11 +109,11 @@ tt.func @async_wait_in_previous_loop_iteration(%a_ptr: tensor<16x16x!tt.ptr %6 = ttg.local_load %alloc token %arg10 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> %7 = ttg.async_copy_global_to_local %a_ptr, %alloc : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.async_wait %8 = ttg.async_wait %7 {num = 4 : i32} - // CHECK: gpu.barrier - // CHECK-NOT: gpu.barrier + // CHECK: ttg.local_barrier + // CHECK-NOT: ttg.local_barrier scf.yield %8: !ttg.async.token } // CHECK: tt.return @@ -131,7 +131,7 @@ tt.func @intial_loop_token_is_not_from_async_wait(%a_ptr: tensor<16x16x!tt.ptr (!ttg.async.token) : i32 { %6 = ttg.local_load %alloc token %arg10 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> // CHECK: ttg.local_load - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK: ttg.async_copy_global_to_local %7 = ttg.async_copy_global_to_local %a_ptr, %alloc : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %8 = ttg.async_wait %7 {num = 4 : i32} @@ -153,7 +153,7 @@ tt.func @loop_carried_token_not_from_async_wait(%a_ptr: tensor<16x16x!tt.ptr (!ttg.async.token) : i32 { %6 = ttg.local_load %alloc token %arg10 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> // CHECK: ttg.local_load - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK: ttg.async_copy_global_to_local %7 = ttg.async_copy_global_to_local %a_ptr, %alloc : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> scf.yield %7: !ttg.async.token @@ -176,7 +176,7 @@ tt.func @async_wait_inside_if(%cond: i1, %a_ptr: tensor<16x16x!tt.ptr, #AL> %loop_result:1 = scf.for %arg14 = %c0_i32 to %loopIterCount step %c1_i32 iter_args(%arg10 = %2) -> (!ttg.async.token) : i32 { %6 = ttg.local_load %alloc token %arg10 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> // CHECK: ttg.local_load - // CHECK-NOT: gpu.barrier + // CHECK-NOT: ttg.local_barrier // CHECK: ttg.async_copy_global_to_local %7 = ttg.async_copy_global_to_local %a_ptr, 
%alloc : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %103 = scf.if %cond -> (!ttg.async.token) { @@ -206,7 +206,7 @@ tt.func @non_async_wait_token_from_then(%cond: i1, %a_ptr: tensor<16x16x!tt.ptr< %6 = ttg.local_load %alloc token %arg10 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> // We should get a barrier because the then branch does not yield an token from AsyncWait // CHECK: ttg.local_load - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK: ttg.async_copy_global_to_local %7 = ttg.async_copy_global_to_local %a_ptr, %alloc : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %103 = scf.if %cond -> (!ttg.async.token) { @@ -235,7 +235,7 @@ tt.func @non_async_wait_token_from_else(%cond: i1, %a_ptr: tensor<16x16x!tt.ptr< %6 = ttg.local_load %alloc token %arg10 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL> // We should get a barrier because the else branch does not yield an token from AsyncWait // CHECK: ttg.local_load - // CHECK: gpu.barrier + // CHECK: ttg.local_barrier // CHECK: ttg.async_copy_global_to_local %7 = ttg.async_copy_global_to_local %a_ptr, %alloc : tensor<16x16x!tt.ptr, #AL> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> %103 = scf.if %cond -> (!ttg.async.token) { diff --git a/test/Conversion/amd/async_ops_to_llvm.mlir b/test/Conversion/amd/async_ops_to_llvm.mlir index ccbe05e4f1..edb89d0819 100644 --- a/test/Conversion/amd/async_ops_to_llvm.mlir +++ b/test/Conversion/amd/async_ops_to_llvm.mlir @@ -104,20 +104,25 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar %arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) { // The waitcnt stores all counters in one i32 bits 15:14 and 3:0 store the vmcnt we have to wait on // CHECK: rocdl.s.waitcnt -49168 - // CHECK: rocdl.barrier + // CHECK: rocdl.s.waitcnt -7937 + // CHECK: rocdl.s.barrier ttg.async_wait {num = 0 : i32} // CHECK: rocdl.s.waitcnt -49167 - // CHECK: rocdl.barrier + // CHECK: rocdl.s.waitcnt -7937 + // CHECK: rocdl.s.barrier ttg.async_wait {num = 1 : i32} // CHECK: rocdl.s.waitcnt -2 - // CHECK: rocdl.barrier + // CHECK: rocdl.s.waitcnt -7937 + // CHECK: rocdl.s.barrier ttg.async_wait {num = 62 : i32} // CHECK: rocdl.s.waitcnt -1 - // CHECK: rocdl.barrier + // CHECK: rocdl.s.waitcnt -7937 + // CHECK: rocdl.s.barrier ttg.async_wait {num = 63 : i32} // Check that we clamp values > 63 // CHECK: rocdl.s.waitcnt -1 - // CHECK: rocdl.barrier + // CHECK: rocdl.s.waitcnt -7937 + // CHECK: rocdl.s.barrier ttg.async_wait {num = 64 : i32} tt.return } diff --git a/test/Conversion/amd/load_store.mlir b/test/Conversion/amd/load_store.mlir index 596de97d61..7f60856b44 100644 --- a/test/Conversion/amd/load_store.mlir +++ b/test/Conversion/amd/load_store.mlir @@ -30,7 +30,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} { // ----- -#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [1, 1], instrShape = [16, 16, 16], isTransposed = true}> +#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [1, 1], instrShape = [16, 16, 4], isTransposed = true}> module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} { // CHECK-LABEL: global_store_mfma_vec16 tt.func public @global_store_mfma_vec16(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) { diff --git a/test/Conversion/amd/tritongpu_wmma_dot_scaled_to_llvm.mlir 
b/test/Conversion/amd/tritongpu_wmma_dot_scaled_to_llvm.mlir new file mode 100644 index 0000000000..bceb77fde2 --- /dev/null +++ b/test/Conversion/amd/tritongpu_wmma_dot_scaled_to_llvm.mlir @@ -0,0 +1,38 @@ +// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s + +#linear = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[0, 0], [16, 0]], block = []}> +#linear1 = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[16, 0], [0, 0]], block = []}> +#mma = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape=[16, 16, 128]}> +#mma1 = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape=[16, 16, 64]}> + +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: wmma_scaled_dot_fp4 + tt.func @wmma_scaled_dot_fp4(%arg0: tensor<16x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x4xi8, #linear>, %arg2: tensor<64x16xi8, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg3: tensor<16x4xi8, #linear1>, %out0: !tt.ptr {tt.divisibility = 16 : i32}) { + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + // Matrix A + // CHECK-COUNT-32: llvm.extractvalue {{.*}} : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)> + // CHECK-COUNT-32: llvm.insertelement {{.*}} : vector<32xi8> + // CHECK: llvm.bitcast {{.*}} : vector<32xi8> to vector<8xi32> + // Matrix B + // CHECK-COUNT-32: llvm.extractvalue {{.*}} : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)> + // CHECK-COUNT-32: llvm.insertelement {{.*}} : vector<32xi8> + // CHECK: llvm.bitcast {{.*}} : vector<32xi8> to vector<8xi32> + // Scale A + // CHECK-COUNT-2: llvm.extractvalue {{.*}} : !llvm.struct<(i8, i8, i8, i8)> + // CHECK-COUNT-2: llvm.insertelement {{.*}} : vector<4xi8> + // CHECK: llvm.bitcast {{.*}} : vector<4xi8> to i32 + // Scale B + // CHECK-COUNT-2: llvm.extractvalue {{.*}} : !llvm.struct<(i8, i8, i8, i8)> + // CHECK-COUNT-2: llvm.insertelement {{.*}} : vector<4xi8> + // CHECK: llvm.bitcast {{.*}} : vector<4xi8> to i32 + // Matrix C + // CHECK-COUNT-8: llvm.insertelement {{.*}} : vector<8xf32> + // CHECK: llvm.call_intrinsic "llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4"{{.*}} : (i32, vector<8xi32>, i32, vector<8xi32>, i16, vector<8xf32>, i32, i32, i32, i32, i32, i32, i1, i1) -> vector<8xf32> + %c = tt.dot_scaled %arg0 scale %arg1, %arg2 scale %arg3, %cst lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<16x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, tensor<16x4xi8, #linear> * tensor<64x16xi8, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, tensor<16x4xi8, #linear1> -> tensor<16x16xf32, #mma> + // CHECK-COUNT-8: llvm.extractelement {{.*}} : vector<8xf32> + // CHECK-COUNT-8: llvm.insertelement {{.*}} : vector<1xf32> + %ptr0 = tt.splat %out0 : !tt.ptr -> tensor<16x16x!tt.ptr, #mma> + tt.store %ptr0, %c : tensor<16x16x!tt.ptr, #mma> + tt.return + } +} diff --git a/test/TritonGPU/amd/accelerate-amd-matmul-wmma-gfx1250.mlir b/test/TritonGPU/amd/accelerate-amd-matmul-wmma-gfx1250.mlir new file mode 100644 index 0000000000..2a521f7475 --- /dev/null +++ 
b/test/TritonGPU/amd/accelerate-amd-matmul-wmma-gfx1250.mlir @@ -0,0 +1,34 @@ +// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx1250 matrix-instruction-size=16" | FileCheck %s --check-prefixes CHECK + +#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[0, 0], [16, 0]], block = []}> +// CHECK{LITERAL}: #linear1 = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[16, 0], [0, 0]], block = []}> +// CHECK{LITERAL}: #mma = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape = [16, 16, 128]}> +// CHECK{LITERAL}: #mma1 = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape = [16, 16, 64]}> +// CHECK-LABEL: wmma_dot_scaled_mxfp4_mxfp4 +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @wmma_dot_scaled_mxfp4_mxfp4( + %arg0: tensor<32x64xi8, #blocked>, + %arg1: tensor<64x32xi8, #blocked1>, + %arg2: tensor<32x4xi8, #blocked2>, + %arg3: tensor<32x4xi8, #blocked2>, + %arg4: tensor<32x32x!tt.ptr, #blocked3> + ) { + // CHECK-NOT: arith.constant dense<127> : tensor<32x4xi8, #linear> + // CHECK-NOT: arith.constant dense<127> : tensor<32x4xi8, #linear1> + // CHECK-NOT: tt.fp_to_fp + // CHECK: %[[C:.+]] = ttg.convert_layout {{.*}} : tensor<32x32xf32, #blocked3> -> tensor<32x32xf32, #mma> + // CHECK: %[[A:.+]] = ttg.convert_layout {{.*}} : tensor<32x64xi8, #blocked> -> tensor<32x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> + // CHECK: %[[B:.+]] = ttg.convert_layout {{.*}} : tensor<64x32xi8, #blocked1> -> tensor<64x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> + // CHECK: %[[SCALE0:.+]] = ttg.convert_layout {{.*}} : tensor<32x4xi8, #blocked2> -> tensor<32x4xi8, #linear> + // CHECK: %[[SCALE1:.+]] = ttg.convert_layout {{.*}} : tensor<32x4xi8, #blocked2> -> tensor<32x4xi8, #linear1> + // CHECK: tt.dot_scaled %[[A]] scale %[[SCALE0]], %[[B]] scale %[[SCALE1]], %[[C]] lhs = e2m1 rhs = e2m1 + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked3> + %1 = tt.dot_scaled %arg0 scale %arg2, %arg1 scale %arg3, %cst lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<32x64xi8, #blocked>, tensor<32x4xi8, #blocked2> * tensor<64x32xi8, #blocked1>, tensor<32x4xi8, #blocked2> -> tensor<32x32xf32, #blocked3> + tt.store %arg4, %1 : tensor<32x32x!tt.ptr, #blocked3> + tt.return + } +} diff --git a/test/TritonGPU/amd/amd-convert-buffer-ops-small-tensor.mlir b/test/TritonGPU/amd/amd-convert-buffer-ops-small-tensor.mlir new file mode 100644 index 0000000000..2b146c4c51 --- /dev/null +++ b/test/TritonGPU/amd/amd-convert-buffer-ops-small-tensor.mlir @@ -0,0 +1,616 @@ +// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx942 analyze-small-tensor-ofst=false" | FileCheck %s --check-prefixes=COMMON,GFX942-ONLY +// RUN: triton-opt %s -split-input-file 
--tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx950 analyze-small-tensor-ofst=false" | FileCheck %s --check-prefixes=COMMON,GFX950-ONLY + +////////////////////////////////////////////////////////////////////////////// +// +// This file contains lit tests primarily for buffer-ops conversion for +// small tensors (size <= 2G) with analyze-small-tensor-ofst being off +// (the default value). +// +// The initial revision of this file was copied from amd-convert-buffer-ops.mlir +// with the following changes: +// - some completely irrelevant tests were removed +// - some tests were slightly modified to demonstrate that some conversions +// can be done with skip-small-tensor-ofst-analysis=false +// +// TODO: some tests still need polishing to make them more relevant to the +// small-tensor-offset related optimization. Regardless, there is no harm in +// keeping them. +// +////////////////////////////////////////////////////////////////////////////// +// +#blocked0 = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} { + // COMMON-LABEL: simple + tt.func @simple(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 :i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) { + %c256_i32 = arith.constant 256 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c256_i32 : i32 + %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> + %3 = tt.splat %1 : i32 -> tensor<256xi32, #blocked0> + // COMMON: %[[offset:.*]] = arith.addi + %4 = arith.addi %3, %2 : tensor<256xi32, #blocked0> + %5 = tt.splat %arg0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked0> + %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> + %7 = tt.splat %arg1 : !tt.ptr -> tensor<256x!tt.ptr, #blocked0> + %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> + // COMMON: buffer_load %arg0[%[[offset]]] + %9 = tt.load %6 : tensor<256x!tt.ptr, #blocked0> + // COMMON: buffer_load %arg1[%[[offset]]] + %10 = tt.load %8 : tensor<256x!tt.ptr, #blocked0> + // COMMON: %[[data:.*]] = arith.addf + %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0> + %12 = tt.splat %arg2 : !tt.ptr -> tensor<256x!tt.ptr, #blocked0> + %13 = tt.addptr %12, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> + // COMMON: buffer_store %[[data]], %arg2[%[[offset]]] + tt.store %13, %11 : tensor<256x!tt.ptr, #blocked0> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: assume_positive_offset + tt.func @assume_positive_offset(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<1024xf32, #blocked>{ + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %sub = arith.subi %1, %c128_i32 : i32 + %cmp = arith.cmpi sgt, %sub, %c0_i32 : i32 + llvm.intr.assume %cmp : i1 + %2 = tt.splat %sub : i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + // COMMON: %[[offset:.*]] = arith.addi + %4 = arith.addi %2, %3 : tensor<1024xi32, #blocked> + // COMMON: %[[scalar_ptr:.*]] = tt.addptr %arg0 + %5 =
tt.addptr %arg0, %1 : !tt.ptr, i32 + %8 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // COMMON: buffer_load %[[scalar_ptr]][%[[offset]]] + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: offset_64_bits + tt.func @offset_64_bits(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<1024xf32, #blocked> { + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %sub = arith.subi %1, %c128_i32 : i32 + %2 = tt.splat %sub : i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %ext2 = arith.extsi %2 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %ext3 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %4 = arith.addi %ext2, %ext3 : tensor<1024xi64, #blocked> + %5 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %8 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> + // COMMON: tt.load + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: offset_64_bits_narrow + tt.func public @offset_64_bits_narrow(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) -> tensor<1024xf32, #blocked> { + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.splat %1: i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %ext2 = arith.extsi %2 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %ext3 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %4 = arith.addi %ext2, %ext3 : tensor<1024xi64, #blocked> + // COMMON: %[[scalar_ptr:.*]] = tt.addptr %arg0 + %5 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %8 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + // COMMON: %[[offset_32_bit:.*]] = arith.trunci + %narrow4 = arith.trunci %4 : tensor<1024xi64, #blocked> to tensor <1024xi32, #blocked> + %9 = tt.addptr %8, %narrow4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // COMMON: buffer_load %[[scalar_ptr]][%[[offset_32_bit]]] + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- +// NOTE: compared to @non_canonical_ptr in amd-convert-buffer-ops.mlir, the load +// can be converted to buffer-loads. 
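+// (Presumably this is because, with analyze-small-tensor-ofst=false, the tensor is assumed to be small (size <= 2G), so the conversion no longer needs to reason about the sign or range of the %arg1 offsets.)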
+ +#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: non_canonical_ptr + tt.func @non_canonical_ptr(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: tensor<1024xi32, #blocked>) -> tensor<1024xf32, #blocked>{ + %8 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %arg1: tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // COMMON: buffer_load + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- + +// NOTE: compared to @assume_eq_non_neg in amd-convert-buffer-ops.mlir, +// tt.load and tt.store can be converted without tl.assume. + +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: assume_eq_non_neg + tt.func @assume_eq_non_neg(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: i32) { + %c10_i32 = arith.constant 10 : i32 + // COMMON: %[[range:.*]] = tt.make_range + %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked> + // COMMON: %[[ptr:.*]] = tt.addptr %arg0, %arg2 + %2 = tt.addptr %arg0, %arg2: !tt.ptr, i32 + %3 = tt.splat %2 : !tt.ptr -> tensor<16x!tt.ptr, #blocked> + %4 = tt.addptr %3, %1 : tensor<16x!tt.ptr, #blocked>, tensor<16xi32, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<16x!tt.ptr, #blocked> + %6 = tt.addptr %5, %1 : tensor<16x!tt.ptr, #blocked>, tensor<16xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg1[%[[range]]] + %7 = tt.load %6 : tensor<16x!tt.ptr, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %[[ptr]][%[[range]]] + tt.store %4, %7 : tensor<16x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +// NOTE: compared to @assume_nonneg_less in amd-convert-buffer-ops.mlir, +// the tl.assume calls are removed. + +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: assume_nonneg_less + tt.func @assume_nonneg_less(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: i32) { + %c10_i32 = arith.constant 5 : i32 + // %0 = arith.cmpi slt, %c10_i32, %arg2 : i32 + // llvm.intr.assume %0 : i1 + // COMMON: %[[range:.*]] = tt.make_range + %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked> + // COMMON: %[[ptr:.*]] = tt.addptr %arg0, %arg2 + %2 = tt.addptr %arg0, %arg2: !tt.ptr, i32 + %3 = tt.splat %2 : !tt.ptr -> tensor<16x!tt.ptr, #blocked> + %4 = tt.addptr %3, %1 : tensor<16x!tt.ptr, #blocked>, tensor<16xi32, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<16x!tt.ptr, #blocked> + %6 = tt.addptr %5, %1 : tensor<16x!tt.ptr, #blocked>, tensor<16xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg1[%[[range]]] + %7 = tt.load %6 : tensor<16x!tt.ptr, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %[[ptr]][%[[range]]] + tt.store %4, %7 : tensor<16x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +// NOTE: compared to @assume_cmp_non_const in amd-convert-buffer-ops.mlir, +// most of the tl.assume calls are removed.
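+// (Presumably the commented-out llvm.intr.assume facts were what the original test needed to prove non-negative offsets; the COMMON checks below still expect amdgpu.buffer_load and amdgpu.buffer_store without them.)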
+ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: assume_cmp_non_const + tt.func @assume_cmp_non_const(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: i32, %arg3 : i32, %arg4 : i32, %arg5 : i32, %arg6 : i32) { + %0 = arith.cmpi sgt, %arg2, %arg3 : i32 + llvm.intr.assume %0 : i1 + %1 = arith.subi %arg2, %arg3 : i32 + %2 = arith.cmpi sge, %1, %arg4 : i32 + // llvm.intr.assume %2 : i1 + %3 = arith.subi %1, %arg4 : i32 + %4 = arith.cmpi slt, %3, %arg5 : i32 + // llvm.intr.assume %4 : i1 + %5 = arith.subi %arg5, %3 : i32 + %6 = arith.cmpi sle, %5, %arg6 : i32 + // llvm.intr.assume %6 : i1 + %7 = arith.subi %arg6, %5 : i32 + %8 = arith.minsi %1, %3 : i32 + %9 = arith.minsi %8, %5 : i32 + %10 = arith.minsi %9, %7 : i32 + // COMMON: %[[range:.*]] = tt.make_range + %11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked> + %12 = tt.splat %10 : i32 -> tensor<16xi32, #blocked> + // COMMON: %[[offsets:.*]] = arith.addi + %offsets = arith.addi %11, %12 : tensor<16xi32, #blocked> + %13 = tt.splat %arg0 : !tt.ptr -> tensor<16x!tt.ptr, #blocked> + %14 = tt.addptr %13, %11 : tensor<16x!tt.ptr, #blocked>, tensor<16xi32, #blocked> + %15 = tt.splat %arg1 : !tt.ptr -> tensor<16x!tt.ptr, #blocked> + %16 = tt.addptr %15, %offsets : tensor<16x!tt.ptr, #blocked>, tensor<16xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg1[%[[offsets]]] + %17 = tt.load %16 : tensor<16x!tt.ptr, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg0[%[[range]]] + tt.store %14, %17 : tensor<16x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#blockedtrans = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> +#blocked1 = #ttg.slice<{dim=0, parent=#blocked}> +#blocked2 = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: unary_triton_ops_transitive_nonneg + tt.func @unary_triton_ops_transitive_nonneg(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) { + %c10_i32 = arith.constant 5 : i32 + %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked1> + %1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<16xi32, #blocked1> -> tensor<1x16xi32, #blocked> + %2 = tt.reshape %1 allow_reorder : tensor<1x16xi32, #blocked> -> tensor<8x2xi32, #blocked> + %3 = tt.reshape %1 allow_reorder : tensor<1x16xi32, #blocked> -> tensor<2x8xi32, #blocked> + %4 = tt.trans %3 {order = array} : tensor<2x8xi32, #blocked> -> tensor<8x2xi32, #blockedtrans> + %5 = ttg.convert_layout %4 : tensor<8x2xi32, #blockedtrans> -> tensor<8x2xi32, #blocked> + %6 = arith.addi %5, %2 : tensor<8x2xi32, #blocked> + %7 = tt.make_range {end = 10 : i32, start = 2 : i32} : tensor<8xi32, #blocked2> + %8 = ttg.convert_layout %7 : tensor<8xi32, #blocked2> -> tensor<8xi32, #blocked1> + %9 = tt.expand_dims %8 {axis = 0 : i32} : tensor<8xi32, #blocked1> -> tensor<1x8xi32, #blocked> + %10 = tt.broadcast %9 : tensor<1x8xi32, #blocked> -> tensor<2x8xi32, #blocked> + %11 = 
tt.reshape %10 allow_reorder : tensor<2x8xi32, #blocked> -> tensor<8x2xi32, #blocked> + %12 = tt.splat %c10_i32 : i32 -> tensor<8x2xi32, #blocked> + %13 = arith.addi %11, %12 : tensor<8x2xi32, #blocked> + %14 = arith.minsi %13, %5 : tensor<8x2xi32, #blocked> + // COMMON: %[[lhs:.*]], %[[rhs:.*]] = tt.split + %15, %16 = tt.split %11: tensor<8x2xi32, #blocked> -> tensor<8xi32, #blocked2> + %17 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked2> + %18 = tt.addptr %17, %15 : tensor<8x!tt.ptr, #blocked2>, tensor<8xi32, #blocked2> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%[[lhs]]] + %19 = tt.load %18 : tensor<8x!tt.ptr, #blocked2> + %20 = tt.addptr %17, %16 : tensor<8x!tt.ptr, #blocked2>, tensor<8xi32, #blocked2> + // COMMON: %[[loaded2:.*]] = amdgpu.buffer_load %arg0[%[[rhs]]] + %21 = tt.load %20 : tensor<8x!tt.ptr, #blocked2> + // COMMON: %[[added:.*]] = arith.addf %[[loaded]], %[[loaded2]] + %22 = arith.addf %19, %21 : tensor<8xbf16, #blocked2> + %23 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked2> + %24 = tt.addptr %23, %7 : tensor<8x!tt.ptr, #blocked2>, tensor<8xi32, #blocked2> + // COMMON: amdgpu.buffer_store %[[added]], %arg1[%{{.*}}] + tt.store %24, %22 : tensor<8x!tt.ptr, #blocked2> + tt.return + } +} + +// ----- + + +#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: join_cat_transitive_nonneg + tt.func @join_cat_transitive_nonneg(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) { + %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked1> + %1 = tt.make_range {end = 10 : i32, start = 2 : i32} : tensor<8xi32, #blocked1> + %2 = tt.join %0, %1 : tensor<8xi32, #blocked1> -> tensor<8x2xi32, #blocked> + %3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #blocked1> + %4 = tt.make_range {end = 8 : i32, start = 4 : i32} : tensor<4xi32, #blocked1> + %5 = tt.join %3, %4 : tensor<4xi32, #blocked1> -> tensor<4x2xi32, #blocked> + %6 = tt.cat %5, %5 : tensor<4x2xi32, #blocked> -> tensor<8x2xi32, #blocked> + %7 = arith.addi %2, %6 : tensor<8x2xi32, #blocked> + %zeros = arith.constant dense<0> : tensor<8x1xi32, #blocked> + %ones = arith.constant dense<1> : tensor<8x1xi32, #blocked> + %8 = tt.gather %7[%zeros] {axis = 1 : i32} : (tensor<8x2xi32, #blocked>, tensor<8x1xi32, #blocked>) -> tensor<8x1xi32, #blocked> + %9 = tt.gather %7[%ones] {axis = 1 : i32} : (tensor<8x2xi32, #blocked>, tensor<8x1xi32, #blocked>) -> tensor<8x1xi32, #blocked> + %10 = arith.addi %8, %9 : tensor<8x1xi32, #blocked> + %11 = tt.reshape %10 allow_reorder : tensor<8x1xi32, #blocked> -> tensor<8xi32, #blocked1> + %12 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked1> + %14 = tt.addptr %12, %11 : tensor<8x!tt.ptr, #blocked1>, tensor<8xi32, #blocked1> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %15 = tt.load %14 : tensor<8x!tt.ptr, #blocked1> + %16 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked1> + %17 = tt.addptr %16, %0 : tensor<8x!tt.ptr, #blocked1>, tensor<8xi32, #blocked1> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %17, %15 : tensor<8x!tt.ptr, #blocked1> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], 
threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: histo_nonneg + tt.func @histo_nonneg(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2 : tensor<256xi32, #blocked>) { + /// Purposely specify %arg2 so that we can't statically determine the input + /// data is nonneg. + // COMMON: tt.histogram + %0 = tt.histogram %arg2 : tensor<256xi32, #blocked> -> tensor<8xi32, #blocked> + %1 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %2 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %3 = tt.addptr %2, %0 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %4 = tt.load %3 : tensor<8x!tt.ptr, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %6 = tt.addptr %5, %1 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %6, %4 : tensor<8x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: get_num_prog_nonneg + tt.func @get_num_prog_nonneg(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2 : i32) { + %0 = tt.get_num_programs x : i32 + %1 = tt.get_num_programs y : i32 + %2 = tt.get_num_programs z : i32 + %3 = arith.minsi %0, %1 : i32 + %4 = arith.minsi %2, %3 : i32 + %5 = arith.maxsi %arg2, %4 : i32 + %6 = tt.splat %5 : i32 -> tensor<8xi32, #blocked> + %7 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %8 = arith.addi %6, %7 : tensor<8xi32, #blocked> + %9 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %10 = tt.addptr %9, %8 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %11 = tt.load %10 : tensor<8x!tt.ptr, #blocked> + %12 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %13 = tt.addptr %12, %7 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %13, %11 : tensor<8x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: unsigned_ops + tt.func @unsigned_ops(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2 : i32, %arg3 : i32, %arg4 : f32) { + %c5_i32 = arith.constant 5 : i32 + %0 = arith.ceildivui %arg2, %c5_i32 : i32 + %1 = arith.divui %arg3, %c5_i32 : i32 + %2 = arith.fptoui %arg4 : f32 to i32 + %4 = arith.maxui %arg2, %arg3 : i32 + %5 = arith.minui %arg2, %arg3 : i32 + %6 = arith.remui %arg2, %c5_i32 : i32 + %7 = arith.shrui %arg3, %c5_i32 : i32 + %8 = arith.addi %0, %1 : i32 + %10 = arith.addi %4, %5 : i32 + %11 = arith.addi %6, %7 : i32 + %12 = arith.addi %8, %2 : i32 + %13 = arith.addi %10, %11 : i32 + %14 = arith.addi %8, %13 : i32 + %15 = tt.splat %14 : i32 -> tensor<8xi32, #blocked> + %16 = tt.make_range {end = 
8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %17 = arith.addi %15, %16 : tensor<8xi32, #blocked> + %18 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %19 = tt.addptr %18, %17 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %20 = tt.load %19 : tensor<8x!tt.ptr, #blocked> + %21 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %22 = tt.addptr %21, %16 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %22, %20 : tensor<8x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: extui_nonneg + tt.func @extui_nonneg(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2 : i32) { + %0 = arith.extui %arg2 : i32 to i64 + %1 = tt.splat %0 : i64 -> tensor<8xi64, #blocked> + %2 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %3 = arith.extui %2 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %4 = arith.addi %1, %3 : tensor<8xi64, #blocked> + %5 = arith.trunci %4 : tensor<8xi64, #blocked> to tensor<8xi32, #blocked> + %6 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %8 = tt.load %7: tensor<8x!tt.ptr, #blocked> + %9 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %10 = tt.addptr %9, %2 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %10, %8 : tensor<8x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: traverse_if + tt.func @traverse_if(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2 : i32, %arg3 : i32) { + %c0_i32 = arith.constant 0 : i32 + %c2_i32 = arith.constant 2 : i32 + %c5_i32 = arith.constant 7 : i32 + %c7_i32 = arith.constant 5 : i32 + %0 = arith.extui %arg2 : i32 to i64 + %1 = arith.remui %arg2, %c2_i32 : i32 + %2 = arith.cmpi eq, %1, %c0_i32 : i32 + %3 = scf.if %2 -> tensor<8xi64, #blocked> { + %20 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %21 = arith.extui %20 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %22 = tt.splat %arg3 : i32 -> tensor<8xi32, #blocked> + %23 = arith.extui %22 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %24 = arith.addi %21, %23 : tensor<8xi64, #blocked> + scf.yield %24 : tensor<8xi64, #blocked> + } else { + %30 = tt.make_range {end = 16 : i32, start = 8 : i32} : tensor<8xi32, #blocked> + %31 = arith.extui %30 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %32 = tt.splat %0 : i64 -> tensor<8xi64, #blocked> + %33 = arith.addi %31, %32 : tensor<8xi64, #blocked> + scf.yield %33 : tensor<8xi64, #blocked> + } + %4 = arith.trunci %3 : tensor<8xi64, #blocked> to tensor<8xi32, #blocked> + %5 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : 
tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %7 = tt.load %6: tensor<8x!tt.ptr, #blocked> + %8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %9 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %10 = tt.addptr %9, %8 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %10, %7 : tensor<8x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: traverse_if + tt.func @traverse_if(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2 : i32, %arg3 : i32) { + %c0_i32 = arith.constant 0 : i32 + %c2_i32 = arith.constant 2 : i32 + %c5_i32 = arith.constant 7 : i32 + %c7_i32 = arith.constant 5 : i32 + %zeros = arith.constant dense<0> : tensor<8xi32, #blocked> + %0 = arith.extui %arg2 : i32 to i64 + %1 = arith.remui %arg2, %c2_i32 : i32 + %2 = arith.cmpi eq, %1, %c0_i32 : i32 + %3, %4 = scf.if %2 -> (tensor<8xi64, #blocked>, tensor<8xi32, #blocked>) { + %20 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %21 = arith.extui %20 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %22 = tt.splat %arg3 : i32 -> tensor<8xi32, #blocked> + %23 = arith.extui %22 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %24 = arith.addi %21, %23 : tensor<8xi64, #blocked> + %25 = tt.make_range {end = 9 : i32, start = 1 : i32} : tensor<8xi32, #blocked> + scf.yield %24, %25 : tensor<8xi64, #blocked>, tensor<8xi32, #blocked> + } else { + %30 = tt.make_range {end = 16 : i32, start = 8 : i32} : tensor<8xi32, #blocked> + %31 = arith.extui %30 : tensor<8xi32, #blocked> to tensor<8xi64, #blocked> + %32 = tt.splat %0 : i64 -> tensor<8xi64, #blocked> + %33 = arith.addi %31, %32 : tensor<8xi64, #blocked> + scf.yield %33, %zeros : tensor<8xi64, #blocked>, tensor<8xi32, #blocked> + } + %5 = arith.trunci %3 : tensor<8xi64, #blocked> to tensor<8xi32, #blocked> + %6 = arith.addi %4, %5 : tensor<8xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %8 = tt.addptr %7, %6 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_load %arg0[%{{.*}}] + %9 = tt.load %8: tensor<8x!tt.ptr, #blocked> + %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %11 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %12 = tt.addptr %11, %10 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + // COMMON: amdgpu.buffer_store %[[loaded]], %arg1[%{{.*}}] + tt.store %12, %9 : tensor<8x!tt.ptr, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} { + // COMMON-LABEL: atomic_add_bf16 + tt.func public @atomic_add_bf16(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) { + %cst = arith.constant dense : tensor<512xi1, #blocked> + %cst_0 = arith.constant dense<1.000000e+00> : tensor<512xbf16, #blocked> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, 
%c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %4 = tt.splat %3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> + %5 = tt.addptr %4, %2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + // GFX942-ONLY-NOT: amdgpu.buffer_atomic_rmw + // GFX950-ONLY: amdgpu.buffer_atomic_rmw + %6 = tt.atomic_rmw fadd, acq_rel, gpu, %5, %cst_0, %cst : (tensor<512x!tt.ptr, #blocked>, tensor<512xbf16, #blocked>, tensor<512xi1, #blocked>) -> tensor<512xbf16, #blocked> + tt.return + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: assume_positive_offset_buffer_atomic + tt.func @assume_positive_offset_buffer_atomic(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>{ + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %sub = arith.subi %1, %c128_i32 : i32 + %cmp = arith.cmpi sgt, %sub, %c0_i32 : i32 + llvm.intr.assume %cmp : i1 + %2 = tt.splat %sub : i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + // COMMON: %[[offset:.*]] = arith.addi + %4 = arith.addi %2, %3 : tensor<1024xi32, #blocked> + // COMMON: %[[scalar_ptr:.*]] = tt.addptr %arg0 + %5 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %6 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // COMMON: %[[loaded:.*]] = amdgpu.buffer_atomic_rmw fadd, acq_rel, gpu, %arg1, %[[scalar_ptr]][%[[offset]]] + %8 = tt.atomic_rmw fadd, acq_rel, gpu, %7, %arg1 : (tensor<1024x!tt.ptr, #blocked>, tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked> + tt.return %8 : tensor<1024xf32, #blocked> + } +} + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [2, 2], order = [1, 0]}> + +module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} { + tt.func @extract_slice(%arg0: !tt.ptr) -> tensor<128x256xf32, #blocked> { + %0 = arith.constant dense<0> : tensor<256x256xi64, #blocked> + %1 = amdgpu.extract_slice %0 [0, 0] : tensor<256x256xi64, #blocked> to tensor<128x256xi64, #blocked> + %2 = arith.trunci %1 : tensor<128x256xi64, #blocked> to tensor<128x256xi32, #blocked> + %3 = tt.splat %arg0 : !tt.ptr -> tensor<128x256x!tt.ptr, #blocked> + %4 = tt.addptr %3, %2 : tensor<128x256x!tt.ptr, #blocked>, tensor<128x256xi32, #blocked> + %5 = tt.load %4 : tensor<128x256x!tt.ptr, #blocked> + tt.return %5 : tensor<128x256xf32, #blocked> + } +} + +// COMMON-LABEL: tt.func @extract_slice( +// COMMON-SAME: %[[ARG_0:.*]]: !tt.ptr) -> tensor<128x256xf32, #blocked> { +// COMMON: %[[VAR_0:.*]] = arith.constant dense<0> : tensor<256x256xi64, #blocked> +// COMMON: %[[VAR_1:.*]] = amdgpu.extract_slice %[[VAR_0]] [0, 0] : tensor<256x256xi64, #blocked> to tensor<128x256xi64, #blocked> +// COMMON: %[[VAR_2:.*]] = arith.trunci %[[VAR_1]] : tensor<128x256xi64, #blocked> to tensor<128x256xi32, #blocked> +// COMMON: %[[VAR_3:.*]] = amdgpu.buffer_load %[[ARG_0]][%[[VAR_2]]] : tensor<128x256xf32, #blocked> +// COMMON: tt.return %[[VAR_3]] : tensor<128x256xf32, 
#blocked> +// COMMON: } + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} { + // COMMON-LABEL: buffer_atomic_cas_i64 + tt.func public @buffer_atomic_cas_i64(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32} , %arg1: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) { + // COMMON: %[[val:.*]] = arith.constant dense<2> + %cst = arith.constant dense<2> : tensor<1024xi64, #blocked> + // COMMON: %[[cmp:.*]] = arith.constant dense<0> + %cst_0 = arith.constant dense<0> : tensor<1024xi64, #blocked> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + // COMMON: %[[offset:.*]] = tt.make_range + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + // COMMON: %[[scalar_ptr:.*]] = tt.addptr %arg0 + %3 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %4 = tt.splat %3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %5 = tt.addptr %4, %2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // COMMON: amdgpu.buffer_atomic_cas acq_rel, gpu, %[[cmp]], %[[val]], %[[scalar_ptr]][%[[offset]]] + %6 = tt.atomic_cas acq_rel, gpu, %5, %cst_0, %cst : (tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked>, tensor<1024xi64, #blocked>) -> tensor<1024xi64, #blocked> + %7 = tt.addptr %arg1, %1 : !tt.ptr, i32 + %8 = tt.splat %7 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %9, %6 : tensor<1024x!tt.ptr, #blocked> + tt.return + } +} diff --git a/test/TritonGPU/amd/amd-convert-buffer-ops.mlir b/test/TritonGPU/amd/amd-convert-buffer-ops.mlir index d763cc5fda..7b04877a94 100644 --- a/test/TritonGPU/amd/amd-convert-buffer-ops.mlir +++ b/test/TritonGPU/amd/amd-convert-buffer-ops.mlir @@ -1,5 +1,5 @@ -// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx942"| FileCheck %s --check-prefixes=COMMON,GFX942-ONLY -// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx950"| FileCheck %s --check-prefixes=COMMON,GFX950-ONLY +// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx942 analyze-small-tensor-ofst=true"| FileCheck %s --check-prefixes=COMMON,GFX942-ONLY +// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx950 analyze-small-tensor-ofst=true"| FileCheck %s --check-prefixes=COMMON,GFX950-ONLY #blocked0 = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} { diff --git a/test/TritonGPU/amd/mfma-xf32.mlir b/test/TritonGPU/amd/mfma-xf32.mlir index 200fe7c567..5505fe777c 100644 --- a/test/TritonGPU/amd/mfma-xf32.mlir +++ b/test/TritonGPU/amd/mfma-xf32.mlir @@ -4,7 +4,7 @@ #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}> #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}> +#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = 
[16, 16, 8], isTransposed = true}> module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} { tt.func public @mfma_xf32( %arg0: tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>, @@ -24,7 +24,7 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}> #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}> +#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 4], isTransposed = true}> module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} { tt.func public @mfma_not_xf32( %arg0: tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>, diff --git a/test/TritonNvidiaGPU/membar.mlir b/test/TritonNvidiaGPU/membar.mlir index d9e3e4d246..6e29600e62 100644 --- a/test/TritonNvidiaGPU/membar.mlir +++ b/test/TritonNvidiaGPU/membar.mlir @@ -6,7 +6,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // CHECK-LABEL: init_barrier // CHECK: local_alloc - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: init_barrier tt.func @init_barrier() { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> @@ -24,9 +24,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // CHECK-LABEL: inval_barrier // CHECK: local_alloc - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: init_barrier - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: inval_barrier tt.func @inval_barrier() { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> @@ -45,9 +45,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // CHECK-LABEL: barrier_expect // CHECK: local_alloc - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: init_barrier - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: barrier_expect tt.func @barrier_expect(%pred : i1) { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> @@ -66,9 +66,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // CHECK-LABEL: wait_barrier // CHECK: local_alloc - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: init_barrier - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: wait_barrier tt.func @wait_barrier(%phase : i32) { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> @@ -91,7 +91,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ // CHECK: local_dealloc // CHECK-NEXT: local_alloc // CHECK-NEXT: local_alloc - // CHECK-NEXT: gpu.barrier + // CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: init_barrier %cst = arith.constant dense<0> : tensor<128x64xi64, #blocked0> %alloc = ttg.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !ttg.memdesc<128x64xi64, #shared0, #smem, mutable> @@ -110,7 +110,7 @@ 
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ // CHECK-LABEL: tma_store // CHECK: ttg.local_alloc // CHECK-NEXT: ttg.local_dealloc -// CHECK-NEXT: gpu.barrier +// CHECK-NEXT: ttg.local_barrier // CHECK-NEXT: ttg.local_alloc tt.func public @tma_store(%arg0: !tt.tensordesc>, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: tensor<128x256xf32, #blocked0>) { %cst = arith.constant dense<0> : tensor<128x64xi64, #blocked0> diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 11d258853a..464c4f77dd 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -30,7 +30,7 @@ def is_in_thread_transpose_enabled(arch): @dataclass(frozen=True) class HIPOptions: num_warps: int = 4 - waves_per_eu: int = 1 + waves_per_eu: int = 0 num_stages: int = 2 num_ctas: int = 1 extern_libs: dict = None diff --git a/third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h b/third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h index 50ff6cb0e2..7b758aaada 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h +++ b/third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h @@ -19,6 +19,11 @@ struct MfmaIntrinsic { unsigned inputKDim, Type aElemType, Type bElemType, bool withScale, bool useTF32); + // Gets the mfma intrinsic based on exact match of all parameters. + static FailureOr get(Location loc, int version, unsigned mDim, + unsigned nDim, unsigned kDim, + Type aElemType, Type bElemType, + bool withScale, bool useTF32); MfmaIntrinsic(StringRef symbol, unsigned m, unsigned n, unsigned k, unsigned kB, Type aET, Type bET) diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td index 1ff551c388..c6b0fcc313 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td @@ -152,6 +152,9 @@ def TritonAMDGPUConvertToBufferOps : Pass<"tritonamdgpu-convert-buffer-ops", "ml Option<"allowBufferAtomics", "allow-buffer-atomics", "bool", /*default*/"true", "Allow buffer atomic operations when the hardware supports it.">, + Option<"analyzeSmallTensorOfst", "analyze-small-tensor-ofst", + "bool", /*default=*/"false", + "Whether to still analyze index range for tensors whose base has tt.pointer_range = 32 specialization. 
If false load/store from such tensors will go down buffer ops without analzying index range."> ]; } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM.cpp index 98360ab448..fe55a7b826 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM.cpp @@ -1,5 +1,6 @@ #include "Utility.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" +#include "triton/Dialect/TritonGPU/IR/Attributes.h" using namespace mlir; @@ -23,6 +24,11 @@ LogicalResult convertScaledMFMA(triton::DotScaledOp op, LogicalResult convertWMMA(triton::DotOp op, triton::DotOp::Adaptor adaptor, const LLVMTypeConverter *typeConverter, ConversionPatternRewriter &rewriter); + +LogicalResult convertScaledWMMA(triton::DotScaledOp op, + triton::DotScaledOp::Adaptor adaptor, + const LLVMTypeConverter *typeConverter, + ConversionPatternRewriter &rewriter); } // namespace mlir::triton::AMD namespace { @@ -60,7 +66,19 @@ struct ScaledDotOpConversion LogicalResult matchAndRewrite(triton::DotScaledOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - return AMD::convertScaledMFMA(op, adaptor, getTypeConverter(), rewriter); + Value D = op.getResult(); + + auto dEncoding = cast(D.getType()).getEncoding(); + + if (isa(dEncoding)) { + return AMD::convertScaledMFMA(op, adaptor, getTypeConverter(), rewriter); + } + if (isa(dEncoding)) { + return AMD::convertScaledWMMA(op, adaptor, getTypeConverter(), rewriter); + } + + llvm::report_fatal_error( + "Unsupported DotScaleOp found when converting TritonGPU to LLVM."); } }; } // namespace diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp index 3b6b762805..1c2b6dfe90 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp @@ -241,11 +241,11 @@ struct DotOpMFMAConversionHelper { auto setPrioOp = dyn_cast_or_null(op->getPrevNode()); auto warpsPerCTA = mfmaLayout.getWarpsPerCTA(); - auto mDim = mfmaLayout.getInstrShape()[0]; - auto nDim = mfmaLayout.getInstrShape()[1]; + auto mnkDim = mfmaLayout.getInstrShape(); + auto mDim = mnkDim[0]; + auto nDim = mnkDim[1]; + auto kDim = mnkDim[2]; auto mfmaVersion = mfmaLayout.getVersion(); - assert((mDim == nDim && (mDim == 32 || mDim == 16)) || - (mDim == 64 && nDim == 4) || (mDim == 4 && nDim == 64)); Value a = op.getA(); Value b = op.getB(); @@ -261,8 +261,8 @@ struct DotOpMFMAConversionHelper { bool allowXF32 = op.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3; StringRef intrinsicName; - FailureOr maybeMfmaIntrinsic = MfmaIntrinsic::selectFor( - op.getLoc(), mfmaVersion, mDim, nDim, kDimOperandSize, elemTyA, elemTyB, + FailureOr maybeMfmaIntrinsic = MfmaIntrinsic::get( + op.getLoc(), mfmaVersion, mDim, nDim, kDim, elemTyA, elemTyB, /*withScale=*/false, allowXF32); if (failed(maybeMfmaIntrinsic)) return op.emitError( @@ -578,11 +578,11 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper { auto setPrioOp = dyn_cast_or_null(op->getPrevNode()); auto warpsPerCTA = mfmaLayout.getWarpsPerCTA(); - auto mDim = mfmaLayout.getInstrShape()[0]; - auto nDim = mfmaLayout.getInstrShape()[1]; + auto mnkDim = mfmaLayout.getInstrShape(); + auto mDim = mnkDim[0]; + auto nDim = mnkDim[1]; + auto kDim = mnkDim[2]; auto mfmaVersion = mfmaLayout.getVersion(); - assert((mDim == nDim && (mDim == 32 || mDim == 
16 || mDim == 4)) || - (mDim == 64 && nDim == 4) || (mDim == 4 && nDim == 64)); Value a = op.getA(); Value b = op.getB(); @@ -620,13 +620,11 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper { auto ctx = op.getContext(); constexpr bool allowXF32 = false; - FailureOr maybeMfmaIntrinsic = MfmaIntrinsic::selectFor( - op.getLoc(), mfmaVersion, mDim, nDim, - aElemType == ScaleDotElemType::E2M1 ? kDimOperandSize * 2 - : kDimOperandSize, - scaleDotElemTypeToMLIRType(ctx, aElemType), - scaleDotElemTypeToMLIRType(ctx, bElemType), - /*withScale=*/true, allowXF32); + FailureOr maybeMfmaIntrinsic = + MfmaIntrinsic::get(op.getLoc(), mfmaVersion, mDim, nDim, kDim, + scaleDotElemTypeToMLIRType(ctx, aElemType), + scaleDotElemTypeToMLIRType(ctx, bElemType), + /*withScale=*/true, allowXF32); if (failed(maybeMfmaIntrinsic)) return op.emitError( "no matching matrix core intrinsic due to unsupported element type"); diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp index 0d2e816c38..2ef0816ff4 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp @@ -26,12 +26,14 @@ #include "Utility.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" +#include "llvm/ADT/TypeSwitch.h" namespace mlir::triton::AMD { namespace { using ::mlir::triton::gpu::AMDWmmaEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; +using ::mlir::triton::gpu::LinearEncodingAttr; using ValueTable = std::map, Value>; @@ -63,6 +65,8 @@ ValueTable getValuesFromDotOperandLayoutStruct( // Before wmma v3, bf16 is converted to i16 if (wmmaVer < 3) convertedElems = tb.bitcast(rawElems, vec_ty(i16_ty, kBase)); + } else if (kBase == 4 && type.getIntOrFloatBitWidth() == 8) { + convertedElems = tb.bitcast(rawElems, i32_ty); } else { convertedElems = tb.bitcast( rawElems, vec_ty(i32_ty, kBase * type.getIntOrFloatBitWidth() / @@ -75,6 +79,16 @@ ValueTable getValuesFromDotOperandLayoutStruct( return vals; } +static inline int32_t getWmmaF8F6F4MatrixFormat(Type t) { + return llvm::TypeSwitch(t) + .Case([](Type) { return 0; }) + .Case([](Type) { return 1; }) + .Case([](Type) { return 2; }) + .Case([](Type) { return 3; }) + .Case([](Type) { return 4; }) + .Default([](Type) { return -1; }); +} + Value generateWMMAIntrinsic(ConversionPatternRewriter &rewriter, Location loc, int wmmaVer, Value valA, Value valB, Value valC, Type aElType, Type bElType, Type dElType, @@ -132,6 +146,52 @@ Value generateWMMAIntrinsic(ConversionPatternRewriter &rewriter, Location loc, return wmmaIntrinsic.getResult(0); } +Value generateScaledWMMAIntrinsic(ConversionPatternRewriter &rewriter, + Location loc, Value valA, Value valScaleA, + Value valB, Value valScaleB, Value valC, + Type aElType, Type bElType, Type dElType, + int scaleKWidth) { + assert(scaleKWidth == 4 || scaleKWidth == 8); + auto b = TritonLLVMOpBuilder(loc, rewriter); + std::string name = "llvm.amdgcn.wmma.scale"; + if (scaleKWidth == 8) { + name += "16"; + } + name += ".f32.16x16x128.f8f6f4"; + + LLVM::FastmathFlagsAttr defaultFlags{}; + SmallVector operands; + + // Reference: llvm/include/llvm/IR/IntrinsicsAMDGPU.td, + // int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 + Value fmtA = b.i32_val(getWmmaF8F6F4MatrixFormat(aElType)); + operands.push_back(fmtA); + operands.push_back(valA); + Value fmtB = b.i32_val(getWmmaF8F6F4MatrixFormat(bElType)); + operands.push_back(fmtB); + 
operands.push_back(valB); + // C_mod is unused. Should be set to 0 + Value modC = b.i16_val(0); + operands.push_back(modC); + operands.push_back(valC); + // Set a_scale mantissa to zero as use E8M0 format (no mantissa bits) + operands.push_back(b.i32_val(0)); + // Set a_scale_fmt to 0 = E8M0 + operands.push_back(b.i32_val(0)); + operands.push_back(valScaleA); + // Set b_scale mantissa to zero as we use E8M0 format (no mantissa bits) + operands.push_back(b.i32_val(0)); + // Set b_scale fmt to 0 = E8M0 + operands.push_back(b.i32_val(0)); + operands.push_back(valScaleB); + // Set "Reuse matrix A" and "Reuse matrix B" to 0. + operands.push_back(b.i1_val(0)); + operands.push_back(b.i1_val(0)); + auto wmmaIntrinsic = LLVM::createLLVMIntrinsicCallOp( + rewriter, loc, name, valC.getType(), operands); + return wmmaIntrinsic.getResult(0); +} + Value generateWMMAOp(ConversionPatternRewriter &rewriter, Location loc, int version, Value valA, Value valB, Value valC, Type aElType, Type bElType, Type dElType, @@ -181,8 +241,8 @@ LogicalResult convertDot(DotOp op, DotOpAdaptor adaptor, auto bEncoding = cast(bTensorTy.getEncoding()); intrinsicName = maybeWmmaIntrinsic->name; - auto repA = wmmaLayout.getRepForOperand(aTensorTy.getShape(), aElemTy, 0); - auto repB = wmmaLayout.getRepForOperand(bTensorTy.getShape(), bElemTy, 1); + auto repA = wmmaLayout.getRepForOperand(aTensorTy.getShape(), kDim, 0); + auto repB = wmmaLayout.getRepForOperand(bTensorTy.getShape(), kDim, 1); assert(repA[2] == repB[1]); @@ -277,6 +337,129 @@ LogicalResult convertDot(DotOp op, DotOpAdaptor adaptor, return success(); } +LogicalResult convertScaledDot(triton::DotScaledOp op, + triton::DotScaledOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter, + const LLVMTypeConverter *typeConverter) { + auto wmmaLayout = cast( + cast(op.getResult().getType()).getEncoding()); + int wmmaVer = wmmaLayout.getVersion(); + assert(wmmaVer == 3 && "Scaled dot not supported for wmma1/wmma2"); + auto warpsPerCTA = wmmaLayout.getWarpsPerCTA(); + auto mnkDim = wmmaLayout.getInstrShape(); + + auto loc = op.getLoc(); + auto tb = TritonLLVMOpBuilder(loc, rewriter); + Value a = op.getA(); + Value b = op.getB(); + Value aScale = op.getAScale(); + Value bScale = op.getBScale(); + Value d = op.getD(); + auto aTensorTy = cast(a.getType()); + auto aScaleTensorTy = cast(aScale.getType()); + auto bTensorTy = cast(b.getType()); + auto bScaleTensorTy = cast(bScale.getType()); + auto dTensorTy = cast(d.getType()); + auto elemTy = aTensorTy.getElementType(); + + unsigned kDim = mnkDim[2]; + unsigned kBase = 64; + + bool isFp4A = op.getAElemType() == triton::ScaleDotElemType::E2M1; + int kBaseA = isFp4A ? kBase / 2 : kBase; + int kDimA = isFp4A ? kDim / 2 : kDim; + + bool isFp4B = op.getBElemType() == triton::ScaleDotElemType::E2M1; + int kBaseB = isFp4B ? kBase / 2 : kBase; + int kDimB = isFp4B ? 
kDim / 2 : kDim; + + auto repA = wmmaLayout.getRepForOperand(aTensorTy.getShape(), kDimA, 0); + auto repB = wmmaLayout.getRepForOperand(bTensorTy.getShape(), kDimB, 1); + + assert(repA[2] == repB[1]); + + Value loadedA = adaptor.getA(); + Value loadedAScale = adaptor.getAScale(); + Value loadedB = adaptor.getB(); + Value loadedBScale = adaptor.getBScale(); + Value loadedC = adaptor.getC(); + auto numRepM = repA[1]; + auto numRepN = repB[2]; + auto numRepK = repA[2]; + auto numRepB = repA[0]; + + auto scaleShapeA = aScaleTensorTy.getShape(); + constexpr int scaleKWidthA = 4; + auto scaleShapeB = bScaleTensorTy.getShape(); + constexpr int scaleKWidthB = 4; + + ValueTable ha = getValuesFromDotOperandLayoutStruct( + rewriter, typeConverter, wmmaVer, loadedA, numRepB, numRepM, numRepK, + kBaseA, aTensorTy.getElementType(), loc); + ValueTable hb = getValuesFromDotOperandLayoutStruct( + rewriter, typeConverter, wmmaVer, loadedB, numRepB, numRepN, numRepK, + kBaseB, bTensorTy.getElementType(), loc); + ValueTable sa = getValuesFromDotOperandLayoutStruct( + rewriter, typeConverter, wmmaVer, loadedAScale, numRepB, numRepM, numRepK, + scaleKWidthA, aScaleTensorTy.getElementType(), loc); + ValueTable sb = getValuesFromDotOperandLayoutStruct( + rewriter, typeConverter, wmmaVer, loadedBScale, numRepB, numRepN, numRepK, + scaleKWidthB, bScaleTensorTy.getElementType(), loc); + auto dstElemTy = dTensorTy.getElementType(); + auto fc = unpackLLElements(loc, loadedC, rewriter); + + Type scaledAElemType = + LLVM::AMD::scaleDotElemTypeToMLIRType(op.getContext(), op.getAElemType()); + Type scaledBElemType = + LLVM::AMD::scaleDotElemTypeToMLIRType(op.getContext(), op.getBElemType()); + + unsigned warpSize = gpu::lookupThreadsPerWarp(rewriter); + constexpr unsigned vgprElemBitWidth = 32; + // compute number of output elements that each thread holds for one WMMA + // instruction. + auto elemsPerVec = mnkDim[0] * mnkDim[1] / warpSize; + auto dElemsToStorePerThread = mnkDim[0] * mnkDim[1] / warpSize; + auto vecTy = vec_ty(dstElemTy, elemsPerVec); + for (int b = 0; b < numRepB; ++b) { + for (int m = 0; m < numRepM; ++m) { + for (int n = 0; n < numRepN; ++n) { + auto batchOffIdx = b * numRepM * numRepN * dElemsToStorePerThread; + auto mRepOffId = m * numRepN * dElemsToStorePerThread; + auto nRepOffId = n * dElemsToStorePerThread; + auto fcThreadOffIdx = batchOffIdx + mRepOffId + nRepOffId; + + Value acc = tb.undef(vecTy); + for (unsigned v = 0; v < dElemsToStorePerThread; ++v) { + acc = tb.insert_element(vecTy, acc, fc[fcThreadOffIdx + v], + tb.i32_val(v)); + } + for (size_t k = 0; k < numRepK; k++) { + acc = wmmaLayout.getIsTransposed() + ? 
generateScaledWMMAIntrinsic( + rewriter, loc, hb[{b, n, k}], sb[{b, n, k}], + ha[{b, m, k}], sa[{b, m, k}], acc, scaledBElemType, + scaledAElemType, dstElemTy, scaleKWidthA) + : generateScaledWMMAIntrinsic( + rewriter, loc, ha[{b, m, k}], sa[{b, m, k}], + hb[{b, n, k}], sb[{b, n, k}], acc, scaledAElemType, + scaledBElemType, dstElemTy, scaleKWidthB); + } + for (unsigned v = 0; v < dElemsToStorePerThread; ++v) { + fc[fcThreadOffIdx + v] = + tb.extract_element(dstElemTy, acc, tb.i32_val(v)); + } + } + } + } + + Type structTy = LLVM::LLVMStructType::getLiteral( + wmmaLayout.getContext(), SmallVector(fc.size(), dstElemTy)); + Value res = packLLElements(loc, typeConverter, fc, rewriter, structTy); + + rewriter.replaceOp(op, res); + return success(); +} + } // namespace LogicalResult convertWMMA(triton::DotOp op, triton::DotOp::Adaptor adaptor, @@ -301,4 +484,27 @@ LogicalResult convertWMMA(triton::DotOp op, triton::DotOp::Adaptor adaptor, return convertDot(op, adaptor, rewriter, typeConverter); } + +LogicalResult convertScaledWMMA(triton::DotScaledOp op, + triton::DotScaledOp::Adaptor adaptor, + const LLVMTypeConverter *typeConverter, + ConversionPatternRewriter &rewriter) { + assert(isa(op.getAScale().getType().getEncoding()) && + isa(op.getBScale().getType().getEncoding()) && + "Both LhsScale and RhsScale should be linear layout."); + + auto cTensorTy = op.getC().getType(); + auto dTensorTy = op.getD().getType(); + assert(isa(cTensorTy.getEncoding()) && + "Currently, we only support C with a wmma layout."); + + assert(cTensorTy.getShape()[0] == dTensorTy.getShape()[0] && + cTensorTy.getShape()[1] == dTensorTy.getShape()[1] && + "DotOp's C operand should pass the same number of values as D."); + + auto loc = op.getLoc(); + auto wmmaLayout = cast( + cast(op.getResult().getType()).getEncoding()); + return convertScaledDot(op, adaptor, rewriter, typeConverter); +} } // namespace mlir::triton::AMD diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp index 8fca9bd046..c67ad64173 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp @@ -4,6 +4,7 @@ #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" using ::mlir::triton::gpu::AMDMfmaEncodingAttr; @@ -256,15 +257,51 @@ class TransLocalLoadOpConversion const AMD::TargetInfo &targetInfo; }; +class LocalBarrierOpConversion + : public ConvertOpToLLVMPattern { +public: + LocalBarrierOpConversion(const LLVMTypeConverter &converter, + const AMD::TargetInfo &targetInfo, + PatternBenefit benefit) + : ConvertOpToLLVMPattern(converter, benefit), + targetInfo(targetInfo) {} + using OpAdaptor = typename triton::gpu::LocalBarrierOp::Adaptor; + + LogicalResult + matchAndRewrite(triton::gpu::LocalBarrierOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!isCDNA(targetInfo.getISAFamily())) + return failure(); + // In CDNA we can lower local_barrier to s_waitcnt + s_barrier + // - s_waitcnt specifies how many operations to VMEM/LDS can be outstanding + // when the instruction completes. 
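+ //   (the s_waitcnt immediate packs separate vmcnt/expcnt/lgkmcnt fields;
+ //   lgkmcnt, which starts at bit 8, is the counter that covers LDS traffic)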
+ // In this case we require 0 outstanding LDS operations + // - s_barrier syncronizes the execution for the CTA + constexpr int32_t ldsOnlyBits = ~(0x1f << 8); + Location loc = op->getLoc(); + ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits); + rewriter.replaceOpWithNewOp(op); + + return success(); + } + +private: + const AMD::TargetInfo &targetInfo; +}; + } // namespace void mlir::triton::AMD::populateMemoryOpToLLVMPatterns( LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, const TargetInfo &targetInfo, PatternBenefit benefit) { PatternBenefit transBenefit = PatternBenefit(benefit.getBenefit() + 1); + PatternBenefit barrierBenefit = PatternBenefit(benefit.getBenefit() + 1); + patterns.add>( typeConverter, targetInfo, transBenefit); patterns.add< TransLocalLoadOpConversion>( typeConverter, targetInfo, benefit); + patterns.add(typeConverter, targetInfo, + barrierBenefit); } diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp index e42735305b..8eef6d7302 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp @@ -49,10 +49,13 @@ int getMfmaVersion(ISAFamily isaFamily) { } int getWmmaVersion(StringRef archGen) { - if (archGen.contains("gfx11")) + if (archGen.starts_with("gfx11")) return 1; - if (archGen.contains("gfx12")) + if (archGen.starts_with("gfx12") && !archGen.ends_with("50")) return 2; + if (archGen == "gfx1250") + return 3; + return 0; } @@ -329,7 +332,7 @@ OperandTypesVector getOperandTypesForWmmaOp(PatternRewriter &rewriter, // by WMMA instruction, but not supported by triton // clang-format on }; - if (version == 2) { + if (version == 2 || version == 3) { Type fp8e4nv = rewriter.getType(); Type fp8e5 = rewriter.getType(); applicableTypes.append({ @@ -1175,6 +1178,155 @@ class ScaledBlockedToScaledMFMAF8F6F4 final } }; +class ScaledBlockedToScaledWMMAF8F6F4 final + : public OpRewritePattern { + int wmmaVersion; + +public: + ScaledBlockedToScaledWMMAF8F6F4(MLIRContext *context, int wmmaVersion, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), wmmaVersion(wmmaVersion) {} + + LogicalResult matchAndRewrite(triton::DotScaledOp dotOp, + PatternRewriter &rewriter) const override { + using TensorValue = TypedValue; + + if (wmmaVersion != 3) { + return rewriter.notifyMatchFailure( + dotOp, "F8F6F4 scaled dot is only natively supported on gfx1250"); + } + + RankedTensorType oldRetType = dotOp.getType(); + if (!isa_and_nonnull(oldRetType.getEncoding())) { + return rewriter.notifyMatchFailure( + dotOp, "expected blocked encoding result tensor"); + } + + unsigned rank = oldRetType.getRank(); + if (rank == 3) + return rewriter.notifyMatchFailure(dotOp, "NYI: 3d case"); + + TensorValue a = dotOp.getA(); + TensorValue b = dotOp.getB(); + TensorValue aScale = dotOp.getAScale(); + TensorValue bScale = dotOp.getBScale(); + auto oldShape = oldRetType.getShape(); + + ScaleDotElemType aElemType = dotOp.getAElemType(); + ScaleDotElemType bElemType = dotOp.getBElemType(); + // TODO: Add more supported types + auto supportsTypes = [](ScaleDotElemType elemType) { + return elemType == ScaleDotElemType::E2M1; + }; + + if (!supportsTypes(aElemType) || !supportsTypes(bElemType)) { + return rewriter.notifyMatchFailure(dotOp, "Not supported yet mxfp type"); + } + + MLIRContext *ctx = dotOp.getContext(); + + ttg::CTALayoutAttr ctaLayout = ttg::getCTALayout(oldRetType.getEncoding()); 
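+ // gfx1250 scaled WMMA operates on a 16x16x128 tile; E2M1 (fp4) operands are
+ // packed two values per byte, so they use the companion encoding below with
+ // half the K extent (16x16x64).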
+ unsigned numWarps = ttg::lookupNumWarps(dotOp); + + constexpr unsigned mDim = 16; + constexpr unsigned nDim = 16; + constexpr unsigned kDim = 128; + + auto warpsPerTile = + warpsPerTileWMMA(dotOp, oldShape, numWarps, {mDim, nDim}); + + auto wmmaEnc = ttg::AMDWmmaEncodingAttr::get( + ctx, wmmaVersion, true, warpsPerTile, ctaLayout, {mDim, nDim, kDim}); + auto wmmaPackedEnc = + ttg::AMDWmmaEncodingAttr::get(ctx, wmmaVersion, true, warpsPerTile, + ctaLayout, {mDim, nDim, kDim / 2}); + + auto newRetType = + RankedTensorType::get(oldShape, oldRetType.getElementType(), wmmaEnc); + + auto newAcc = rewriter.create( + dotOp.getC().getLoc(), newRetType, dotOp.getC()); + + StringAttr kRegister = StringAttr::get(ctx, "register"); + StringAttr kLane = StringAttr::get(ctx, "lane"); + StringAttr kWarp = StringAttr::get(ctx, "warp"); + StringAttr kBlock = StringAttr::get(ctx, "block"); + + auto order = ttg::getMatrixOrder(rank, /*rowMajor=*/true); + auto standardOutDims = standardOutDimNames(ctx, rank); + + using basisT = std::vector>; + + auto aShape = a.getType().getShape(); + auto bShape = b.getType().getShape(); + + auto aEncLL = LinearLayout::empty(); + auto bEncLL = LinearLayout::empty(); + + auto convertInputLayout = [&](TensorValue v, unsigned opIdx, + bool isFp4) -> TensorValue { + auto parent = isFp4 ? wmmaPackedEnc : wmmaEnc; + auto vType = v.getType(); + auto newEnc = DotOperandEncodingAttr::get(ctx, opIdx, parent, 16); + auto newVType = RankedTensorType::get(vType.getShape(), + vType.getElementType(), newEnc); + (opIdx == 0 ? aEncLL : bEncLL) *= + newEnc.toLinearLayout(opIdx == 0 ? aShape : bShape); + return rewriter.create(v.getLoc(), newVType, v); + }; + a = convertInputLayout(a, 0, aElemType == ScaleDotElemType::E2M1); + b = convertInputLayout(b, 1, bElemType == ScaleDotElemType::E2M1); + + auto convertScaleLayout = [&](TensorValue scale, + llvm::ArrayRef valShape, + LinearLayout dotLL, int idx) -> Value { + LinearLayout::BasesT scaleBases = dotLL.getBases(); + auto &warpBases = scaleBases[kWarp]; + + SmallVector shape; + if (!scale) { + int64_t nonKDim = idx == 0 ? valShape[0] : valShape[1]; + int64_t k = idx == 0 ? valShape[1] : valShape[0]; + ScaleDotElemType &elemType = idx == 0 ? aElemType : bElemType; + int packSize = elemType == ScaleDotElemType::E2M1 ? 
2 : 1; + shape = {nonKDim, k * packSize / 32}; + } else { + shape = llvm::to_vector(scale.getType().getShape()); + } + + LinearLayout newLL = + ttg::chooseScaledWmmaScaleLayout(ctx, idx, warpBases, shape); + Attribute newScaleEncoding = ttg::LinearEncodingAttr::get(ctx, newLL); + // Scale's data type is always i8 + auto newScaleType = RankedTensorType::get(shape, i8_ty, newScaleEncoding); + + if (!scale) { + // 0x7F is 1.0 in E8M0 + return rewriter.create( + dotOp->getLoc(), newScaleType, + DenseElementsAttr::get(newScaleType, llvm::APInt(8, 0x7F))); + } else { + return rewriter.create(scale.getLoc(), + newScaleType, scale); + } + }; + auto newAScale = + convertScaleLayout(aScale, aShape, aEncLL, /*dotOperandIdx=*/0); + auto newBScale = + convertScaleLayout(bScale, bShape, bEncLL, /*dotOperandIdx=*/1); + + auto newDot = rewriter.create( + dotOp.getLoc(), newRetType, a, b, newAcc, newAScale, newBScale, + aElemType, bElemType, dotOp.getFastMath()); + + auto m = dotOp->getParentOfType(); + rewriter.replaceOpWithNewOp(dotOp, oldRetType, + newDot); + + return success(); + } +}; + static Value promoteOperand(OpBuilder &builder, Location loc, Value operand, Type promotedType) { Type tensorPromotedType = cast(operand.getType()) @@ -1573,6 +1725,10 @@ struct TritonAMDGPUAccelerateMatmulPass RewritePatternSet mfmaPatterns(context); switch (auto isaFamily = triton::AMD::deduceISAFamily(archGenerationName)) { + case ISAFamily::GFX1250: + mfmaPatterns.add( + context, getWmmaVersion(archGenerationName), /*benefit=*/3); + break; case ISAFamily::CDNA4: mfmaPatterns.add<::ScaledBlockedToScaledMFMAF8F6F4>( context, getMfmaVersion(isaFamily), matrixInstructionSize, diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp index f621c3bc55..fb0892c90b 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp @@ -17,6 +17,7 @@ #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/TypeSwitch.h" #undef DEBUG_TYPE @@ -230,11 +231,32 @@ bool verifyNonNegativeExpr( return nonNegative; } +bool isFuncArgWith32bitPtrRange(mlir::Value value) { + if (value.getDefiningOp()) + return false; + + mlir::BlockArgument blockArg = mlir::cast(value); + auto blk = blockArg.getOwner(); + auto funcOp = dyn_cast_or_null(blk->getParentOp()); + + if (funcOp && blk == &funcOp->getRegion(0).front()) { + for (auto [idx, arg] : llvm::enumerate(funcOp.getArguments())) { + if (arg != value) + continue; + auto attr = funcOp.getArgAttrOfType(idx, "tt.pointer_range"); + return attr && attr.getInt() <= 32; + } + } + + return false; +} + // Quick analysis on the Triton IR to decide if we can safely use // buffer operations bool canUseBufferOps(Value ptr, const DenseMap> &assumptions, - std::shared_ptr solver) { + std::shared_ptr solver, + bool analyzeSmallTensorOfst) { // 1. Check if the pointer is uniform: i.e., if it comes from a uniform // pointer(splatted) and non-uniform offset addition @@ -248,12 +270,30 @@ bool canUseBufferOps(Value ptr, return false; LDBG("Pattern matched"); - // 2. Check if the offset is a 32-bit tensor + // 2. check if the offset is either 32 or 64-bit. 
Value offset = addPtrOp.getOffset(); - if (cast(offset.getType()).getElementTypeBitWidth() != 32) + auto ofstBit = + cast(offset.getType()).getElementTypeBitWidth(); + LLVM_DEBUG(llvm::dbgs() << "offset bits:" << ofstBit << "\n"); + + // TODO: step 3 and 4 can be reversed to further optimize for performance. + // When the base-ptr is func argument and has tt.pointer_range=32 attribute, + // it's safe to promote the mem-op into buffer-op even if offset is a 64-bit + // value. If this is the case, offset need to be cast down to 32-bit. + + // 3. Bail out if ofst cannot fit in 32-bit. + if (ofstBit != 32) return false; - LDBG("32 bit offset"); + // 4. If the base is function formal argument which has attribute + // tt.point_range=32, then it's safe to promote this memory op into + // bufferOp. In this case, if offset is 64-bit, we should cast it down to + // 32-bit. + if (!analyzeSmallTensorOfst && + isFuncArgWith32bitPtrRange(maybeSplatOp.getSrc())) { + LDBG("base-ptr as tt.pointer_range=32 attribute"); + return true; + } return verifyNonNegativeExpr(offset, assumptions, std::move(solver)); } @@ -284,10 +324,11 @@ struct ConvertTritonAtomicCASOpToBufferAtomicCAS mlir::MLIRContext *context, DenseMap> &assumptions, ModuleAxisInfoAnalysis &axisAnalysisPass, - std::shared_ptr solver) + std::shared_ptr solver, bool analyzeSmallTensorOfst_) : mlir::OpRewritePattern(context), assumptions(assumptions), axisAnalysisPass(axisAnalysisPass), - solver(std::move(solver)) {} + solver(std::move(solver)), + analyzeSmallTensorOfst(analyzeSmallTensorOfst_) {} mlir::LogicalResult matchAndRewrite(triton::AtomicCASOp op, @@ -297,7 +338,7 @@ struct ConvertTritonAtomicCASOpToBufferAtomicCAS auto sem = op.getSem(); auto scope = op.getScope(); - if (!canUseBufferOps(ptr, assumptions, solver)) { + if (!canUseBufferOps(ptr, assumptions, solver, analyzeSmallTensorOfst)) { return rewriter.notifyMatchFailure(op, "canUseBufferOps check failed"); } @@ -363,6 +404,7 @@ struct ConvertTritonAtomicCASOpToBufferAtomicCAS const DenseMap> &assumptions; ModuleAxisInfoAnalysis &axisAnalysisPass; std::shared_ptr solver; + bool analyzeSmallTensorOfst; }; struct ConvertTritonAtomicRMWOpToBufferAtomicRMW @@ -373,10 +415,12 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW mlir::MLIRContext *context, DenseMap> &assumptions, ModuleAxisInfoAnalysis &axisAnalysisPass, - std::shared_ptr solver, ISAFamily isaFamily) + std::shared_ptr solver, ISAFamily isaFamily, + bool analyzeSmallTensorOfst_) : mlir::OpRewritePattern(context), assumptions(assumptions), axisAnalysisPass(axisAnalysisPass), - solver(std::move(solver)), isaFamily(isaFamily) {} + solver(std::move(solver)), isaFamily(isaFamily), + analyzeSmallTensorOfst(analyzeSmallTensorOfst_) {} mlir::LogicalResult matchAndRewrite(triton::AtomicRMWOp op, @@ -389,7 +433,7 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW // In addition to the `canUserBufferOps` check, we should ensure that // 1. 
Perform the canUserBufferOps check - if (!canUseBufferOps(ptr, assumptions, solver)) { + if (!canUseBufferOps(ptr, assumptions, solver, analyzeSmallTensorOfst)) { return rewriter.notifyMatchFailure(op, "canUseBufferOps check failed"); } @@ -458,12 +502,18 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW case RMWOp::XOR: case RMWOp::ADD: case RMWOp::FADD: - case RMWOp::MAX: - case RMWOp::MIN: case RMWOp::UMAX: case RMWOp::UMIN: case RMWOp::XCHG: break; + case RMWOp::MAX: + case RMWOp::MIN: + // TODO: It likely means smax/smin, for now intrinsic + // llvm.amdgcn.raw.ptr.buffer.atomic.{min|max} is emitted, and llvm get + // confused as how to deal with {f|s|u}{min|max}. + if (!checkType.isInteger()) + break; + // else fall through default: auto rmwOpStr = stringifyRMWOp(atomicRmwOp).str(); return rewriter.notifyMatchFailure(op, "RMW with unsupported op: " + @@ -507,6 +557,7 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW ModuleAxisInfoAnalysis &axisAnalysisPass; std::shared_ptr solver; ISAFamily isaFamily; + bool analyzeSmallTensorOfst; }; // Workaround to allow static_assert(false) on older compilers as it was @@ -521,16 +572,22 @@ struct ConvertTritonLoadToBufferLoad : public mlir::OpRewritePattern { ConvertTritonLoadToBufferLoad( mlir::MLIRContext *context, DenseMap> &assumptions, - std::shared_ptr solver) + std::shared_ptr solver, bool analyzeSmallTensorOfst_) : mlir::OpRewritePattern(context), assumptions(assumptions), - solver(std::move(solver)) {} + solver(std::move(solver)), + analyzeSmallTensorOfst(analyzeSmallTensorOfst_) {} mlir::LogicalResult matchAndRewrite(SourceOp op, PatternRewriter &rewriter) const override { LDBG("Try to convert: " << op); Value ptr = op.getOperand(0); - if (canUseBufferOps(ptr, assumptions, solver)) { + if (toDodgeBug(op)) { + LDBG("To dodge a llc bug arising from f32 load fed to tt.dot " << op); + return rewriter.notifyMatchFailure(op, "Failed to convert LoadOp"); + } + + if (canUseBufferOps(ptr, assumptions, solver, analyzeSmallTensorOfst)) { auto addPtrOp = ptr.getDefiningOp(); Value tensorPtr = addPtrOp.getPtr(); Value tensorOffset = addPtrOp.getOffset(); @@ -572,9 +629,33 @@ struct ConvertTritonLoadToBufferLoad : public mlir::OpRewritePattern { } private: + // Currently, we need to dodge a LLC bug arising from f32 load fed to + // tt.dot. 
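+ // We conservatively skip every f32 load in any function that contains a
+ // tt.dot; the result of the per-function walk is cached in hasDotOpMap.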
+ mutable llvm::SmallMapVector, 2> hasDotOpMap; + bool toDodgeBug(SourceOp ld) const { + auto ty = getElementTypeOrSelf(ld.getResult()); + if (!ty.isF32()) + return false; + + auto func = ld->template getParentOfType(); + if (!func) + return true; + + bool mayHaveDot = false; + if (auto iter = hasDotOpMap.find(func); iter != hasDotOpMap.end()) { + mayHaveDot = iter->second.value(); + } else { + mayHaveDot = false; + func.walk([&](tt::DotOp dot) { mayHaveDot = true; }); + hasDotOpMap.insert(std::make_pair(func, std::optional(mayHaveDot))); + } + return mayHaveDot; + } + // Assumptions collected through the function DenseMap> assumptions; std::shared_ptr solver; + bool analyzeSmallTensorOfst; }; struct ConvertTritonStoreToBufferStore @@ -584,9 +665,10 @@ struct ConvertTritonStoreToBufferStore ConvertTritonStoreToBufferStore( mlir::MLIRContext *context, DenseMap> &assumptions, - std::shared_ptr solver) + std::shared_ptr solver, bool analyzeSmallTensorOfst_) : mlir::OpRewritePattern(context), - assumptions(assumptions), solver(std::move(solver)) {} + assumptions(assumptions), solver(std::move(solver)), + analyzeSmallTensorOfst(analyzeSmallTensorOfst_) {} mlir::LogicalResult matchAndRewrite(triton::StoreOp op, @@ -594,7 +676,7 @@ struct ConvertTritonStoreToBufferStore LDBG("Try to convert: " << op); Value ptr = op.getPtr(); - if (canUseBufferOps(ptr, assumptions, solver)) { + if (canUseBufferOps(ptr, assumptions, solver, analyzeSmallTensorOfst)) { auto addPtrOp = ptr.getDefiningOp(); Value tensorPtr = addPtrOp.getPtr(); Value tensorOffset = addPtrOp.getOffset(); @@ -617,6 +699,7 @@ struct ConvertTritonStoreToBufferStore // Assumptions collected through the function DenseMap> assumptions; std::shared_ptr solver; + bool analyzeSmallTensorOfst; }; } // anonymous namespace @@ -644,7 +727,8 @@ struct TritonAMDGPUConvertToBufferOpsPass AMD::ModuleAxisInfoAnalysis axisInfoAnalysis(mod); patterns.add, ConvertTritonLoadToBufferLoad, - ConvertTritonStoreToBufferStore>(context, assumptions, solver); + ConvertTritonStoreToBufferStore>(context, assumptions, solver, + this->analyzeSmallTensorOfst); // Gate buffer atomics behind CDNA3 for now // GFX942-specific assumptions regarding cache coherence are made when @@ -654,9 +738,11 @@ struct TritonAMDGPUConvertToBufferOpsPass if (this->allowBufferAtomics && (ISAFamily::CDNA3 == isaFamily || ISAFamily::CDNA4 == isaFamily)) patterns.add( - context, assumptions, axisInfoAnalysis, solver, isaFamily); + context, assumptions, axisInfoAnalysis, solver, isaFamily, + this->analyzeSmallTensorOfst); patterns.add( - context, assumptions, axisInfoAnalysis, solver); + context, assumptions, axisInfoAnalysis, solver, + this->analyzeSmallTensorOfst); if (applyPatternsGreedily(mod, std::move(patterns)).failed()) signalPassFailure(); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp index 6d64068018..880250e910 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp @@ -299,4 +299,28 @@ MfmaIntrinsic::selectFor(Location loc, int version, unsigned mDim, auto [symbol, k, kBase] = values.back(); return MfmaIntrinsic(symbol, mDim, nDim, k, kBase, aElemType, bElemType); } + +FailureOr MfmaIntrinsic::get(Location loc, int version, + unsigned mDim, unsigned nDim, + unsigned kDim, Type aElemType, + Type bElemType, bool withScale, + bool useTF32) { + const MfmaMap &mfmaMap = MfmaDatabase::get(aElemType.getContext()); + MfmaKey key = 
composeMfmaKeyFor(loc, version, mDim, nDim, aElemType, + bElemType, withScale, useTF32); + + auto it = mfmaMap.find(key); + if (it == mfmaMap.end()) + return failure(); + + const SmallVector &values = it->second; + auto match = llvm::find_if(values, [&](const MfmaMapValue &val) { + return std::get<1>(val) == kDim; + }); + if (match == values.end()) + return failure(); + + auto [symbol, k, kBase] = *match; + return MfmaIntrinsic(symbol, mDim, nDim, k, kBase, aElemType, bElemType); +} } // namespace mlir diff --git a/third_party/amd/python/test/test_gluon_gfx1250.py b/third_party/amd/python/test/test_gluon_gfx1250.py index 1161b6c340..ac3ff6f714 100644 --- a/third_party/amd/python/test/test_gluon_gfx1250.py +++ b/third_party/amd/python/test/test_gluon_gfx1250.py @@ -9,8 +9,10 @@ import torch import triton +import triton.language as tl from triton.backends.compiler import GPUTarget from triton._internal_testing import str_to_triton_dtype +from triton._internal_testing import is_hip_gfx1250 from triton.experimental import gluon import triton.experimental.gluon.language as ttgl @@ -178,3 +180,174 @@ def create_operand(shape, dtype): c_triton = c_device.cpu() c_torch = a.to(torch.float32) @ b.to(torch.float32) torch.testing.assert_close(c_triton, c_torch, rtol=1e-4, atol=1e-4) + + +@gluon.jit +def dot_mxfp_gluon_kernel(a_base, stride_am, stride_ak, a_scale, b_base, stride_bk, stride_bn, b_scale, out, + BLOCK_M: ttgl.constexpr, BLOCK_N: ttgl.constexpr, BLOCK_K: ttgl.constexpr, + type_a: ttgl.constexpr, type_b: ttgl.constexpr): + DIV_FACTOR_A: ttgl.constexpr = 2 if type_a == "e2m1" else 1 + DIV_FACTOR_B: ttgl.constexpr = 2 if type_b == "e2m1" else 1 + PACKED_BLOCK_K_A: ttgl.constexpr = BLOCK_K // DIV_FACTOR_A + PACKED_BLOCK_K_B: ttgl.constexpr = BLOCK_K // DIV_FACTOR_B + SCALE_BLOCK_K: ttgl.constexpr = BLOCK_K // 32 + + scale_blocked_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [8, 4], [4, 1], [1, 0]) + a_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 16], [8, 4], [4, 1], [1, 0]) + a_scale_linear_layout: ttgl.constexpr = ttgl.DistributedLinearLayout( + reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp_bases=[[0, 0], [16, 0]], + block_bases=[], shape=[32, 4]) + b_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 16], [16, 2], [4, 1], [1, 0]) + b_scale_linear_layout: ttgl.constexpr = ttgl.DistributedLinearLayout( + reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp_bases=[[16, 0], [0, 0]], + block_bases=[], shape=[32, 4]) + + wmma_layout: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=3, transposed=True, warps_per_cta=[2, 2], + instr_shape=[16, 16, 128]) + wmma_layout_packed: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=3, transposed=True, warps_per_cta=[2, 2], + instr_shape=[16, 16, 64]) + + zero = ttgl.zeros([BLOCK_M, BLOCK_N], dtype=ttgl.float32, layout=wmma_layout) + + offs_am = ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, a_layout)) + offs_ak = ttgl.arange(0, PACKED_BLOCK_K_A, layout=ttgl.SliceLayout(0, a_layout)) + a_offsets = offs_am[:, None] * stride_am + offs_ak[None, :] * stride_ak + a = ttgl.load(a_base + a_offsets) + a = ttgl.convert_layout( + a, + ttgl.DotOperandLayout(operand_index=0, parent=wmma_layout_packed if type_a == "e2m1" else wmma_layout, + k_width=16)) + + offs_bk = ttgl.arange(0, PACKED_BLOCK_K_B, layout=ttgl.SliceLayout(1, b_layout)) + offs_bn = ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(0, b_layout)) + b_offsets = offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn + b = 
ttgl.load(b_base + b_offsets) + b = ttgl.convert_layout( + b, + ttgl.DotOperandLayout(operand_index=1, parent=wmma_layout_packed if type_b == "e2m1" else wmma_layout, + k_width=16)) + + if a_scale is not None: + offs_scale_am = ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, scale_blocked_layout)) + off_scale_ak = ttgl.arange(0, SCALE_BLOCK_K, layout=ttgl.SliceLayout(0, scale_blocked_layout)) + a_scale_offsets = offs_scale_am[:, None] * SCALE_BLOCK_K + off_scale_ak[None, :] + scale_a = ttgl.load(a_scale + a_scale_offsets) + else: + scale_a = ttgl.full([BLOCK_M, SCALE_BLOCK_K], 127, dtype=ttgl.int8, layout=scale_blocked_layout) + + if b_scale is not None: + offs_scale_bn = ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(1, scale_blocked_layout)) + offs_scale_bk = ttgl.arange(0, SCALE_BLOCK_K, layout=ttgl.SliceLayout(0, scale_blocked_layout)) + b_scale_offsets = offs_scale_bn[:, None] * SCALE_BLOCK_K + offs_scale_bk[None, :] + scale_b = ttgl.load(b_scale + b_scale_offsets) + else: + scale_b = ttgl.full([BLOCK_N, SCALE_BLOCK_K], 127, dtype=ttgl.int8, layout=scale_blocked_layout) + + scale_a = ttgl.convert_layout(scale_a, a_scale_linear_layout) + scale_b = ttgl.convert_layout(scale_b, b_scale_linear_layout) + c = ttgl.amd.gfx1250.wmma_scaled(a, scale_a, type_a, b, scale_b, type_b, zero) + c = c.to(out.dtype.element_ty) + + offs_cm = ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, wmma_layout)) + offs_cn = ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(0, wmma_layout)) + out_offsets = offs_cm[:, None] * BLOCK_N + offs_cn[None, :] + out = out + out_offsets + ttgl.store(out, c) + + +@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(16, 16, 128), (32, 32, 128), (32, 32, 256), (32, 32, 512), + (64, 64, 128), (128, 128, 256)]) +@pytest.mark.parametrize("mxfp_type", ["e2m1"]) +@pytest.mark.parametrize("hasScale", [True, False]) +def test_compile_amd_wmma_scaled(BLOCK_M, BLOCK_N, BLOCK_K, mxfp_type, hasScale): + k = triton.compile( + gluon._runtime.GluonASTSource( + fn=dot_mxfp_gluon_kernel, signature={ + "a_base": "*u8", "stride_am": "i32", "stride_ak": "i32", "a_scale": "*u8", "b_base": "*u8", "stride_bk": + "i32", "stride_bn": "i32", "b_scale": "*u8", "out": "*fp32", "BLOCK_M": "constexpr", "BLOCK_N": + "constexpr", "BLOCK_K": "constexpr", "type_a": "constexpr", "type_b": "constexpr" + }, constexprs={ + "BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N, "BLOCK_K": BLOCK_K, "type_a": mxfp_type, "type_b": mxfp_type + }), target=GPUTarget("hip", 'gfx1250', 32)) + amdgcn = k.asm["amdgcn"] + assert "v_wmma_scale_f32_16x16x128_f8f6f4" in amdgcn, "The AMDGCN assembly does not contain the expected scaled WMMA instruction." 
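For readers unfamiliar with the mxfp encodings these tests exercise, the sketch below is a rough NumPy reference for what wmma_scaled / dot_scaled compute on e2m1 inputs. It is not part of the test suite, and it assumes OCP MX conventions (the e2m1 value table, E8M0 scales decoded as 2**(e - 127), 32-element scale groups along K) plus low-nibble-first packing; those are assumptions of this sketch, not guarantees of the kernels above.

import numpy as np

# e2m1 (fp4) value table: 1 sign bit, 2 exponent bits, 1 mantissa bit.
E2M1_VALUES = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                        -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0])

def unpack_e2m1(packed, axis):
    # Split each byte into two fp4 nibbles and interleave them along `axis`
    # (low nibble first -- an assumption about the packing order).
    lo, hi = packed & 0x0F, packed >> 4
    nibbles = np.stack([lo, hi], axis=axis + 1)
    shape = list(packed.shape)
    shape[axis] *= 2
    return E2M1_VALUES[nibbles.reshape(shape)].astype(np.float32)

def dot_scaled_ref(a_packed, a_scale, b_packed, b_scale):
    # a_packed: (M, K//2) uint8, b_packed: (K//2, N) uint8,
    # a_scale: (M, K//32) uint8 E8M0, b_scale: (N, K//32) uint8 E8M0.
    a = unpack_e2m1(a_packed, axis=1)                    # (M, K)
    b = unpack_e2m1(b_packed, axis=0)                    # (K, N)
    sa = 2.0 ** (a_scale.astype(np.float32) - 127.0)     # E8M0 -> float factor
    sb = 2.0 ** (b_scale.astype(np.float32) - 127.0)
    a = a * np.repeat(sa, 32, axis=1)                    # scale per 32-wide K group
    b = b * np.repeat(sb, 32, axis=1).T                  # (N, K) -> (K, N)
    return a @ b

With scales fixed at 127 (a factor of 2**0 == 1.0) this reduces to a plain fp4 matmul, which mirrors the neutral scales the Gluon kernel substitutes when a_scale or b_scale is None.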
+ + +@pytest.mark.skipif(not is_hip_gfx1250(), reason="Requires GFX1250") +@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(16, 16, 128), (32, 32, 128), (32, 32, 256), (32, 32, 512), + (64, 64, 128), (128, 128, 256)]) +@pytest.mark.parametrize("mxfp_type", ["e2m1"]) +@pytest.mark.parametrize("hasScale", [True, False]) +def test_runtime_amd_wmma_scaled(BLOCK_M, BLOCK_N, BLOCK_K, mxfp_type, hasScale): + + @triton.jit + def dot_mxfp_triton_kernel(a_base, stride_am, stride_ak, a_scale, b_base, stride_bk, stride_bn, b_scale, out, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + type_a: tl.constexpr, type_b: tl.constexpr): + DIV_FACTOR_A: tl.constexpr = 2 if type_a == "e2m1" else 1 + DIV_FACTOR_B: tl.constexpr = 2 if type_b == "e2m1" else 1 + PACKED_BLOCK_K_A: tl.constexpr = BLOCK_K // DIV_FACTOR_A + PACKED_BLOCK_K_B: tl.constexpr = BLOCK_K // DIV_FACTOR_B + a_ptr = a_base + tl.arange(0, BLOCK_M)[:, None] * stride_am + \ + tl.arange(0, PACKED_BLOCK_K_A)[None, :] * stride_ak + b_ptr = b_base + tl.arange(0, PACKED_BLOCK_K_B)[:, None] * stride_bk + \ + tl.arange(0, BLOCK_N)[None, :] * stride_bn + + a = tl.load(a_ptr) + b = tl.load(b_ptr) + SCALE_BLOCK_K: tl.constexpr = BLOCK_K // 32 + + if a_scale is not None: + scale_a_ptr = a_scale + tl.arange(0, BLOCK_M)[:, None] * SCALE_BLOCK_K + tl.arange(0, + SCALE_BLOCK_K)[None, :] + a_scale = tl.load(scale_a_ptr) + if b_scale is not None: + scale_b_ptr = b_scale + tl.arange(0, BLOCK_N)[:, None] * SCALE_BLOCK_K + tl.arange(0, + SCALE_BLOCK_K)[None, :] + b_scale = tl.load(scale_b_ptr) + c = tl.dot_scaled(a, a_scale, type_a, b, b_scale, type_b) + out_ptr = out + tl.arange(0, BLOCK_M)[:, None] * BLOCK_N + tl.arange(0, BLOCK_N)[None, :] + tl.store(out_ptr, c) + + torch.manual_seed(0) + + type_a = mxfp_type + type_b = mxfp_type + + DIV_FACTOR_A = 2 if type_a == "e2m1" else 1 + DIV_FACTOR_B = 2 if type_b == "e2m1" else 1 + + x = torch.randint(20, 40, (BLOCK_M, BLOCK_K // DIV_FACTOR_A), dtype=torch.uint8).cuda() + y = torch.randint(20, 40, (BLOCK_K // DIV_FACTOR_B, BLOCK_N), dtype=torch.uint8).cuda() + + if hasScale: + min_scale, max_scale = (0, 142) + scale_x = torch.randint(min_scale, max_scale + 1, (BLOCK_M, BLOCK_K // 32), dtype=torch.uint8).cuda() + scale_y = torch.randint(min_scale, max_scale + 1, (BLOCK_N, BLOCK_K // 32), dtype=torch.uint8).cuda() + else: + scale_x = None + scale_y = None + + def make_finite(x, dtype): + if dtype not in ("e5m2", "e4m3"): + return x + mask = 0x7C if dtype == "e5m2" else 0x7F + finite = torch.arange(x.numel(), dtype=torch.uint8).cuda().reshape_as(x) % mask + x_finite = torch.where(x & mask == mask, finite | (0x80 & x), x) + x.copy_(x_finite) + return x + + x = make_finite(x, type_a) + y = make_finite(y, type_b) + + z = torch.zeros((BLOCK_M, BLOCK_N), dtype=torch.float32).cuda() + pgm = dot_mxfp_gluon_kernel[(1, )](x, *x.stride(), scale_x, y, *y.stride(), scale_y, z, BLOCK_M, BLOCK_N, BLOCK_K, + type_a, type_b) + assert "v_wmma_scale_f32_16x16x128_f8f6f4" in pgm.asm[ + "amdgcn"], "The AMDGCN assembly does not contain the expected scaled WMMA instruction." 
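+    # Reference check below: run the same computation through a plain Triton
+    # kernel using tl.dot_scaled and require the two results to agree within
+    # the test tolerance.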
+ + z_ref = torch.zeros((BLOCK_M, BLOCK_N), dtype=torch.float32).cuda() + dot_mxfp_triton_kernel[(1, )](x, *x.stride(), scale_x, y, *y.stride(), scale_y, z_ref, BLOCK_M, BLOCK_N, BLOCK_K, + type_a, type_b) + + torch.testing.assert_close(z.cpu(), z_ref.cpu(), rtol=1e-5, atol=1e-5) diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py index b73575dacb..240d2a72ec 100644 --- a/third_party/nvidia/backend/compiler.py +++ b/third_party/nvidia/backend/compiler.py @@ -329,6 +329,7 @@ def gluon_to_ttgir(self, src, metadata, options, capability): passes.gluon.add_inliner(pm) passes.gluon.add_resolve_auto_encodings(pm) + passes.gluon.add_canonicalizer(pm) passes.common.add_sccp(pm) passes.ttir.add_loop_aware_cse(pm) passes.gluon.add_canonicalizer(pm)
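As background for the TODO in the buffer-atomic MAX/MIN case earlier in this patch (integer element types now bail out of the conversion because the emitted llvm.amdgcn.raw.ptr.buffer.atomic.{min|max} intrinsic does not distinguish the {f|s|u} flavours), the short sketch below shows why the flavour matters: the same 32-bit patterns order differently under unsigned, signed, and float interpretations. This is a standalone editorial illustration and does not use any Triton or LLVM API.

import struct

a_bits, b_bits = 0x80000000, 0x3F800000

def as_s32(x):
    # Reinterpret the bit pattern as a signed 32-bit integer.
    return x - (1 << 32) if x & 0x80000000 else x

def as_f32(x):
    # Reinterpret the bit pattern as an IEEE-754 binary32 value.
    return struct.unpack("<f", struct.pack("<I", x))[0]

print(max(a_bits, b_bits) == a_bits)                           # unsigned max picks a
print(max(as_s32(a_bits), as_s32(b_bits)) == as_s32(b_bits))   # signed max picks b
print(max(as_f32(a_bits), as_f32(b_bits)) == as_f32(b_bits))   # float max picks b (-0.0 < 1.0)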