Merged

46 commits
18293ce
Short-circuit zero-dimensional splats (#8331)
saagarjha Oct 1, 2025
32e0baf
[AMD][Gluon] Refactor buffer_atomic_rmw API (#8325)
zwu-2025 Oct 1, 2025
bc4ec6a
[Build] Fix crt header download location for CUDA >= 13 (#8336)
saagarjha Oct 1, 2025
7b84e26
[BACKEND] Implement shmem matrix descriptors generically (#8321)
lezcano Oct 1, 2025
532fd37
[AMD] Enhance scaled wmma gluon runtime unit tests (#8339)
PMylon Oct 1, 2025
2d4f16d
[AMD] Use linear layout to infer and emit ds_read_tr (#8235)
nzaghen Oct 1, 2025
872e102
[Build] Fix deprecation warning from TarFile.extractall (#8337)
peterbell10 Oct 1, 2025
75cd616
[WS] assign stage-phase only to partitions that needs it (#8329)
3gx Oct 1, 2025
bc22e6e
[AMD] Add initial support for TDM on gfx1250 (#8333)
borontion Oct 1, 2025
eb7cdba
[KERNELS] Fix and enable batched matmul with split-k. (#8327)
yongjik Oct 1, 2025
210c7b5
[ConSan] ConSan env var should be cache invalidating (#8332)
pawelszczerbuk Oct 1, 2025
48ff763
[Proton] Intra kernel profiling tutorial and examples (#8334)
fywkevin Oct 2, 2025
3e464e9
[Build] Remove Python 3.9 compatibility code for `sysconfig.get_defau…
anmyachev Oct 2, 2025
7fc1d56
Do not use C++20 designed initializers in `TritonNVIDIAGPUToLLVM/Tens…
anmyachev Oct 2, 2025
d9215b9
Bump actions/setup-python from 5 to 6 (#8347)
anmyachev Oct 2, 2025
9273fb3
[AMD][NFC] Move LowerLoops into TritonAMDGPUPipeline (#8341)
knwng Oct 2, 2025
aafec41
[triton_kernels] fused matmul_ogs + comms (#8340)
wuweil-openai Oct 2, 2025
6e4647e
[BACKEND] Lower `tcgen05.cp` via the generic matrix descriptor loweri…
lezcano Oct 2, 2025
1d74879
[ConSan] Make sure kernel is recompiled when consan state changes (#8…
pawelszczerbuk Oct 2, 2025
43dbdd1
[PROTON] Add a flag to disable proton in order to use other profilers…
Jokeren Oct 3, 2025
ec800b5
[PROTON] Simplify proton runtime instrumentation using Triton knobs (…
Jokeren Oct 3, 2025
537dfc8
Fold layout conversion for TMEM Store to fix perf drop for flex attn …
pchen7e2 Oct 3, 2025
be6a688
[Tests] Remove subprocess usage from `test_triton_debuginfo_on` (#8350)
anmyachev Oct 3, 2025
7e042c6
[Build] Remove unused `find_library(TERMINFO_LIBRARY tinfo)` (#8362)
anmyachev Oct 3, 2025
c5d1e01
[TESTS] Remove fresh_knobs from matmul.py::test_op (#8364)
lezcano Oct 3, 2025
73b5dc1
[AMD][GLUON] Add layout in make tensor descriptor (#8355)
borontion Oct 3, 2025
4c388af
[PROTON] Fix TestScopeIdAllocation.cpp random build failure (#8363)
lijinpei Oct 3, 2025
5201154
[AMD] Use lowerLdSt for local_load to ds_read_tr path (#8344)
nzaghen Oct 3, 2025
d5156d7
[KERNELS] Change routing code to avoid storage(). (#8357)
yongjik Oct 3, 2025
88b8a5c
[AMD] disable pointer-canonicalization for large-tensor (#8359)
yangshuxin Oct 3, 2025
1888f81
[KERNELS] remove unwanted device_print =_= (#8367)
yongjik Oct 3, 2025
59aeb6b
[Gluon] Require warp_specialize default_args and worker_args be tuple…
peterbell10 Oct 3, 2025
5d84a91
[mxfp] fix x_scale OOB (#8369)
jongsoo-openai Oct 3, 2025
3910f27
[mxfp] handle values close to max correctly w/o overflow (#8356)
jongsoo-openai Oct 4, 2025
0f91265
Get MLIRContext from `newOp`, not the deleted `load` (#8373)
alexbaden Oct 5, 2025
60605d8
[mxfp] remove col-major assert for mx weight (#8249)
jongsoo-openai Oct 5, 2025
483f9ea
[AMD] Disable flaky atomic cas test on CDNA2 (#8376)
antiagainst Oct 6, 2025
6edcd49
[AMD] Limit vec size for ds_read_tr + padded layouts by min interval …
AlexAUT Oct 6, 2025
d5f3f23
[AMD] Refactor FP conversion mode setting (#8351)
ravil-mobile Oct 6, 2025
0173f75
[AMD] Add Tests for MXFP GEMM Gluon Kernel for GFX1250 (#8371)
knwng Oct 6, 2025
8868aca
Merge commit '210c7b5bb29c01781c3e3053fe6bf28eb178347f'
whitneywhtsang Oct 7, 2025
5c020ef
[WIN] Fix error C7555: use of designated initializers requires at lea…
whitneywhtsang Oct 7, 2025
633d32d
Merge commit '43dbdd1685625ce71daea1caf8a4d90fdea6457f'
whitneywhtsang Oct 7, 2025
609e327
Merge commit '5d84a9122b519251d1453fc7e7f31e2e304dc1d6'
whitneywhtsang Oct 7, 2025
e52429a
Merge commit '0173f7524d8cfc9a5b4b52dec0010eaedef14526'
whitneywhtsang Oct 7, 2025
f21b341
Revert "[mxfp] remove col-major assert for mx weight (#8249)"
whitneywhtsang Oct 8, 2025
1 change: 1 addition & 0 deletions .github/workflows/integration-tests-amd.yml
@@ -13,6 +13,7 @@ jobs:
integration-tests-amd:
runs-on: ${{ matrix.runner }}
timeout-minutes: 45
continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
strategy:
matrix:
runner: ${{ fromJson(inputs.matrix) }}
4 changes: 0 additions & 4 deletions CMakeLists.txt
@@ -89,10 +89,6 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release")
endif()

if(NOT WIN32)
find_library(TERMINFO_LIBRARY tinfo)
endif()

if(TRITON_BUILD_UT)
# This is an aggregate target for all unit tests.
add_custom_target(TritonUnitTests)
26 changes: 0 additions & 26 deletions include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -528,32 +528,6 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
triton::gpu::PaddedSharedEncodingAttr layout,
unsigned bitwidth, Value smemOffset, bool offsetInBytes);

// Emits IR to load data from shared memory into registers, or to store data
// from registers into shared memory.
//
// You supply perVectorCallback, which is called once per group of register
// elements to transfer. You can use this callback to emit IR to load or store
// data from or to shared memory.
//
// elemLlvmTy should be dstTy's element type converted to an LLVM-dialect type.
//
// If maxVecElems is provided, we won't vectorize more than this many elements.
//
// Returns true on success.
[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
Type elemLlvmTy, std::optional<int32_t> maxVecElems,
const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
const TargetInfoBase &target,
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);

[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
Value laneId, Value warpId,
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);

// Close cousin of lowerLdStMatrix in MemoryOpToLLVM.cpp
// We might want to merge them at some point, but having to support
// ldmatrix.trans makes the code in lowerLdStMatrix a bit specific
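For context on the removed declarations: callers supplied perVectorCallback, which was invoked once per group of contiguous register elements. A minimal sketch of a typical call site follows (the surrounding names regTy, sharedTy, llvmElemTy, smemObj, targetInfo, and loadedVals are illustrative assumptions, not code from this PR):

auto callback = [&](VectorType vecTy, Value shmemAddr) {
  // Hypothetical body: emit one vectorized load per contiguous group.
  Value vec = rewriter.create<LLVM::LoadOp>(loc, vecTy, shmemAddr);
  loadedVals.push_back(vec);
};
bool ok = emitTransferBetweenRegistersAndShared(
    regTy, sharedTy, llvmElemTy, /*maxVecElems=*/std::nullopt, smemObj, loc,
    rewriter, targetInfo, callback);
assert(ok && "unsupported register<->shared transfer");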
6 changes: 5 additions & 1 deletion include/triton/Dialect/TritonGPU/Transforms/Schedule.h
@@ -54,6 +54,7 @@ class CoarseSchedule
iterator end() { return orderClusters.end(); }
const_iterator end() const { return orderClusters.end(); }
size_t size() const { return orderClusters.size(); }
void clear() { orderClusters.clear(); }
iterator newAtBack() {
orderClusters.push_back(orderClusters.size());
return std::prev(orderClusters.end());
@@ -157,7 +158,10 @@ class CoarseSchedule
// Set <stage, cluster> based on CoarseSchedule.
void serialize(scf::ForOp &forOp) const;
// Create a CoarseSchedule based on forOp's <stage, cluster>.
LogicalResult deSerialize(scf::ForOp &forOp);
// If normalizeClusterId is true, clusters [minClusterId, maxClusterId] will
// be remapped to [0, maxClusterId - minClusterId].
// If false, it won't remap and clusters [0, maxClusterId] will be created.
LogicalResult deSerialize(scf::ForOp &forOp, bool normalizeClusterId = true);

static ClusterHash hashCluster(Cluster cluster) {
return reinterpret_cast<ClusterHash>(&*cluster);
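A minimal sketch of the cluster-id remapping the new normalizeClusterId flag controls (illustrative arithmetic only, not the pass's actual code):

// normalizeClusterId == true: ids in [minClusterId, maxClusterId] are
// shifted so the smallest observed cluster becomes cluster 0.
int64_t remapped = clusterId - minClusterId; // in [0, maxClusterId - minClusterId]
// normalizeClusterId == false: ids are kept verbatim, so clusters
// [0, maxClusterId] are all created, even if the low ids are unused.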
35 changes: 35 additions & 0 deletions include/triton/Tools/LayoutUtils.h
@@ -147,6 +147,41 @@ std::pair<int, ColumnAction>
largestVectorisation(MLIRContext *ctx, const LinearLayout &cvt, int bitwidth,
std::optional<int> maybeMaxVecElems = std::nullopt);

// Close cousin of doing zerosLike(tile) * divideLeft(cvt, tile)
// This one is a tad more general in the sense that it allows to divide
// cvt:
// - register=1 -> (0, 1)
// register=2 -> (8, 0)
// register=4 -> (0, 8)
// register=8 -> (0, 16)
// register=16 -> (0, 32)
// register=32 -> (0, 64)
// register=64 -> (16, 0)
// - lane=1 -> (0, 2)
// lane=2 -> (0, 4)
// lane=4 -> (1, 0)
// lane=8 -> (2, 0)
// lane=16 -> (4, 0)
// - warp=1 -> (32, 0)
// warp=2 -> (64, 0)
// - block is a size 1 dimension
// where out dims are: [row (size 128), col (size 128)]
// tile:
// - register=1 -> (0, 1)
// register=2 -> (8, 0)
// - lane=1 -> (0, 2)
// lane=2 -> (0, 4)
// lane=4 -> (1, 0)
// lane=8 -> (2, 0)
// lane=16 -> (4, 0)
// - warp=1 -> (32, 0)
// warp=2 -> (64, 0)
// where out dims are: [row (size 128), col (size 8)]
// which would not be possible to lower via the divideLeft approach as we
// cannot divide by the tile given the `register=64 -> (16, 0)` basis.
std::optional<LinearLayout> getReps(const LinearLayout &cvt,
const LinearLayout &tile);

} // namespace mlir::triton

#endif // TRITON_TOOLS_LAYOUTUTILS_H
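A hedged usage sketch of getReps under the contract documented above (cvt and tile as in the comment; the fallback path is an assumption, not this PR's code):

// Try to express cvt as repetitions of the smaller tile.
std::optional<LinearLayout> reps = getReps(cvt, tile);
if (!reps) {
  // cvt does not decompose into repetitions of tile; use a generic lowering.
} else {
  // Each basis retained in *reps selects one tile-sized repetition of cvt.
}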
1 change: 1 addition & 0 deletions include/triton/Tools/Sys/GetEnv.hpp
@@ -43,6 +43,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
"ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
"TRITON_F32_DEFAULT",
"TRITON_PREFER_TMEM_16x256_LAYOUT",
"TRITON_ENABLE_EXPERIMENTAL_CONSAN",
"TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
"TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS",
"TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32",
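Listing TRITON_ENABLE_EXPERIMENTAL_CONSAN here makes its value part of the compilation cache key, so toggling it forces recompilation rather than reuse of kernels built under the old setting. A sketch of reading the flag (getBoolEnv is the accessor this header provides; the surrounding use is illustrative):

#include "triton/Tools/Sys/GetEnv.hpp"

// Cache-invalidating: flipping this between runs cannot silently reuse
// kernels compiled with the old ConSan state.
bool consanEnabled =
    mlir::triton::tools::getBoolEnv("TRITON_ENABLE_EXPERIMENTAL_CONSAN");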
104 changes: 0 additions & 104 deletions lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -706,110 +706,6 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
maybeMaxVecElems, localLoadOp);
}

bool emitTransferBetweenRegistersAndShared(
LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
Value laneId, Value warpId,
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
MLIRContext *ctx = rewriter.getContext();
auto b = TritonLLVMOpBuilder(loc, rewriter);

StringAttr kBlock = str_attr("block");
StringAttr kRegister = str_attr("register");
StringAttr kLane = str_attr("lane");
StringAttr kWarp = str_attr("warp");
StringAttr kOffset = str_attr("offset");

auto shape = sharedTy.getShape();
auto paddedEnc =
dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
LinearLayout regToSharedLayout = LinearLayout::empty();
if (paddedEnc) {
const auto &sharedLL = paddedEnc.getLinearComponent();
regToSharedLayout = regLayout.invertAndCompose(sharedLL);
} else {
auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
regToSharedLayout = regLayout.invertAndCompose(sharedLL);
}

// TODO(jlebar): We don't currently support loading from shared memory in a
// different CTA. We'd need to emit `mapa.shared::cluster` instructions.
if (regToSharedLayout.hasInDim(kBlock) &&
regToSharedLayout.hasOutDim(kBlock) &&
!regToSharedLayout.isTrivialOver({kBlock})) {
return false;
}

// Determine how many consecutive registers map to consecutive shmem elements
// in out-dimension offsetN. This is our load instruction's vector width.
//
// It's OK if the vector width we choose here is wider than the hardware
// supports; LLVM will legalize it.
int vecElems =
std::min({regToSharedLayout.getNumConsecutiveInOut(),
maxVecElems.value_or(std::numeric_limits<int>::max())});
if (paddedEnc) {
vecElems = std::min(vecElems, int(paddedEnc.getMinInterval()));
}

auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
Value blockId =
withCTAOffset ? target.getClusterCTAId(rewriter, loc) : b.i32_val(0);

int numElems = regToSharedLayout.getInDimSize(kRegister);
auto vecTy = vec_ty(elemLlvmTy, vecElems);
SmallVector<uint32_t> regIds;
for (int i = 0; i < numElems / vecElems; i++) {
regIds.push_back(i * vecElems);
}

auto smemBase = smemObj.getBase();

auto indicesVec = applyLinearLayoutVec(loc, rewriter, regToSharedLayout,
{{kRegister, b.i32_val(0)},
{kLane, laneId},
{kWarp, warpId},
{kBlock, blockId}},
regIds);

// Compute affine offset given by memdesc_subslice
auto offset = smemObj.getShmemOffset(loc, rewriter, sharedTy);
SmallVector<Value> vecAddrVec;
for (auto &indices : indicesVec) {
Value smemOffset = indices[0].second;
smemOffset = b.xor_(smemOffset, offset);
if (paddedEnc) {
// Apply the offset needed for padding.
auto bitwidth = elemLlvmTy.getIntOrFloatBitWidth();
Value padOffset = emitPadding(loc, rewriter, paddedEnc, bitwidth,
smemOffset, /*offsetInBytes=*/false);
smemOffset = b.add(smemOffset, padOffset);
}
auto vecAddr = b.gep(smemBase.getType(), elemLlvmTy, smemBase, smemOffset,
LLVM::GEPNoWrapFlags::inbounds);
vecAddrVec.push_back(vecAddr);
}

for (Value &vecAddr : vecAddrVec) {
perVectorCallback(vecTy, vecAddr);
}
return true;
}

bool emitTransferBetweenRegistersAndShared(
RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
Type elemLlvmTy, std::optional<int32_t> maxVecElems,
const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
const TargetInfoBase &target,
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
auto regLayout = triton::gpu::toLinearLayout(registerTy);
auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
return emitTransferBetweenRegistersAndShared(
regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
target, laneId, warpId, perVectorCallback);
}

SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
RewriterBase &rewriter) {
assert(bool(llvmStruct) && "can not unpack null values");