Skip to content

Commit

Permalink
Fix broadcast (intel#2180)
Browse files Browse the repository at this point in the history
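Relax the upstream broadcast constraint: the Intel lowering and match-target-size passes now operate on `triton_intel_gpu.broadcast` in place of `tt.broadcast`.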
  • Loading branch information
Dewei-Wang-sh authored Sep 13, 2024
1 parent 7f5ca47 commit e5f0e02
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 50 deletions.
4 changes: 2 additions & 2 deletions test/Conversion/intel/triton_to_tritongpu_warp.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ module {
// CHECK1: [[EXP_DIM2:%.*]] = tt.expand_dims {{%.*}} {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
%31 = tt.expand_dims %19 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
%32 = tt.expand_dims %20 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
// CHECK1: [[BC1:%.*]] = tt.broadcast [[EXP_DIM1]] : tensor<128x1xi32, #blocked> -> tensor<128x64xi32, #blocked>
// CHECK1: [[BC1:%.*]] = triton_intel_gpu.broadcast [[EXP_DIM1]] : tensor<128x1xi32, #blocked> -> tensor<128x64xi32, #blocked>
%33 = tt.broadcast %31 : tensor<128x1xi32> -> tensor<128x64xi32>
%34 = tt.splat %21 : f32 -> tensor<128x64xf32>
%35:5 = scf.for %arg6 = %26 to %28 step %c64_i32 iter_args(%arg7 = %25#0, %arg8 = %25#1, %arg9 = %25#2, %arg10 = %30, %arg11 = %29) -> (tensor<128xf32>, tensor<128x64xf32>, tensor<128xf32>, !tt.ptr<tensor<64x64xf16>>, !tt.ptr<tensor<64x64xf16>>) : i32 {
Expand All @@ -297,7 +297,7 @@ module {
%41 = tt.splat %arg6 : i32 -> tensor<1x64xi32>
// CHECK1: [[OFFSET:%.*]] = arith.addi {{%.*}}, [[EXP_DIM2]] : tensor<1x64xi32, #blocked>
%42 = arith.addi %41, %32 : tensor<1x64xi32>
// CHECK1: [[BC2:%.*]] = tt.broadcast [[OFFSET]] : tensor<1x64xi32, #blocked> -> tensor<128x64xi32, #blocked>
// CHECK1: [[BC2:%.*]] = triton_intel_gpu.broadcast [[OFFSET]] : tensor<1x64xi32, #blocked> -> tensor<128x64xi32, #blocked>
%43 = tt.broadcast %42 : tensor<1x64xi32> -> tensor<128x64xi32>
// CHECK1: arith.cmpi sge, [[BC1]], [[BC2]] : tensor<128x64xi32, #blocked>
%44 = arith.cmpi sge, %33, %43 : tensor<128x64xi32>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
// CHECK: [[VAL_4:%.*]] = llvm.shufflevector [[VAL_3]], [[VAL_1]] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xf32>
%0 = tt.splat %arg0 : f32 -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>>
%1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>> -> tensor<16x1xf32, #warp>
%2 = tt.broadcast %1 : tensor<16x1xf32, #warp> -> tensor<16x16xf32>
%2 = triton_intel_gpu.broadcast %1 : tensor<16x1xf32, #warp> -> tensor<16x16xf32>
tt.return %2 : tensor<16x16xf32>
}

Expand All @@ -222,7 +222,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
// CHECK: llvm.shufflevector [[VEC]], [[EMPTY]] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>
%0 = tt.make_range {start = 0 : i32, end = 16 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #warp}>>
%1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #warp}>> -> tensor<1x16xi32, #warp>
%2 = tt.broadcast %1 : tensor<1x16xi32, #warp> -> tensor<16x16xi32>
%2 = triton_intel_gpu.broadcast %1 : tensor<1x16xi32, #warp> -> tensor<16x16xi32>
tt.return %2 : tensor<16x16xi32>
}

Expand Down
22 changes: 11 additions & 11 deletions test/TritonIntelGPU/match-target-size.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -469,9 +469,9 @@ tt.func public @attn_fwd(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.pt
%34 = arith.mulf %30, %24 : tensor<16x64xf32, #warp>

// CHECK: tt.expand_dims {{.*}} {axis = 1 : i32} : tensor<16xf32
// CHECK: tt.broadcast {{.*}} -> tensor<16x16xf32>
// CHECK: triton_intel_gpu.broadcast {{.*}} -> tensor<16x16xf32>
%35 = tt.expand_dims %33 {axis = 1 : i32} : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>> -> tensor<16x1xf32, #warp>
%36 = tt.broadcast %35 : tensor<16x1xf32, #warp> -> tensor<16x64xf32, #warp>
%36 = triton_intel_gpu.broadcast %35 : tensor<16x1xf32, #warp> -> tensor<16x64xf32, #warp>
%37 = arith.subf %34, %36 : tensor<16x64xf32, #warp>
%38 = math.exp2 %37 : tensor<16x64xf32, #warp>
%39 = "tt.reduce"(%38) <{axis = 1 : i32}> ({
Expand All @@ -484,7 +484,7 @@ tt.func public @attn_fwd(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.pt
%42 = arith.mulf %arg7, %41 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>>
%43 = arith.addf %42, %39 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>>
%44 = tt.expand_dims %41 {axis = 1 : i32} : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>> -> tensor<16x1xf32, #warp>
%45 = tt.broadcast %44 : tensor<16x1xf32, #warp> -> tensor<16x64xf32, #warp>
%45 = triton_intel_gpu.broadcast %44 : tensor<16x1xf32, #warp> -> tensor<16x64xf32, #warp>
%46 = arith.mulf %arg8, %45 : tensor<16x64xf32, #warp>
%47 = tt.load %arg10 : !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
%48 = arith.truncf %38 : tensor<16x64xf32, #warp> to tensor<16x64xf16, #warp>
Expand All @@ -500,7 +500,7 @@ tt.func public @attn_fwd(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.pt
scf.yield %43, %50, %33, %51, %52 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>>, tensor<16x64xf32, #warp>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>>, !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>, !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
} {triton_gpu.workload = 4 : i32, tt.divisibility_arg1 = dense<64> : tensor<1xi32>}
%26 = tt.expand_dims %25#0 {axis = 1 : i32} : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #warp}>> -> tensor<16x1xf32, #warp>
%27 = tt.broadcast %26 : tensor<16x1xf32, #warp> -> tensor<16x64xf32, #warp>
%27 = triton_intel_gpu.broadcast %26 : tensor<16x1xf32, #warp> -> tensor<16x64xf32, #warp>
%28 = arith.divf %25#1, %27 : tensor<16x64xf32, #warp>
tt.store %20, %28 : !tt.ptr<tensor<16x64xf32, #warp>>
tt.return
Expand Down Expand Up @@ -534,18 +534,18 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
%2 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #warp}>> -> tensor<16x1xi32, #warp>
%3 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #warp}>> -> tensor<1x64xi32, #warp>

// CHECK: %[[BC1:.*]] = tt.broadcast %[[ED1]] : tensor<16x1xi32, #warp> -> tensor<16x16xi32>
%4 = tt.broadcast %2 : tensor<16x1xi32, #warp> -> tensor<16x64xi32, #warp>
// CHECK: %[[BC1:.*]] = triton_intel_gpu.broadcast %[[ED1]] : tensor<16x1xi32, #warp> -> tensor<16x16xi32>
%4 = triton_intel_gpu.broadcast %2 : tensor<16x1xi32, #warp> -> tensor<16x64xi32, #warp>

// CHECK: %[[EX0:.*]] = triton_intel_gpu.extract %[[ED2]][0] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
// CHECK: %[[BC20:.*]] = tt.broadcast %[[EX0]] : tensor<1x16xi32> -> tensor<16x16xi32>
// CHECK: %[[BC20:.*]] = triton_intel_gpu.broadcast %[[EX0]] : tensor<1x16xi32> -> tensor<16x16xi32>
// CHECK: %[[EX1:.*]] = triton_intel_gpu.extract %[[ED2]][1] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
// CHECK: %[[BC21:.*]] = tt.broadcast %[[EX1]] : tensor<1x16xi32> -> tensor<16x16xi32>
// CHECK: %[[BC21:.*]] = triton_intel_gpu.broadcast %[[EX1]] : tensor<1x16xi32> -> tensor<16x16xi32>
// CHECK: %[[EX2:.*]] = triton_intel_gpu.extract %[[ED2]][2] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
// CHECK: %[[BC22:.*]] = tt.broadcast %[[EX2]] : tensor<1x16xi32> -> tensor<16x16xi32>
// CHECK: %[[BC22:.*]] = triton_intel_gpu.broadcast %[[EX2]] : tensor<1x16xi32> -> tensor<16x16xi32>
// CHECK: %[[EX3:.*]] = triton_intel_gpu.extract %[[ED2]][3] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
// CHECK: %[[BC23:.*]] = tt.broadcast %[[EX3]] : tensor<1x16xi32> -> tensor<16x16xi32>
%5 = tt.broadcast %3 : tensor<1x64xi32, #warp> -> tensor<16x64xi32, #warp>
// CHECK: %[[BC23:.*]] = triton_intel_gpu.broadcast %[[EX3]] : tensor<1x16xi32> -> tensor<16x16xi32>
%5 = triton_intel_gpu.broadcast %3 : tensor<1x64xi32, #warp> -> tensor<16x64xi32, #warp>

// CHECK: arith.addi %[[BC1]], %[[BC20]] : tensor<16x16xi32>
// CHECK: arith.addi %[[BC1]], %[[BC21]] : tensor<16x16xi32>
Expand Down
3 changes: 2 additions & 1 deletion third_party/intel/include/TritonToTritonGPUWarp/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def ConvertTritonToTritonGPUWarp: Pass<"convert-triton-to-tritongpu-warp", "mlir
"mlir::math::MathDialect",
"mlir::scf::SCFDialect",
"mlir::triton::TritonDialect",
"mlir::triton::gpu::TritonGPUDialect"];
"mlir::triton::gpu::TritonGPUDialect",
"mlir::triton::gpu::intel::TritonIntelGPUDialect"];

let options = [
Option<"numWarps", "num-warps",
Expand Down
45 changes: 24 additions & 21 deletions third_party/intel/lib/TritonIntelGPUToLLVM/TritonOpsToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "PatternTritonGPUOpToLLVM.h"

#include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"

#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
Expand All @@ -18,7 +19,7 @@

using namespace mlir;
using namespace mlir::triton;
using namespace mlir::triton::gpu::intel;
namespace ttgi = mlir::triton::gpu::intel;

namespace {

Expand Down Expand Up @@ -124,8 +125,9 @@ class AdvanceOpConversion : public ConvertTritonGPUOpToLLVMPattern<AdvanceOp> {
/// elemSize 32)
/// Arg 12: cache controls options (LSC_CACHE_OPTS)
/// Arg 13: stored value
template <typename OpType, typename = std::enable_if_t<llvm::is_one_of<
OpType, PrefetchOp, LoadOp, StoreOp>::value>>
template <typename OpType,
typename = std::enable_if_t<llvm::is_one_of<OpType, ttgi::PrefetchOp,
LoadOp, StoreOp>::value>>
class LoadStorePrefetchOpConversion
: public ConvertTritonGPUOpToLLVMPattern<OpType> {
public:
Expand Down Expand Up @@ -228,7 +230,7 @@ class LoadStorePrefetchOpConversion
VERIFY_OPERATION(load)

rewriter.replaceOp(op, bitcast(load, resType));
} else if constexpr (std::is_same_v<OpType, PrefetchOp>) {
} else if constexpr (std::is_same_v<OpType, ttgi::PrefetchOp>) {
if (transpose)
std::swap(offsetX, offsetY);
auto newOp = rewriter.create<TritonGEN::Matrix2DBlockPrefetchOp>(
Expand Down Expand Up @@ -377,12 +379,12 @@ class DotOpConversion : public ConvertTritonGPUOpToLLVMPattern<DotOp> {
}
};

class GlueOpConversion : public ConvertTritonGPUOpToLLVMPattern<GlueOp> {
class GlueOpConversion : public ConvertTritonGPUOpToLLVMPattern<ttgi::GlueOp> {
public:
using ConvertTritonGPUOpToLLVMPattern<
GlueOp>::ConvertTritonGPUOpToLLVMPattern;
ttgi::GlueOp>::ConvertTritonGPUOpToLLVMPattern;
LogicalResult
matchAndRewrite(GlueOp op, OpAdaptor adaptor,
matchAndRewrite(ttgi::GlueOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
ValueRange operands = adaptor.getOperands();
Value result = TypeSwitch<Type, Value>(operands.front().getType())
Expand All @@ -401,7 +403,7 @@ class GlueOpConversion : public ConvertTritonGPUOpToLLVMPattern<GlueOp> {
}

private:
Value vectorGlueOp(GlueOp op, ValueRange operands,
Value vectorGlueOp(ttgi::GlueOp op, ValueRange operands,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
if (!llvm::isPowerOf2_64(operands.size())) {
Expand Down Expand Up @@ -438,7 +440,7 @@ class GlueOpConversion : public ConvertTritonGPUOpToLLVMPattern<GlueOp> {
return treeVectorGlueOp(loc, res, rewriter);
}

Value scalarGlueOp(GlueOp op, ValueRange operands,
Value scalarGlueOp(ttgi::GlueOp op, ValueRange operands,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
auto dstType =
Expand All @@ -459,12 +461,13 @@ class GlueOpConversion : public ConvertTritonGPUOpToLLVMPattern<GlueOp> {
/// %extract = ttgi.extract %a[0] : tensor<8xf16> -> tensor<4xf16>
/// is converted to
/// %extract = llvm.shufflevector %a, %a : [0, 1, 2, 3] : vector<4xf16>
class ExtractOpConversion : public ConvertTritonGPUOpToLLVMPattern<ExtractOp> {
class ExtractOpConversion
: public ConvertTritonGPUOpToLLVMPattern<ttgi::ExtractOp> {
public:
using ConvertTritonGPUOpToLLVMPattern<
ExtractOp>::ConvertTritonGPUOpToLLVMPattern;
ttgi::ExtractOp>::ConvertTritonGPUOpToLLVMPattern;
LogicalResult
matchAndRewrite(ExtractOp op, OpAdaptor adaptor,
matchAndRewrite(ttgi::ExtractOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Location loc = op.getLoc();
Value base = adaptor.getBase();
Expand Down Expand Up @@ -704,12 +707,12 @@ class ExpandDimsOpConversion
};

class BroadcastOpConversion
: public ConvertTritonGPUOpToLLVMPattern<triton::BroadcastOp> {
: public ConvertTritonGPUOpToLLVMPattern<ttgi::BroadcastOp> {
public:
using ConvertTritonGPUOpToLLVMPattern<
triton::BroadcastOp>::ConvertTritonGPUOpToLLVMPattern;
ttgi::BroadcastOp>::ConvertTritonGPUOpToLLVMPattern;
LogicalResult
matchAndRewrite(triton::BroadcastOp op, OpAdaptor adaptor,
matchAndRewrite(ttgi::BroadcastOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
constexpr unsigned subgroupSize = 16;

Expand Down Expand Up @@ -745,13 +748,13 @@ class BroadcastOpConversion
};

class SubGroupTransposeOpConversion
: public ConvertTritonGPUOpToLLVMPattern<SubGroupTransposeOp> {
: public ConvertTritonGPUOpToLLVMPattern<ttgi::SubGroupTransposeOp> {
public:
using ConvertTritonGPUOpToLLVMPattern<
SubGroupTransposeOp>::ConvertTritonGPUOpToLLVMPattern;
ttgi::SubGroupTransposeOp>::ConvertTritonGPUOpToLLVMPattern;

LogicalResult
matchAndRewrite(SubGroupTransposeOp op, OpAdaptor adaptor,
matchAndRewrite(ttgi::SubGroupTransposeOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const final {
Value src = adaptor.getSrc();
auto vecTy = cast<VectorType>(src.getType());
Expand Down Expand Up @@ -862,12 +865,12 @@ void mlir::triton::intel::populateTritonOpsToLLVMPatterns(
patterns.add<ExpandDimsOpConversion>(typeConverter, benefit);
patterns.add<ExtractOpConversion>(typeConverter, benefit);
patterns.add<GlueOpConversion>(typeConverter, benefit);
patterns.add<LoadStorePrefetchOpConversion<PrefetchOp>>(typeConverter,
benefit);
patterns.add<LoadStorePrefetchOpConversion<ttgi::PrefetchOp>>(typeConverter,
benefit);
patterns.add<LoadStorePrefetchOpConversion<LoadOp>>(typeConverter, benefit);
patterns.add<LoadStorePrefetchOpConversion<StoreOp>>(typeConverter, benefit);
patterns.add<MakeTensorPtrOpConversion>(typeConverter, benefit);
if (applyTransposedReduction())
if (ttgi::applyTransposedReduction())
patterns.add<TransposedReduceOpConversion>(typeConverter, benefit);
else
patterns.add<ReduceOpConversion>(typeConverter, benefit);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,8 @@ class TritonIntelGPUDistributeToWarpsPass
else if (auto makeRange = dyn_cast<tt::MakeRangeOp>(op))
distributeMakeRangeOp(makeRange, warpId);
else if (isa<tt::LoadOp, tt::DotOp, tt::AdvanceOp, tt::ReduceOp,
tt::SplatOp, tt::BroadcastOp, tt::ExpandDimsOp>(op) ||
tt::SplatOp, tt::BroadcastOp, ttgi::BroadcastOp,
tt::ExpandDimsOp>(op) ||
op->getDialect() == arithDialect ||
op->getDialect() == mathDialect)
distributeGenericOp(op);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ class MatchTargetSizePass
transformMakeTensorPtrOp(ptrOp);
} else if (auto dot = dyn_cast<tt::DotOp>(op))
transformDotOp(dot);
else if (auto bc = dyn_cast<tt::BroadcastOp>(op))
else if (auto bc = dyn_cast<ttgi::BroadcastOp>(op))
transformBroadcastOp(bc);
else
transformGenericOp(op);
Expand Down Expand Up @@ -391,7 +391,7 @@ class MatchTargetSizePass
void transformDotOp(tt::DotOp dot);
void transformReduceOp(tt::ReduceOp op);
void transformTransposedReduceOp(tt::ReduceOp op);
void transformBroadcastOp(tt::BroadcastOp op);
void transformBroadcastOp(ttgi::BroadcastOp op);
void transformMakeRangeOp(tt::MakeRangeOp op);

/// Generic transformation.
Expand Down Expand Up @@ -1155,7 +1155,7 @@ void MatchTargetSizePass::transformDotOp(tt::DotOp dot) {
dot->erase();
}

void MatchTargetSizePass::transformBroadcastOp(tt::BroadcastOp op) {
void MatchTargetSizePass::transformBroadcastOp(ttgi::BroadcastOp op) {
OpBuilder b(op);
Location loc = op->getLoc();
RankedTensorType resType = op.getResult().getType();
Expand All @@ -1170,14 +1170,14 @@ void MatchTargetSizePass::transformBroadcastOp(tt::BroadcastOp op) {
unsigned resDim1 = resType.getShape()[1];
Operation *glue;
if (srcDim0 == dstDim0) {
Value newOp = b.create<tt::BroadcastOp>(loc, tType, op.getSrc());
Value newOp = b.create<ttgi::BroadcastOp>(loc, tType, op.getSrc());
unsigned num = resType.getShape()[1] / tType.getShape()[1];
SmallVector<Value> ops(num, newOp);
glue = b.create<ttgi::GlueOp>(loc, resType, ops);
} else if (srcDim0 == 2 * dstDim0) {
auto newTy = RankedTensorType::get({srcDim0, tType.getShape()[1]},
tType.getElementType());
auto newOp = b.create<tt::BroadcastOp>(loc, newTy, op.getSrc());
auto newOp = b.create<ttgi::BroadcastOp>(loc, newTy, op.getSrc());
auto extract0 = b.create<ttgi::ExtractOp>(loc, tType, newOp, 0);
auto extract1 = b.create<ttgi::ExtractOp>(loc, tType, newOp, 1);
SmallVector<Value> ops{extract0, extract1, extract0, extract1,
Expand All @@ -1193,7 +1193,7 @@ void MatchTargetSizePass::transformBroadcastOp(tt::BroadcastOp op) {
SmallVector<Value> subBroadcasts;
for (int i = 0; i < nExtracts; ++i) {
auto ext = b.create<ttgi::ExtractOp>(loc, subRowVecTy, op.getSrc(), i);
auto sbc = b.create<tt::BroadcastOp>(loc, tType, ext);
auto sbc = b.create<ttgi::BroadcastOp>(loc, tType, ext);
subBroadcasts.push_back(sbc);
}

Expand Down
1 change: 1 addition & 0 deletions third_party/intel/lib/TritonToTritonGPUWarp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ add_triton_library(TritonToTritonGPUWarp
TritonIR
TritonGPUIR
TritonGPUTransforms
TritonIntelGPUIR
)
Loading

0 comments on commit e5f0e02

Please sign in to comment.