Skip to content

Commit

Permalink
Sorted all invocations of alloca (#2981)
Browse files Browse the repository at this point in the history
Signed-off-by: Alexandre Eichenberger <[email protected]>
  • Loading branch information
AlexandreEichenberger authored Oct 18, 2024
1 parent c934b1d commit 1435011
Show file tree
Hide file tree
Showing 17 changed files with 574 additions and 74 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ Value ZTensorHelper::getPreTransformedDescPtr(zdnn_data_types zDNNDataType,
Type llvmZTensorDescStructTy = getZTensorDescStructTy(context);
Value one = create.llvm.constant(llvmI64Ty, static_cast<int64_t>(1));

// Alloca is fine for LLVM structs; if we were to use alloc, we would also
// have to manually insert free calls. So alloca makes total sense here.
Value preTransformedDescPtr = create.llvm._alloca(
krnl::getPointerType(context, llvmZTensorDescStructTy),
llvmZTensorDescStructTy, one,
Expand Down
18 changes: 3 additions & 15 deletions src/Conversion/KrnlToAffine/KrnlMatmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,13 +372,7 @@ class KrnlMatmulLowering : public ConversionPattern {
assert(BUFFER_ALIGN >= gDefaultAllocAlign &&
"alignment of buffers cannot be smaller than the default alignment "
"(which is set for SIMD correctness");
// TODO: alloca is good as it help simplify away this data structures (as it
// is only used as local temp, basically extensions of registers). However,
// there might be issues with non-removed alloca when they are not in the
// innermost loop. Still think its worth it having alloca as we want
// eventually all the refs to alloca to be register/spill access, not memory
// load/stores.
Value TmpProd = create.mem.alignedAlloca(CTmpType, BUFFER_ALIGN);
Value TmpProd = create.mem.alignedAlloc(CTmpType, BUFFER_ALIGN);
// Init with zero.
Value fZero = create.math.constant(elementType, 0);
Value vFZero = create.vec.broadcast(vecType, fZero);
Expand Down Expand Up @@ -455,13 +449,7 @@ class KrnlMatmulLowering : public ConversionPattern {
// Have to privatize CTmpType by unroll factor (1 if none).
MemRefType CTmpType = MemRefType::get({unrollFactor}, vecType);
assert(BUFFER_ALIGN >= gDefaultAllocAlign);
// TODO: alloca is good as it help simplify away this data structures (as it
// is only used as local temp, basically extensions of registers). However,
// there might be issues with non-removed alloca when they are not in the
// innermost loop. Still think its worth it having alloca as we want
// eventually all the refs to alloca to be register/spill access, not memory
// load/stores.
Value TmpC = create.mem.alignedAlloca(CTmpType, BUFFER_ALIGN);
Value TmpC = create.mem.alignedAlloc(CTmpType, BUFFER_ALIGN);

// Iterates over the I indices (j are simd dim).
Value iSaved, kSaved;
Expand All @@ -473,7 +461,7 @@ class KrnlMatmulLowering : public ConversionPattern {
MultiDialectBuilder<MathBuilder, VectorBuilder> create(createAffine);
Value i = loopInd[0];
iSaved = i; // Saved for unroll and jam.
// Alloca temp vector TmpC and save C(i)/0.0 into it.
// Alloc temp vector TmpC and save C(i)/0.0 into it.
Value initVal = create.vec.loadIE(vecType, C, cStart, {i, iZero});
Value tmpCAccess = (unrollFactor > 1) ? i : zeroIE.getValue();
createAffine.store(initVal, TmpC, tmpCAccess);
Expand Down
7 changes: 4 additions & 3 deletions src/Conversion/KrnlToLLVM/KrnlEntryPoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ class KrnlEntryPointOpLowering : public OpRewritePattern<KrnlEntryPointOp> {
// entry point instead of the wrapped static entry point.
Type memRefOutTy = staticEntryPointFuncTy.getReturnTypes()[0];
Type memRefOutPtrTy = getPointerType(context, memRefOutTy);
Value ptrToOutMemRef =
Value ptrToOutMemRef = // alloca ok as there is only one entry point.
create.llvm._alloca(memRefOutPtrTy, memRefOutTy, one, /*alignment=*/0);
staticInputs.emplace_back(ptrToOutMemRef);

Expand All @@ -250,7 +250,7 @@ class KrnlEntryPointOpLowering : public OpRewritePattern<KrnlEntryPointOp> {
// Original input is shifted by 1 in the iface func.
Type memRefInTy = typeConverter.convertType(origInputMemRefTypes[i - 1]);
Type memRefInPtrTy = getPointerType(context, memRefInTy);
Value ptrToMemRef =
Value ptrToMemRef = // alloca ok as there is only one entry point.
create.llvm._alloca(memRefInPtrTy, memRefInTy, one, /*alignment=*/0);

// Fill in the memref underlying ptrToMemRef with information extracted
Expand Down Expand Up @@ -287,7 +287,8 @@ class KrnlEntryPointOpLowering : public OpRewritePattern<KrnlEntryPointOp> {

Value numOutput = create.llvm.constant(
int64Ty, static_cast<int64_t>(outMemRefList.size()));
// Assume that OMTensor pointer size is 8
// Assume that OMTensor pointer size is 8.
// Alloca is ok as it is only for one small data structure per parameter.
Value outOmtPtrsArr = create.llvm._alloca(
omTensorPtrAddrTy, opaquePtrTy, numOutput, /*alignment=*/0);

Expand Down
31 changes: 9 additions & 22 deletions src/Conversion/ONNXToKrnl/Math/Gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
MultiDialectBuilder<KrnlBuilder, MemRefBuilder, MathBuilder> create(
createKrnl);
// Create temp, single scalar, no need for default alignment.
// Alloca is ok here as it is for a scalar, and in the generic version
// of GEMM.
Value red = create.mem.alloca(MemRefType::get({}, elementType));
// Set to zero.
create.krnl.store(zeroVal, red);
Expand Down Expand Up @@ -203,14 +205,6 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
MemRefType bTileType =
MemRefType::get({kCacheTile, jCacheTile}, elementType);
SmallVector<IndexExpr, 1> empty;
// Allocate here on heap, only when no parallelism.
Value aBuff, bBuff, rBuff;
if (!enableParallel) {
aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN);
if (mustTileR)
rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
}

// 3) introduce the loops and permute them
// I, J, K loop.
Expand Down Expand Up @@ -253,13 +247,10 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
{I, J, K},
[&](const KrnlBuilder &createKrnl, ValueRange i1_j1_indices) {
Value i1(i1_j1_indices[0]), j1(i1_j1_indices[1]);
// If parallel, allocate on stack inside the parallel region.
if (enableParallel) {
aBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
bBuff = create.mem.alignedAlloca(bTileType, BUFFER_ALIGN);
if (mustTileR)
rBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
}
// If parallel, they will stay inside; otherwise they will migrate out.
Value aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
Value bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN);
Value rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
createKrnl.copyToBuffer(rBuff, R, {i1, j1}, zeroVal, false);
createKrnl.iterateIE({}, {kk1}, {}, {},
[&](const KrnlBuilder &createKrnl, ValueRange k1_index) {
Expand Down Expand Up @@ -321,13 +312,9 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
{J, K, I},
[&](const KrnlBuilder &createKrnl, ValueRange j1_k1_indices) {
Value j1(j1_k1_indices[0]), k1(j1_k1_indices[1]);
// If parallel, allocate on stack inside the parallel region.
if (enableParallel) {
aBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
bBuff = create.mem.alignedAlloca(bTileType, BUFFER_ALIGN);
if (mustTileR)
rBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
}
// If parallel, it will stay inside, otherwise it will migrate out.
Value aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
Value bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN);
if (bTrans)
createKrnl.copyToBuffer(bBuff, B, {j1, k1}, zeroVal, true);
else
Expand Down
26 changes: 14 additions & 12 deletions src/Conversion/ONNXToKrnl/Math/Reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {

void genOneHorizontalSimdReduction(ConversionPatternRewriter &rewriter,
MDBuilder &create, Operation *op, Type elementType, VectorType vecType,
Value tmpAlloca, Value flatInput, Value flatAlloc, Value initVec,
Value tmpAlloc, Value flatInput, Value flatAlloc, Value initVec,
Value divisorForMean, ValueRange outLoopInd, Value simdUB, int64_t VL,
bool simdOnly) const {
IndexExpr lb = LitIE(0);
Expand All @@ -1076,7 +1076,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
rewriter, create.getLoc(), elementType);
create.krnl.simdReduceIE(lb, ub, VL, simdOnly,
/* inputs*/ {flatInput}, {inputAF},
/* temp */ {tmpAlloca}, {tmpAF},
/* temp */ {tmpAlloc}, {tmpAF},
/* output */ {flatAlloc}, {outputAF},
/* init */ {identity},
/* reduction simd/scalar */
Expand Down Expand Up @@ -1145,20 +1145,21 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
onnxToKrnlParallelReport(
op, true, 0, lbs[0], flatOutDims[0], "reduction h-simd");
} else {
enableParallel = false;
onnxToKrnlParallelReport(op, false, 0, lbs[0], flatOutDims[0],
"not enough work for reduction h-simd");
}
}
create.krnl.iterateIE(outLoopDef, outLoopDef, lbs, flatOutDims,
[&](const KrnlBuilder &ck, ValueRange outLoopInd) {
MDBuilder create(ck);
// Allocate temp inside loop (because of parallel).
Value tmpAlloca = create.mem.alignedAlloca(tmpType);
// When parallel, will stay inside; otherwise will migrate out.
Value tmpAlloc = create.mem.alignedAlloc(tmpType);
Value identity = getIdentityValue<ONNXReductionOp>(
rewriter, create.getLoc(), elementType);
Value initVec = create.vec.splat(vecType, identity);
genOneHorizontalSimdReduction(rewriter, create, op, elementType,
vecType, tmpAlloca, flatInput, flatAlloc, initVec, divisorForMean,
vecType, tmpAlloc, flatInput, flatAlloc, initVec, divisorForMean,
outLoopInd, simdUB, VL, simdOnly);
});
}
Expand All @@ -1183,7 +1184,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {

void genVlHorizontalSimdReduction(ConversionPatternRewriter &rewriter,
MDBuilder &create, Operation *op, Type elementType, VectorType vecType,
Value tmpBlockedAlloca, Value flatInput, Value flatAlloc, Value initVec,
Value tmpBlockedAlloc, Value flatInput, Value flatAlloc, Value initVec,
Value divisorForMean, ValueRange blockedOutLoopInd,
IndexExpr blockedCurrIndex, Value simdUB, int64_t VL,
bool simdOnly) const {
Expand All @@ -1200,7 +1201,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
rewriter, create.getLoc(), elementType);
if (simdOnly) {
create.affine.simdReduce2DIE(
lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloca, tmpAF,
lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloc, tmpAF,
flatAlloc, outputAF, identity,
[&](const AffineBuilder &b, Value inputVal, Value tmpVal,
int64_t VL) {
Expand All @@ -1215,7 +1216,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
});
} else {
create.scf.simdReduce2DIE( // Affine fails with dynamic shapes.
lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloca, tmpAF,
lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloc, tmpAF,
flatAlloc, outputAF, identity,
[&](const SCFBuilder &b, Value inputVal, Value tmpVal, int64_t VL) {
Type type = VL > 1 ? vecType : elementType;
Expand Down Expand Up @@ -1298,15 +1299,16 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
onnxToKrnlParallelReport(op, true, parId, lbs[parId],
flatOutDims[parId], "reduction shuffle h-simd");
} else {
enableParallel = false;
onnxToKrnlParallelReport(op, false, 0, lbs[0], flatOutDims[0],
"not enough work for reduction shuffle h-simd");
}
}
create.krnl.iterateIE(outLoopDef, optimizedOutLoopDef, lbs, flatOutDims,
[&](const KrnlBuilder &ck, ValueRange blockedOutLoopInd) {
MDBuilder create(ck);
// Create temp inside loop (because of parallel).
Value tmpBlockedAlloca = create.mem.alignedAlloca(tmpBlockedType);
// When parallel, will stay inside; otherwise will migrate out.
Value tmpBlockedAlloc = create.mem.alignedAlloc(tmpBlockedType);
Value identity = getIdentityValue<ONNXReductionOp>(
rewriter, create.getLoc(), elementType);
Value initVec = create.vec.splat(vecType, identity);
Expand Down Expand Up @@ -1336,7 +1338,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
outLoopInd.emplace_back(blockLocalInd);
// Perform reduction for one output value.
genOneHorizontalSimdReduction(rewriter, create, op,
elementType, vecType, tmpBlockedAlloca, flatInput,
elementType, vecType, tmpBlockedAlloc, flatInput,
flatAlloc, initVec, divisorForMean, outLoopInd,
simdUB, VL, simdOnly);
}); /* for inside blocked loop */
Expand All @@ -1345,7 +1347,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
MDBuilder create(scf);
// create.krnl.printf("full tile\n");
genVlHorizontalSimdReduction(rewriter, create, op, elementType,
vecType, tmpBlockedAlloca, flatInput, flatAlloc, initVec,
vecType, tmpBlockedAlloc, flatInput, flatAlloc, initVec,
divisorForMean, blockedOutLoopInd, blockedCurrIndex, simdUB,
VL, simdOnly);
});
Expand Down
14 changes: 12 additions & 2 deletions src/Conversion/ONNXToKrnl/NN/Normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ struct ONNXInstanceNormalizationOpLowering
create.krnlIE.getShapeAsSymbols(inputMemRef, inputBounds);
MemRefType tmpType = MemRefType::get({}, elementType);
Value fZero = create.math.constant(elementType, 0);
// Ok to use alloca, just one scalar.
Value tmpMemRef = create.mem.alloca(tmpType);

// Compute the number of values in a single channel: product of spatial
Expand Down Expand Up @@ -957,12 +958,21 @@ struct GenericLayerNormaOpLowering : public OpConversionPattern<OP_TYPE> {
} else {
onnxToKrnlParallelReport(op, false, -1, -1, "no parallel in layer norm");
}
Value tmpRedMemRef, tmpRedMemRef2;
if (!useParallel) {
// Sequential, alloc before loop.
tmpRedMemRef = create.mem.alignedAlloc(tmpRedType);
tmpRedMemRef2 = create.mem.alignedAlloc(tmpRedType);
}
create.krnl.forLoopIE(LitIE(0), XFlatDims[0], /*step*/ B, useParallel,
[&](const KrnlBuilder &ck, ValueRange blockedLoopIndices) {
MDBuilder create(ck);
IndexExprScope innerScope(ck);
Value tmpRedMemRef = create.mem.alignedAlloca(tmpRedType);
Value tmpRedMemRef2 = create.mem.alignedAlloca(tmpRedType);
if (useParallel) {
// Parallel, alloc inside parallel loop.
tmpRedMemRef = create.mem.alignedAlloc(tmpRedType);
tmpRedMemRef2 = create.mem.alignedAlloc(tmpRedType);
}
IndexExpr blockedCurrIndex = DimIE(blockedLoopIndices[0]);
IndexExpr blockedUB = SymIE(XFlatDims[0]);
IndexExpr isFull =
Expand Down
2 changes: 1 addition & 1 deletion src/Conversion/ONNXToKrnl/NN/Pooling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ struct ONNXPoolOpLowering : public OpConversionPattern<PoolOp> {
// Identity value of the operation.
auto identity = getIdentityValue<PoolOp>(rewriter, loc, outputElementType);
// Create a local reduction value for output[n][c][ho][wo].
// Single scalar, no need for default alignment.
// Single scalar, no need for default alignment. Ok to use alloca.
Value reductionVal =
create.mem.alloca(MemRefType::get({}, memRefType.getElementType()));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ static void suppressByScores(ConversionPatternRewriter &rewriter, Location loc,
Value zero = create.math.constantIndex(0);
Value one = create.math.constantIndex(1);
// Store the number of scores whose value is greater than the threshold.
// Scalar, ok to use alloca.
Value topk = create.mem.alloca(MemRefType::get({}, indexType));

// Compute the effective max output per class.
Expand Down Expand Up @@ -272,6 +273,7 @@ struct ONNXNonMaxSuppressionOpLowering

// Refine the number of output boxes per class by suppressing it using
// spatial dimension size and score threshold.
// Scalar, ok to use alloca.
Value maxOutputPerClass = create.mem.alloca(MemRefType::get({}, indexType));
// 1. Suppress by using spatial dimension size.
Value x = create.math.castToIndex(maxOutputBoxPerClass);
Expand Down Expand Up @@ -312,6 +314,7 @@ struct ONNXNonMaxSuppressionOpLowering
// dim of the output, which is suppressed by IOU during computation and
// cannot be computed in advance.
// Final output shape : [effective_num_selected_indices, 3]
// Scalar, ok to use alloca.
Value effectiveNumSelectedIndices =
create.mem.alloca(MemRefType::get({}, indexType));
create.krnl.store(zero, effectiveNumSelectedIndices);
Expand Down
2 changes: 2 additions & 0 deletions src/Conversion/ONNXToKrnl/Tensor/Compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ struct ONNXCompressOpLowering : public OpConversionPattern<ONNXCompressOp> {
// Create temp memory for summing up the true value and init to zero.
Type indexType = rewriter.getIndexType();
MemRefType indexMemRefType = MemRefType::get({}, indexType);
// Scalar, ok to use alloca.
Value sumMemRef = create.mem.alloca(indexMemRefType);
create.krnl.store(zeroIE.getValue(), sumMemRef);
// Now create a loop to iterate over all conditions.
Expand Down Expand Up @@ -142,6 +143,7 @@ struct ONNXCompressOpLowering : public OpConversionPattern<ONNXCompressOp> {
}
}

// Scalar, ok to use alloca.
Value readIndexMemRef = create.mem.alloca(indexMemRefType);
create.krnl.store(zeroIE.getValue(), readIndexMemRef);

Expand Down
1 change: 1 addition & 0 deletions src/Conversion/ONNXToKrnl/Tensor/GatherND.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ struct ONNXGatherNDOpLowering : public OpConversionPattern<ONNXGatherNDOp> {
// Initialize the index used to store the result values.
Value iZero = create.math.constantIndex(0);
Value iOne = create.math.constantIndex(1);
// Scalar, ok to use alloca.
Value storeIndex =
create.mem.alloca(MemRefType::get({}, rewriter.getIndexType()));
create.krnl.store(iZero, storeIndex);
Expand Down
2 changes: 2 additions & 0 deletions src/Conversion/ONNXToKrnl/Tensor/NonZero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ struct ONNXNonZeroOpLowering : public OpConversionPattern<ONNXNonZeroOp> {
create.krnlIE.getShapeAsDims(X, xUbs);

// Emit a variable for the total number of nonzero values.
// Scalar, ok to use alloca.
Value nonzeroCount = create.mem.alloca(MemRefType::get({}, indexTy));
create.krnl.store(iZero, nonzeroCount);

Expand Down Expand Up @@ -176,6 +177,7 @@ struct ONNXNonZeroOpLowering : public OpConversionPattern<ONNXNonZeroOp> {
// out[0][i] = p
// ```

// Scalars, ok to use alloca.
Value pos = create.mem.alloca(MemRefType::get({}, indexTy));
Value sum = create.mem.alloca(MemRefType::get({}, indexTy));
ValueRange iLoopDef = create.krnl.defineLoops(1);
Expand Down
1 change: 1 addition & 0 deletions src/Conversion/ONNXToKrnl/Tensor/Unique.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ struct ONNXUniqueOpLowering : public ConversionPattern {
//
Type indexTy = rewriter.getIndexType();
Value iZero = create.math.constantIndex(0);
// Scalar, ok to use alloca.
Value uniqueCount = create.mem.alloca(MemRefType::get({}, indexTy));
create.krnl.store(iZero, uniqueCount);
Value noneValue;
Expand Down
3 changes: 3 additions & 0 deletions src/Dialect/Mlir/DialectBuilder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,9 @@ struct MemRefBuilder final : DialectBuilder {
// currently executing function, to be automatically released when this
// function returns to its caller. It is strongly suggested to place alloca
// instructions outside of a loop.
//
// When possible, DO NOT USE ALLOCA except for a few scalars.
//
mlir::memref::AllocaOp alloca(mlir::MemRefType type) const;
mlir::memref::AllocaOp alignedAlloca(
mlir::MemRefType type, int64_t align = defaultAlign) const;
Expand Down
Loading

0 comments on commit 1435011

Please sign in to comment.