Successfully identified regression in *llvm* in CI configuration tcwg_bmk_llvm_tx1/llvm-master-aarch64-spec2k6-O2_LTO.
So far, this commit has regressed CI configurations:
 - tcwg_bmk_llvm_tx1/llvm-master-aarch64-spec2k6-O2_LTO
Culprit:
<cut>
commit 428a62f65f16f1640b1bfe033d20e6a4f545dd3e
Author: thomasraoux <thomasraoux@google.com>
Date:   Wed Jun 9 09:42:32 2021 -0700
[mlir][gpu] Add op to create MMA constant matrix
This allow creating a matrix with all elements set to a given value. This is needed to be able to implement a simple dot op.
Differential Revision: https://reviews.llvm.org/D103870
</cut>
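For context, a minimal sketch of how the new op is meant to be used: it seeds the accumulator of gpu.subgroup_mma_compute with a constant tile, which is what a plain dot op needs. The surrounding ops (gpu.subgroup_mma_load_matrix, gpu.subgroup_mma_compute, gpu.subgroup_mma_store_matrix) already exist in the GPU dialect; the memref shapes, leadDimension, and SSA names below are illustrative assumptions, not taken from the commit:

  %cst = constant 0.000000e+00 : f16
  // Load the two multiplicand tiles from workgroup memory.
  %A = gpu.subgroup_mma_load_matrix %a[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
  %B = gpu.subgroup_mma_load_matrix %b[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "BOp">
  // New op: broadcast the scalar into an all-zeros accumulator tile.
  %C0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf16, "COp">
  // D = A * B + C0; with C0 all zeros this is a plain dot product.
  %D = gpu.subgroup_mma_compute %A, %B, %C0 : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp">
  gpu.subgroup_mma_store_matrix %D, %c[%i, %i] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>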
Results regressed to (for first_bad == 428a62f65f16f1640b1bfe033d20e6a4f545dd3e)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -O2_LTO -- artifacts/build-428a62f65f16f1640b1bfe033d20e6a4f545dd3e/results_id:
1
# 400.perlbench,perlbench_base.default regressed by 103
from (for last_good == 3b46283c1539f89619f2b40ab7732f434d7c68ff)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -O2_LTO -- artifacts/build-3b46283c1539f89619f2b40ab7732f434d7c68ff/results_id:
1
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a...
Results ID of last_good: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-master-aarch64-spec2k6-O2_LTO/1827
Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a...
Results ID of first_bad: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-master-aarch64-spec2k6-O2_LTO/1831
Build top page/logs: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a...
Configuration details:
Reproduce builds:
<cut>
mkdir investigate-llvm-428a62f65f16f1640b1bfe033d20e6a4f545dd3e
cd investigate-llvm-428a62f65f16f1640b1bfe033d20e6a4f545dd3e
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests
curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a... --fail
curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a... --fail
curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a... --fail
chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites)
./jenkins-scripts/tcwg_bmk-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh)
rsync -a --del --delete-excluded --exclude bisect/ --exclude artifacts/ --exclude llvm/ ./ ./bisect/baseline/
cd llvm
# Reproduce first_bad build
git checkout --detach 428a62f65f16f1640b1bfe033d20e6a4f545dd3e
../artifacts/test.sh
# Reproduce last_good build
git checkout --detach 3b46283c1539f89619f2b40ab7732f434d7c68ff
../artifacts/test.sh
cd ..
</cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/c...
Artifacts: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a...
Build log: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-a...
Full commit (up to 1000 lines):
<cut>
commit 428a62f65f16f1640b1bfe033d20e6a4f545dd3e
Author: thomasraoux <thomasraoux@google.com>
Date:   Wed Jun 9 09:42:32 2021 -0700
[mlir][gpu] Add op to create MMA constant matrix
This allow creating a matrix with all elements set to a given value. This is needed to be able to implement a simple dot op.
Differential Revision: https://reviews.llvm.org/D103870
---
 mlir/include/mlir/Dialect/GPU/GPUOps.td            | 45 ++++++++++++++++++++++
 mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp    | 42 +++++++++++++++++++-
 .../Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir     | 25 ++++++++++++
 mlir/test/Dialect/GPU/ops.mlir                     |  4 ++
 4 files changed, 115 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index 8e2520b675ae..1e78e4af4d51 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -1022,4 +1022,49 @@ def GPU_SubgroupMmaComputeOp : GPU_Op<"subgroup_mma_compute",
   let verifier = [{ return ::verify(*this); }];
 }

+def GPU_SubgroupMmaConstantMatrixOp : GPU_Op<"subgroup_mma_constant_matrix",
+    [NoSideEffect,
+     TypesMatchWith<"value type matches element type of mma_matrix",
+                    "res", "value",
+                    "$_self.cast<gpu::MMAMatrixType>().getElementType()">]>{
+
+  let summary = "GPU warp synchronous constant matrix";
+
+  let description = [{
+    The `gpu.subgroup_mma_constant_matrix` creates a `!gpu.mma_matrix` with
+    constant elements.
+
+    The operation takes a scalar input and return a `!gpu.mma_matrix` where each
+    element of is equal to the operand constant. The destination mma_matrix type
+    must have elememt type equal to the constant type. Since the layout of
+    `!gpu.mma_matrix` is opaque this only support setting all the elements to
+    the same value.
+
+    This op is meant to be used along with `gpu.subgroup_mma_compute`.
+
+    Example:
+
+    ```mlir
+     %0 = gpu.subgroup_mma_constant_matrix %a :
+       !gpu.mma_matrix<16x16xf16, "AOp">
+     %1 = gpu.subgroup_mma_constant_matrix %b :
+       !gpu.mma_matrix<16x16xf32, "COp">
+    ```
+  }];
+
+  let arguments = (ins AnyTypeOf<[F16, F32]>:$value);
+
+  let results = (outs GPU_MMAMatrix:$res);
+
+  let extraClassDeclaration = [{
+    gpu::MMAMatrixType getType() {
+      return res().getType().cast<gpu::MMAMatrixType>();
+    }
+  }];
+
+  let assemblyFormat = [{
+    $value attr-dict `:` type($res)
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
index d72c8c217f86..d46a185dec22 100644
--- a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
@@ -348,12 +348,52 @@ struct WmmaMmaOpToNVVMLowering
   }
 };

+/// Convert GPU MMA ConstantMatrixOp to a chain of InsertValueOp.
+struct WmmaConstantOpToNVVMLowering
+    : public ConvertOpToLLVMPattern<gpu::SubgroupMmaConstantMatrixOp> {
+  using ConvertOpToLLVMPattern<
+      gpu::SubgroupMmaConstantMatrixOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupMmaConstantMatrixOp subgroupMmaConstantOp,
+                  ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (failed(areAllLLVMTypes(subgroupMmaConstantOp.getOperation(), operands,
+                               rewriter)))
+      return failure();
+    Location loc = subgroupMmaConstantOp.getLoc();
+    Value cst = operands[0];
+    LLVM::LLVMStructType type = convertMMAToLLVMType(
+        subgroupMmaConstantOp.getType().cast<gpu::MMAMatrixType>());
+    // If the element type is a vector create a vector from the operand.
+    if (auto vecType = type.getBody()[0].dyn_cast<VectorType>()) {
+      Value vecCst = rewriter.create<LLVM::UndefOp>(loc, vecType);
+      for (int64_t vecEl = 0; vecEl < vecType.getNumElements(); vecEl++) {
+        Value idx = rewriter.create<LLVM::ConstantOp>(
+            loc, typeConverter->convertType(rewriter.getIntegerType(32)),
+            rewriter.getI32ArrayAttr(vecEl));
+        vecCst = rewriter.create<LLVM::InsertElementOp>(loc, vecType, vecCst,
+                                                        cst, idx);
+      }
+      cst = vecCst;
+    }
+    Value matrixStruct = rewriter.create<LLVM::UndefOp>(loc, type);
+    for (size_t i : llvm::seq(size_t(0), type.getBody().size())) {
+      matrixStruct = rewriter.create<LLVM::InsertValueOp>(
+          loc, matrixStruct, cst, rewriter.getI32ArrayAttr(i));
+    }
+    rewriter.replaceOp(subgroupMmaConstantOp, matrixStruct);
+    return success();
+  }
+};
+
 } // anonymous namespace

 namespace mlir {
 void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                              RewritePatternSet &patterns) {
   patterns.insert<WmmaLoadOpToNVVMLowering, WmmaMmaOpToNVVMLowering,
-                  WmmaStoreOpToNVVMLowering>(converter);
+                  WmmaStoreOpToNVVMLowering, WmmaConstantOpToNVVMLowering>(
+      converter);
 }
 } // namespace mlir
diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
index de5d0d3fcf1c..f692dffdfcba 100644
--- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
@@ -151,3 +151,28 @@ gpu.module @test_module {
     return
   }
 }
+
+// -----
+
+gpu.module @test_module {
+
+// CHECK-LABEL: func @gpu_wmma_constant_op
+// CHECK: %[[CST:.+]] = llvm.mlir.constant(1.000000e+00 : f16) : f16
+// CHECK: %[[V0:.+]] = llvm.mlir.undef : vector<2xf16>
+// CHECK: %[[C0:.+]] = llvm.mlir.constant([0 : i32]) : i32
+// CHECK: %[[V1:.+]] = llvm.insertelement %[[CST]], %[[V0]][%[[C0]] : i32] : vector<2xf16>
+// CHECK: %[[C1:.+]] = llvm.mlir.constant([1 : i32]) : i32
+// CHECK: %[[V2:.+]] = llvm.insertelement %[[CST]], %[[V1]][%[[C1]] : i32] : vector<2xf16>
+// CHECK: %[[M0:.+]] = llvm.mlir.undef : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[M1:.+]] = llvm.insertvalue %[[V2]], %[[M0]][0 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[M2:.+]] = llvm.insertvalue %[[V2]], %[[M1]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[M3:.+]] = llvm.insertvalue %[[V2]], %[[M2]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[M4:.+]] = llvm.insertvalue %[[V2]], %[[M3]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: llvm.return %[[M4]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+  func @gpu_wmma_constant_op() ->(!gpu.mma_matrix<16x16xf16, "COp">) {
+    %cst = constant 1.0 : f16
+    %C = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf16, "COp">
+    return %C : !gpu.mma_matrix<16x16xf16, "COp">
+  }
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index a98fe1c49683..1bed13c4b21a 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -201,8 +201,12 @@ module attributes {gpu.container_module} {
     // CHECK: %[[wg:.*]] = memref.alloca()
     %i = constant 16 : index
     // CHECK: %[[i:.*]] = constant 16 : index
+    %cst = constant 1.000000e+00 : f32
+    // CHECK: %[[cst:.*]] = constant 1.000000e+00 : f32
     %0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
     // CHECK: gpu.subgroup_mma_load_matrix %[[wg]][%[[i]], %[[i]]] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
+    %1 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
+    // CHECK: gpu.subgroup_mma_constant_matrix %[[cst]] : !gpu.mma_matrix<16x16xf32, "COp">
     return
   }
 }
</cut>
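As a side note on the TypesMatchWith constraint in the op definition above: the scalar operand type must equal the element type of the result matrix. A small hypothetical sketch of what the verifier accepts and rejects (not from the commit):

  %f32 = constant 0.000000e+00 : f32
  // OK: f32 operand, f32 matrix elements.
  %ok = gpu.subgroup_mma_constant_matrix %f32 : !gpu.mma_matrix<16x16xf32, "COp">
  // Would be rejected by the verifier: f32 operand, f16 matrix elements.
  // %bad = gpu.subgroup_mma_constant_matrix %f32 : !gpu.mma_matrix<16x16xf16, "COp">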